# Pandas 데이터분석 기초 실습

> 엑셀 대신 Pandas를 사용하는 이유
- 엑셀로는 프로그램을 만들 수 없음.
- Pandas는 numpy와 함께 사용할 수 있음 (수학적 연산에 유리)

In [213]:
import pandas as pd

## Series, DataFrame
- create DataFrame



In [214]:
# Build two Series from plain Python lists.
# pd.Series is the public constructor; pd.core.* is pandas' internal namespace.
s1 = pd.Series([1, 2, 3])
s2 = pd.Series(['one', 'two', 'three'])

# Combine the Series into a DataFrame; each dict key becomes a column name.
pd.DataFrame(data={'num': s1, 'word': s2})

Unnamed: 0,num,word
0,1,one
1,2,two
2,3,three


In [215]:
# Build a DataFrame from a list of dicts (one dict per row).
# Passing `columns` fixes the column order explicitly instead of relying on
# the order of the keys inside each dict.

dict_list = [
    dict(name='John', age=25, job='student'),
    dict(name='Nate', age=30, job='teacher'),
]

df1 = pd.DataFrame(dict_list, columns=['name', 'age', 'job'])
df1

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [216]:
# OrderedDict keeps its keys in insertion order, so the columns come out
# in the order they are added below.

from collections import OrderedDict

ordered_dict = OrderedDict()
ordered_dict['name'] = ['John', 'Nate']
ordered_dict['age'] = [25, 30]
ordered_dict['job'] = ['student', 'teacher']

df2 = pd.DataFrame.from_dict(ordered_dict)
df2

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [217]:
# data, header

lst = [
    ['John', 20, 'student'],
    ['Nate', 30, 'teacher']
]
col_name = ['name', 'age', 'job']
df3 = pd.DataFrame.from_records(lst, columns=col_name)
df3

Unnamed: 0,name,age,job
0,John,20,student
1,Nate,30,teacher


## read file
> pd.read_csv
- csv, txt (쉼표(,) 구분 파일)
- tab 구분 파일
- header 없는 경우

In [218]:
# csv 파일
df_csv = pd.read_csv('Pandas_Basic_data/friend_list.csv')
df_csv

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [219]:
# txt 파일
df_txt = pd.read_csv('Pandas_Basic_data/friend_list.txt')
df_txt

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [220]:
# tab 구분
df_tab = pd.read_csv('Pandas_Basic_data/friend_list_tab.txt', delimiter='\t')
df_tab

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [221]:
# header 없는 경우
df_noheader = pd.read_csv('Pandas_Basic_data/friend_list_no_head.csv', header=None)
df_noheader

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [222]:
df_noheader.columns = ['name', 'age', 'job']
df_noheader

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [223]:
df_header = pd.read_csv('Pandas_Basic_data/friend_list_no_head.csv', header = None, 
                        names = ['name', 'age', 'job'])
df_header

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## write file
- 경로 없이 파일 이름만 지정하면 현재 작업 폴더(위치)에 저장됨.
- None(결측) 값은 기본적으로 빈칸으로 저장되며, na_rep으로 대신 들어갈 문자열을 지정할 수 있음.

In [224]:
df = df_header.copy()

In [225]:
# default : index = True, header = True
# index : row id (행 순서)
# header : column name (열 이름)
# na_rep : na (null 값) 대신 입력

df.to_csv('friends.csv', na_rep='-')

## Select, Filter rows or columns

### by row
> [ : ]

> loc[[ , ]]

In [226]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [227]:
# python method
# 첫 번째 index 포함, 마지막 index 포함 x
df[1:3]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [228]:
# 불연속적인 row 선택
df.loc[[0,2]]

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher


### by column condition

In [229]:
df[df.age > 30]

Unnamed: 0,name,age,job
3,Julia,40,dentist
4,Brian,45,manager


In [230]:
df.query('age > 30')

Unnamed: 0,name,age,job
3,Julia,40,dentist
4,Brian,45,manager


In [231]:
df[(df.age > 20) & (df.name == 'Nate')]

Unnamed: 0,name,age,job
2,Nate,30,teacher


### Filter Column
> iloc

> [[ ]]

> .filter
- axis = 0 : 행
- axis = 1 : 열

In [232]:
# 행, 열 filter (선택)
df.iloc[:,0:2]

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [233]:
df.iloc[0:2, 0:2]

Unnamed: 0,name,age
0,John,20
1,Jenny,30


In [234]:
# by column name
df[['name', 'age']]

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [235]:
df.filter(items=['age', 'job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [236]:
# column 중 a가 포함된 column만
# axis = 0 : 행
# axis = 1 : 열
df.filter(like = 'a', axis = 1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [237]:
# regex : 정규식
# e로 끝나는 column

df.filter(regex = 'e$', axis = 1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


## Drop, delete row or column

### Drop
> .drop
- inplace = True : 원래 dataframe 변경.
- axis = 1 : 열 삭제

In [238]:
df = df_header.copy()

In [239]:
# 원래 dataframe 영향 X
df.drop([1,3])

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher
4,Brian,45,manager
5,Chris,25,intern


In [240]:
df.drop([1,3], inplace = True)
df

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher
4,Brian,45,manager
5,Chris,25,intern


In [241]:
# index로 삭제
# 행 이름 0, 2를 삭제하는 것이 아니라 0번째, 2번째 row (index) 삭제
df.drop(df.index[[0,2]])

Unnamed: 0,name,age,job
2,Nate,30,teacher
5,Chris,25,intern


In [242]:
# by value
df[df.age >= 25]

Unnamed: 0,name,age,job
2,Nate,30,teacher
4,Brian,45,manager
5,Chris,25,intern


In [243]:
# drop column
df.drop('age', axis = 1)

Unnamed: 0,name,job
0,John,student
2,Nate,teacher
4,Brian,manager
5,Chris,intern


## create, update row or column
> create, update
- [] : 없는 column name을 입력하고 값을 대입하면 새 column이 생성됨.
- .append : 다른 dataframe 결합 가능 (pandas 2.0부터 제거되었으므로 최신 버전에서는 pd.concat 사용).
- .apply : 함수를 인자로 입력해 return 값을 통해 값 변경 -> 뒤에서 더 정리

In [244]:
df = df_header.copy()
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [245]:
# create other column
df['salary'] = 0
df

Unnamed: 0,name,age,job,salary
0,John,20,student,0
1,Jenny,30,developer,0
2,Nate,30,teacher,0
3,Julia,40,dentist,0
4,Brian,45,manager,0
5,Chris,25,intern,0


In [246]:
# 학생은 salary 없다는 걸 표현
# numpy 사용 (np.where)

import numpy as np

df['salary'] = np.where(df['job'] != 'student', 'yes', 'no')
df

Unnamed: 0,name,age,job,salary
0,John,20,student,no
1,Jenny,30,developer,yes
2,Nate,30,teacher,yes
3,Julia,40,dentist,yes
4,Brian,45,manager,yes
5,Chris,25,intern,yes


In [247]:
# 다른 예제
lst = [{'name':'John', 'midterm':95, 'final':85},
       {'name':'Jenny', 'midterm':85, 'final':80},
       {'name':'Nate', 'midterm':30, 'final':10}]
df = pd.DataFrame(lst, columns = ['name', 'midterm', 'final'])
df

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,30,10


In [248]:
# update row
df2 = pd.DataFrame([
    ['Ben', 50, 50]
], columns= ['name', 'midterm', 'final'])
df2

Unnamed: 0,name,midterm,final
0,Ben,50,50


In [249]:
# Add the rows of df2 below df; ignore_index=True renumbers the rows from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,name,midterm,final
0,John,95,85
1,Jenny,85,80
2,Nate,30,10
3,Ben,50,50


In [250]:
df['total'] = df['midterm'] + df['final']
df

Unnamed: 0,name,midterm,final,total
0,John,95,85,180
1,Jenny,85,80,165
2,Nate,30,10,40


In [251]:
df['average'] = df['total'] / 2
df

Unnamed: 0,name,midterm,final,total,average
0,John,95,85,180,90.0
1,Jenny,85,80,165,82.5
2,Nate,30,10,40,20.0


In [252]:
# Map each average score to a letter grade:
# 'A' for 90 and above, 'B' for 80 and above, 'F' otherwise.
grades = [
    'A' if avg >= 90 else 'B' if avg >= 80 else 'F'
    for avg in df['average']
]

df['grade'] = grades
df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,A
1,Jenny,85,80,165,82.5,B
2,Nate,30,10,40,20.0,F


In [253]:
def pass_or_fail(row):
    """Return "Fail" when `row` equals 'F', and "Pass" for any other value.

    `row` is compared directly against the string 'F', so it is expected to
    be a single grade value rather than a whole DataFrame row.
    """
    return "Pass" if row != 'F' else "Fail"

In [254]:
# .apply : 함수를 인자로 입력해 return 값을 통해 값 변경
df.grade = df.grade.apply(pass_or_fail)
df

Unnamed: 0,name,midterm,final,total,average,grade
0,John,95,85,180,90.0,Pass
1,Jenny,85,80,165,82.5,Pass
2,Nate,30,10,40,20.0,Fail


## concatenate (결합)

- concat
- append

In [255]:
l1 = [{'name': 'John', 'job': "teacher"},
      {'name': 'Nate', 'job': "student"},
      {'name': 'Fred', 'job': "developer"}]

l2 = [{'name': 'Ed', 'job': "dentist"},
      {'name': 'Jack', 'job': "farmer"},
      {'name': 'Ted', 'job': "designer"}]
         
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])

In [256]:
df1

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer


In [257]:
df2

Unnamed: 0,name,job
0,Ed,dentist
1,Jack,farmer
2,Ted,designer


### concat
- axis = 1 : 열로 합침.

In [258]:
# concat
result1 = pd.concat([df1, df2], ignore_index=True)
result1

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jack,farmer
5,Ted,designer


### append

In [259]:
# Stack the rows of df2 under df1; ignore_index=True renumbers the rows from 0.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used to get
# the same result on both old and new pandas versions.
result2 = pd.concat([df1, df2], ignore_index=True)
result2

Unnamed: 0,name,job
0,John,teacher
1,Nate,student
2,Fred,developer
3,Ed,dentist
4,Jack,farmer
5,Ted,designer


## feature engineering - feature extraction
- .apply
- .map
- .applymap

### apply

In [260]:
date_list = [
    {'yyyy-mm-dd' : '2000-06-27' },
    {'yyyy-mm-dd' : '2007-10-27'}]
df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2007-10-27


In [261]:
def extract_year(row):
    """Return the part of `row` before its first '-' (the year of 'yyyy-mm-dd').

    If `row` contains no '-', the whole string is returned.
    """
    year, _, _ = row.partition('-')
    return year

In [262]:
df['year'] = df['yyyy-mm-dd'].apply(extract_year)
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2007-10-27,2007


In [263]:
# parameter 
def get_age(year, current_year):
    """Return `current_year` minus the birth year `year`.

    `year` is converted with int() first, so a numeric string such as
    '2000' is accepted.
    """
    birth_year = int(year)
    return current_year - birth_year

In [264]:
df['age'] = df['year'].apply(get_age, current_year = 2018)
df

Unnamed: 0,yyyy-mm-dd,year,age
0,2000-06-27,2000,18
1,2007-10-27,2007,11


In [265]:
def get_introduce(age, prefix, suffix):
    """Return `prefix`, the string form of `age`, and `suffix` joined together."""
    pieces = (prefix, str(age), suffix)
    return "".join(pieces)

In [266]:
df['introduce'] = df['age'].apply(get_introduce, prefix = "I am ", suffix = " years old")
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,18,I am 18 years old
1,2007-10-27,2007,11,I am 11 years old


In [267]:
# Uses several columns at once: the function receives one whole row
def get_introduce_2(row):
    """Build an introduction sentence from the `year` and `age` fields of `row`."""
    return f"I was born in {row.year!s} my age is {row.age!s}"

In [268]:
df.introduce = df.apply(get_introduce_2, axis = 1)
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2000-06-27,2000,18,I was born in 2000 my age is 18
1,2007-10-27,2007,11,I was born in 2007 my age is 11


### map

In [269]:
date_list = [
    {'yyyy-mm-dd' : '2000-06-27' },
    {'yyyy-mm-dd' : '2007-10-27'}]
df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2000-06-27
1,2007-10-27


In [270]:
def extract_year(row):
    """Return the first '-'-separated field of `row` (the year of 'yyyy-mm-dd')."""
    first_field, *_ = row.split('-')
    return first_field

In [271]:
df['year'] = df['yyyy-mm-dd'].map(extract_year)
df

Unnamed: 0,yyyy-mm-dd,year
0,2000-06-27,2000
1,2007-10-27,2007


In [272]:
job_list = [{'age': 20, 'job': 'student'},
         {'age': 30, 'job': 'developer'},
         {'age': 30, 'job': 'teacher'}]
df = pd.DataFrame(job_list)
df

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher


In [273]:
# text를 숫자로 바꿀때 유용 (원하는 값으로)
df.job = df.job.map({'student':1, 'developer':2, 'teacher':3})
df

Unnamed: 0,age,job
0,20,1
1,30,2
2,30,3


### Applymap
- DataFrame의 모든 column, 모든 원소에 함수를 적용하고 싶을 때

In [274]:
x_y = [{'x': 5.5, 'y': -5.6},
         {'x': -5.2, 'y': 5.5},
         {'x': -1.6, 'y': -4.5}]
df = pd.DataFrame(x_y)
df

Unnamed: 0,x,y
0,5.5,-5.6
1,-5.2,5.5
2,-1.6,-4.5


In [275]:
import numpy as np

In [276]:
# Round every element of the DataFrame with np.around.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is
# deprecated there), so pick whichever method the installed pandas provides.
_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _elementwise(np.around)
df

Unnamed: 0,x,y
0,6.0,-6.0
1,-5.0,6.0
2,-2.0,-4.0


## group by

In [277]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"}
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [278]:
# 각 학과마다 몇 명이 존재하는지
groupby_major = df.groupby('major')
groupby_major.groups

{'Computer Science': Int64Index([0, 1, 6, 7], dtype='int64'),
 'Economics': Int64Index([4, 5, 9], dtype='int64'),
 'Physics': Int64Index([2], dtype='int64'),
 'Psychology': Int64Index([3, 8, 10], dtype='int64')}

In [279]:
# Print each major with its member count, followed by the member rows
for major, members in groupby_major:
    heading = " : ".join([major, str(len(members))])
    print(heading)
    print(members)
    print()

Computer Science : 4
       name             major     sex
0      John  Computer Science    male
1      Nate  Computer Science    male
6  Jeniffer  Computer Science  female
7    Edward  Computer Science    male

Economics : 3
    name      major     sex
4  Janny  Economics  female
5   Yuna  Economics  female
9  Wendy  Economics  female

Physics : 1
      name    major   sex
2  Abraham  Physics  male

Psychology : 3
     name       major     sex
3   Brian  Psychology    male
8    Zara  Psychology  female
10   Sera  Psychology  female



In [280]:
# One-column frame holding the size of each group; the group keys stay in the index
group_sizes = groupby_major.size()
df_major_cnt = pd.DataFrame(dict(count=group_sizes))
df_major_cnt

Unnamed: 0_level_0,count
major,Unnamed: 1_level_1
Computer Science,4
Economics,3
Physics,1
Psychology,3


In [281]:
# Same count frame, then reset_index() moves the group key (major)
# out of the index into a regular column
group_sizes = groupby_major.size()
df_major_cnt = pd.DataFrame(dict(count=group_sizes))
df_major_cnt = df_major_cnt.reset_index()
df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,Economics,3
2,Physics,1
3,Psychology,3


In [282]:
# 성별로 group, 몇 명 있는지
groupby_sex = df.groupby('sex')

In [283]:
# Print each sex with its member count, followed by the member rows
for sex, members in groupby_sex:
    heading = " : ".join([sex, str(len(members))])
    print(heading)
    print(members)
    print()

female : 6
        name             major     sex
4      Janny         Economics  female
5       Yuna         Economics  female
6   Jeniffer  Computer Science  female
8       Zara        Psychology  female
9      Wendy         Economics  female
10      Sera        Psychology  female

male : 5
      name             major   sex
0     John  Computer Science  male
1     Nate  Computer Science  male
2  Abraham           Physics  male
3    Brian        Psychology  male
7   Edward  Computer Science  male



## drop duplicates (중복 데이터)
- .duplicated()

In [284]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Sera', 'major': "Psychology", 'sex': "female"},
                {'name': 'John', 'major': "Computer Science", 'sex': "male"},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [285]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
dtype: bool

In [286]:
# 중복된 데이터 제거
df.drop_duplicates()

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [287]:
student_list = [{'name': 'John', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Nate', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Abraham', 'major': "Physics", 'sex': "male"},
                {'name': 'Brian', 'major': "Psychology", 'sex': "male"},
                {'name': 'Janny', 'major': "Economics", 'sex': "female"},
                {'name': 'Yuna', 'major': "Economics", 'sex': "female"},
                {'name': 'Jeniffer', 'major': "Computer Science", 'sex': "female"},
                {'name': 'Edward', 'major': "Computer Science", 'sex': "male"},
                {'name': 'Zara', 'major': "Psychology", 'sex': "female"},
                {'name': 'Wendy', 'major': "Economics", 'sex': "female"},
                {'name': 'Nate', 'major': None, 'sex': "male"},
                {'name': 'John', 'major': "Computer Science", 'sex': None},
         ]
df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])
df

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [288]:
# 정확히 일치하는 행이 없어서 다 false
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
dtype: bool

In [289]:
# 이름이 같으면 중복으로 처리
df.duplicated(['name'])

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11     True
dtype: bool

In [290]:
df.drop_duplicates(['name'], keep = 'first')

Unnamed: 0,name,major,sex
0,John,Computer Science,male
1,Nate,Computer Science,male
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female


In [291]:
df.drop_duplicates(['name'], keep = 'last')

Unnamed: 0,name,major,sex
2,Abraham,Physics,male
3,Brian,Psychology,male
4,Janny,Economics,female
5,Yuna,Economics,female
6,Jeniffer,Computer Science,female
7,Edward,Computer Science,male
8,Zara,Psychology,female
9,Wendy,Economics,female
10,Nate,,male
11,John,Computer Science,


## unique

In [292]:
job_list = [{'name': 'John', 'job': "teacher"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Fred', 'job': "teacher"},
                {'name': 'Abraham', 'job': "student"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Janny', 'job': "developer"},
                {'name': 'Nate', 'job': "teacher"},
                {'name': 'Obrian', 'job': "dentist"},
                {'name': 'Yuna', 'job': "teacher"},
                {'name': 'Rob', 'job': "lawyer"},
                {'name': 'Brian', 'job': "student"},
                {'name': 'Matt', 'job': "student"},
                {'name': 'Wendy', 'job': "banker"},
                {'name': 'Edward', 'job': "teacher"},
                {'name': 'Ian', 'job': "teacher"},
                {'name': 'Chris', 'job': "banker"},
                {'name': 'Philip', 'job': "lawyer"},
                {'name': 'Janny', 'job': "basketball player"},
                {'name': 'Gwen', 'job': "teacher"},
                {'name': 'Jessy', 'job': "student"}
         ]
df = pd.DataFrame(job_list, columns = ['name', 'job'])
df

Unnamed: 0,name,job
0,John,teacher
1,Nate,teacher
2,Fred,teacher
3,Abraham,student
4,Brian,student
5,Janny,developer
6,Nate,teacher
7,Obrian,dentist
8,Yuna,teacher
9,Rob,lawyer


In [293]:
df.job.unique()

array(['teacher', 'student', 'developer', 'dentist', 'lawyer', 'banker',
       'basketball player'], dtype=object)

In [294]:
df.job.value_counts()

teacher              8
student              5
banker               2
lawyer               2
dentist              1
basketball player    1
developer            1
Name: job, dtype: int64

## NaN (None)
- .isna()
- .isnull()


In [295]:
school_id_list = [{'name': 'John', 'job': "teacher", 'age': 40},
                {'name': 'Nate', 'job': "teacher", 'age': 35},
                {'name': 'Yuna', 'job': "teacher", 'age': 37},
                {'name': 'Abraham', 'job': "student", 'age': 10},
                {'name': 'Brian', 'job': "student", 'age': 12},
                {'name': 'Janny', 'job': "student", 'age': 11},
                {'name': 'Nate', 'job': "teacher", 'age': None},
                {'name': 'John', 'job': "student", 'age': None}
         ]
df = pd.DataFrame(school_id_list, columns = ['name', 'job', 'age'])
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,
7,John,student,


In [296]:
df.shape

(8, 3)

In [297]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    8 non-null      object 
 1   job     8 non-null      object 
 2   age     6 non-null      float64
dtypes: float64(1), object(2)
memory usage: 320.0+ bytes


In [298]:
df.isna()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [299]:
df.isnull()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [300]:
# Fill each missing age with the median age of the rows that share the same job.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['age']: the in-place call operates on an
# intermediate object and may leave df unchanged (pandas warns about this
# chained assignment under Copy-on-Write).
job_median_age = df.groupby('job')['age'].transform('median')
df['age'] = df['age'].fillna(job_median_age)
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,37.0
7,John,student,11.0
