## 결측치 찾기

In [1]:
import pandas as pd
import numpy as np

# Small demo frame: one missing value in each column (sex at row 2, score at row 4).
df = pd.DataFrame(
    {
        'sex': ['M', 'F', np.nan, 'M', 'F'],
        'score': [5, 4, 3, 4, np.nan],
    }
)
df

Unnamed: 0,sex,score
0,M,5.0
1,F,4.0
2,,3.0
3,M,4.0
4,F,


In [2]:
# Arithmetic on a column propagates NaN: the row with a missing score stays NaN.
df['score'].add(1)

0    6.0
1    5.0
2    4.0
3    5.0
4    NaN
Name: score, dtype: float64

In [3]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,sex,score
0,False,False
1,False,False
2,True,False
3,False,False
4,False,True


In [4]:
# Count the missing values in each column (True is summed as 1).
df.isna().sum()

sex      1
score    1
dtype: int64

## 결측치 제거하기

In [5]:
# Return a copy of df without the rows whose score is missing.
# subset is passed as a list because a bare column label is only accepted by
# newer pandas releases; the list form works on older ones as well.
df.dropna(subset=['score'])

Unnamed: 0,sex,score
0,M,5.0
1,F,4.0
2,,3.0
3,M,4.0


In [6]:
# Build a frame without the rows whose score is missing.
# subset is passed as a list because a bare column label is only accepted by
# newer pandas releases; the list form works on older ones as well.
df_nomiss = df.dropna(subset=['score'])

# Arithmetic on the cleaned column no longer produces NaN.
df_nomiss['score'] + 1

0    6.0
1    5.0
2    4.0
3    5.0
Name: score, dtype: float64

In [7]:
# Drop every row that is missing score or sex (a row goes if either is NaN).
df_nomiss = df.dropna(subset=['score', 'sex'], how='any')
df_nomiss

Unnamed: 0,sex,score
0,M,5.0
1,F,4.0
3,M,4.0


In [8]:
# Keep only the rows that have no missing value in any column.
df_nomiss2 = df.dropna(axis=0, how='any')
df_nomiss2

Unnamed: 0,sex,score
0,M,5.0
1,F,4.0
3,M,4.0


In [9]:
# Mean of score; missing values are skipped rather than propagated.
df['score'].mean(skipna=True)

4.0

In [10]:
# Total of score; missing values are skipped rather than propagated.
df['score'].sum(skipna=True)

16.0

In [11]:
# Per-sex mean and total of score via named aggregation.
# Rows with a missing sex form no group, and missing scores are skipped.
df.groupby('sex').agg(
    mean_score=pd.NamedAgg(column='score', aggfunc='mean'),
    sum_score=pd.NamedAgg(column='score', aggfunc='sum'),
)

Unnamed: 0_level_0,mean_score,sum_score
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,4.0,4.0
M,4.5,9.0


## 결측치 대체하기

In [12]:
# Load the exam scores, then blank out a few math values to create missing data.
exam = pd.read_csv('exam.csv')
# NOTE(review): math presumably loads as an integer column and is upcast to
# float by this assignment - confirm on the pandas version in use.
exam.loc[[2, 7, 14], ['math']] = np.nan  # assign NaN to math in rows 2, 7 and 14
exam

Unnamed: 0,id,nclass,math,english,science
0,1,1,50.0,98,50
1,2,1,60.0,97,60
2,3,1,,86,78
3,4,1,30.0,98,58
4,5,2,25.0,80,65
5,6,2,50.0,89,98
6,7,2,80.0,90,45
7,8,2,,78,25
8,9,3,20.0,98,15
9,10,3,50.0,98,45


In [13]:
# Mean of math computed over the non-missing scores only.
exam['math'].mean(skipna=True)

55.23529411764706

In [14]:
# Replace each missing math score with 55, a hard-coded whole-number
# stand-in for the column mean.
exam['math'] = exam['math'].fillna(value=55)
exam

Unnamed: 0,id,nclass,math,english,science
0,1,1,50.0,98,50
1,2,1,60.0,97,60
2,3,1,55.0,86,78
3,4,1,30.0,98,58
4,5,2,25.0,80,65
5,6,2,50.0,89,98
6,7,2,80.0,90,45
7,8,2,55.0,78,25
8,9,3,20.0,98,15
9,10,3,50.0,98,45


In [15]:
# Count the missing values left in math; 0 confirms the fill covered them all.
(
    exam['math']
    .isna()
    .sum()
)

0