In [93]:
import numpy as np
import pandas as pd

## 1. 데이터 준비

In [94]:
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'f', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70]
}

df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])

display(df)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## 2. 결측치 확인

### 2-1. 결측치 비율 확인

In [95]:
# len(df)로 나누어 전체 개수에서의 비율을 확인
df.isnull().sum() / len(df)

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
dtype: float64

## 3. 결측치가 있는 행 삭제

### 3-1. dropna()

In [96]:
# inplace 옵션이 없으므로 원본 df는 수정되지 않음
df.dropna()
display(df)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [97]:
# 결측치가 제거된 df를 반환
df_no_missing = df.dropna()
display(df_no_missing)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


### 2-3. how 매개변수

all : 모든 값이 NaN일 경우 해당 행 삭제  
any : 하나라도 NaN가 있을 경우 해당 행 삭제

In [98]:
df_cleaned = df.dropna(how='all')
display(df_cleaned)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


### 2-4 열 삭제

axis=1 매개변수로 행이 아닌 열을 삭제

In [99]:
df['location'] = np.nan
display(df)

deleted_df = df.dropna(axis=1, how='all')
display(deleted_df)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


### 2-5. thresh

thresh=1 : 데이터가 1개 이상 있는 행은 남김  
thresh=5 : 데이터가 5개 이상 있는 행은 남김

In [100]:
df.dropna(axis=0, thresh=1)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [101]:
df.dropna(thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


## 4. 결측치 대체

### 4-1. fillna()

In [102]:
df.fillna(0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


### 4-2. 평균값으로 대체

In [103]:
df["preTestScore"].fillna(df["preTestScore"].mean(), 
						  inplace=True)
display(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["preTestScore"].fillna(df["preTestScore"].mean(),


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


### 4-3. groupby()

groupby() : 지정한 컬럼에서 종류별로 그룹화  
groupby()[""] : 그룹에서 컬럼을 지정함  
groupby()[""].transform() : 지정된 컬럼을 그룹별로 연산

In [104]:
df.groupby("sex")["postTestScore"].transform("mean")
# 1. 성별별로 Score를 더하여 평균을 냄
# 2. 성별에 따라 평균값을 대입한 Series를 반환

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

### 4-4. groupby() 활용

fillna() 함수에 Series를 매개변수로 지정하면  
NaN에 대입한 Series의 해당 index의 값을 대입함.

In [None]:
df["postTestScore"].fillna(
	df.groupby("sex")["postTestScore"].transform("mean"), 
	inplace=True)

display(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["postTestScore"].fillna(


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,
