# 데이터 전처리 이해와 실무
## `데이터 정제: 결측 데이터 처리`

### 결측치 다루기(강의 교안)
1. 결측치 제거하기 
- Likewise, Pairwise
2. 결측치 대체하기
- 일정 값 대체, 선형 값 대체

### 실습 내용 요약
- 예제 데이터 내 결측치를 생성하고 결측치 제거 및 결측치 대체 방안에 대한 전반적 실습

## 0. 실습 데이터
- 데이터: Breast Cancer Wisconsin (Diagnostic)

> 데이터 전처리 과정 내 주의사항
- 데이터 전처리 과정 진행 시에는, 원본 데이터 Copy 필수
- Python은 Copy 함수를 사용하지 않으면, 원본 데이터 값을 변경시킴
- 만일 전처리 내역이 변경되는 경우, 데이터 로딩부터 모든 과정을 다시 시작해야 함
- 따라서 원본 데이터와 전처리 과정을 진행할 데이터를 구분하여 작업 수행

## 1. 라이브러리 세팅 및 데이터 로딩

In [4]:
import numpy as np
import pandas as pd

> 이를테면 (파일명.data) 파일인 경우 다음과 같은 방법을 사용
- cancer = pd.read_csv("./data/Breast Cancer Wisconsin (Diagnostic).data", header=None)
- cancer # 컬럼이 없는 데이터 파일임

> 데이터 컬럼명 지정
- cancer.colums = "id", "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst"]

In [3]:
# 데이터 로딩 및 개요 확인
cancer = pd.read_csv("./data/Breast Cancer Wisconsin (Diagnostic).csv", header=None)
cancer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,
1,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
2,842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
3,84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
4,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,926424,M,21.56,22.39,142,1479,0.111,0.1159,0.2439,0.1389,...,26.4,166.1,2027,0.141,0.2113,0.4107,0.2216,0.206,0.07115,
566,926682,M,20.13,28.25,131.2,1261,0.0978,0.1034,0.144,0.09791,...,38.25,155,1731,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,
567,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,34.12,126.7,1124,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,
568,927241,M,20.6,29.33,140.1,1265,0.1178,0.277,0.3514,0.152,...,39.42,184.6,1821,0.165,0.8681,0.9387,0.265,0.4087,0.124,


In [6]:
# 모든 컬럼 이름을 불러오고 각 이름을 "로 감싼 후 ,로 연결
columns_string = '", "'.join(cancer.columns.astype(str))
columns_string = f'"{columns_string}"'

print(columns_string)

"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32"


In [8]:
# 1행을 컬럼명으로 설정
cancer = pd.read_csv("./data/Breast Cancer Wisconsin (Diagnostic).csv", header=0)

# 컬럼명을 출력 (각 이름을 "로 감싸고 ,로 연결)
columns_string = '", "'.join(cancer.columns)
columns_string = f'"{columns_string}"'

print(columns_string)

"id", "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst", "Unnamed: 32"


In [9]:
# ID를 Index화
cancer = cancer.set_index('id')
cancer

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


## 2. 데이터 전처리 준비: 데이터 복사

In [11]:
cancer_data = cancer.copy()

# 데이터 내 결측치 생성
# 실습을 위한 일부 데이터 선택
cancer_data = cancer_data[0:30]
cancer_data = cancer_data[['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']]

# 결측치 생성
# 6개 record 내 결측치 생성
cancer_data.iloc[2,:] = np.nan # 3행 내 모든 데이터 결측치 생성

cancer_data.iloc[5,0] = np.nan # 6행 내 1열 데이터 결측치 생성
cancer_data.iloc[10,[3,4]] = np.nan # 11행 내 4,5열 데이터 결측치 생성
cancer_data.iloc[12,:2:4] = np.nan # 13행 내 3,4열 데이터 결측치 생성
cancer_data.iloc[15,[0,3]] = np.nan # 16행 내 1열, 4열 데이터 결측치 생성
cancer_data.iloc[24,4] = np.nan # 25행 내 5열 데이터 결측치 생성

cancer_data

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


## 3. 결측치 제거하기
> 결측치 제거 방안
- 1. listwise deletion: 데이터 내 1개 변수 값에서 N/A(결측)이 존재하는 경우, 해당 행 제거
  2. pairwise deletion: 모든 변수가 N/A(결측)이 존재하는 경우, 해당 행 제거
> 결측치 제거 시, 온전한 데이터를 사용한다는 관점은 적용 가능하니 데이터 손실이 발생함

### 3-1. `Listwise`

In [12]:
# 데이터 개요
cancer_data.info()
# 총 6개 record에서 결측치 존재

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       26 non-null     object 
 1   radius_mean     29 non-null     float64
 2   texture_mean    29 non-null     float64
 3   perimeter_mean  27 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [13]:
# listwise deletion 수행
# 30개 record 중, 6개 record에서 결측치 존재함
cancer_copy = cancer_data.copy()
cancer_copy = cancer_copy.dropna() # .dropna : listwise 수행하는 함

# 데이터 요약 : 총 30개 record 중, 하나의 결측치라도 보유한 6개 record 삭제
print(cancer_copy.info())

# 데이터 차원 확인
print("데이터 차원:", np.shape(cancer_copy))

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       24 non-null     object 
 1   radius_mean     24 non-null     float64
 2   texture_mean    24 non-null     float64
 3   perimeter_mean  24 non-null     float64
 4   area_mean       24 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.1+ KB
None
데이터 차원: (24, 5)


In [14]:
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9
84610002,M,15.78,17.89,103.6,781.0
846381,M,15.85,23.95,103.7,782.7


### 3-2. `Pairwise`

In [15]:
# pairwise deletion 수행
# 30개 record 중, 1개 record에서 모든 변수 내 결측치 존재
# 모든 결측치 존재 record만 삭제
cancer_copy = cancer_data.copy()
cancer_copy = cancer_copy.dropna(how='all')

# 데이터 요약: 총 30개 record 중, 1개 record 삭제
print(cancer_copy.info())

# 데이터 차원 확인
print("데이터 차원:", np.shape(cancer_copy))

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       26 non-null     object 
 1   radius_mean     29 non-null     float64
 2   texture_mean    29 non-null     float64
 3   perimeter_mean  27 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB
None
데이터 차원: (29, 5)


In [16]:
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9
845636,M,16.02,23.24,,


## 4. 결측치 대체하기
> 결측치 대체 방안
- 1. 일정 값 대체: 결측치를 사전 지정 값으로 대체
  2. 선형 값 대체: 선형함수 기반, 앞뒤 record 값을 활용하여 값 대체
> 결측치 대체 시, 가능한 많은 데이터를 사용할 수 있다는 관점에서 유용하나, 실 데이터와의 차이가 존재할 수 있음 

In [17]:
# 결측치 데이터 확인
cancer_copy = cancer_data.copy()
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


### 4-1. `일정 값 대체`

In [18]:
# 일정 값 대체 수행
# diagnosis 컬럼 내 결측치는 C라는 범주형 값 일괄 대체
cancer_copy['diagnosis'] = cancer_copy['diagnosis'].fillna('C') # .fillna('대체값')
cancer_copy.head(10)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [20]:
# 수치형 컬럼인 radius_mean 컬럼 내 결측치는 65라는 수치의 일정 값으로 대체
cancer_copy['radius_mean'] = cancer_copy['radius_mean'].fillna(65)
cancer_copy.head(10)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,65.0,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [21]:
# 데이터 개요 확인
cancer_copy.info()
# 대체를 수행한 diagnosis, radius_mean 컬럼과 그 이외 컬럼별 결측치 여부 확인

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       30 non-null     object 
 1   radius_mean     30 non-null     float64
 2   texture_mean    29 non-null     float64
 3   perimeter_mean  27 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [22]:
# 일정 값을 지정 값이 아닌, 컬럼의 평균으로 대체 (평균, 중앙, 최소, 최대값 등으로 대체 가능)
# .replace : texture_mean 컬럼 내 결측치를 --> texture_mean 평균 값으로 대체
cancer_copy['texture_mean'] = cancer_copy['texture_mean'].replace(np.nan, cancer_copy['texture_mean'].mean())

## 동일결과 (fillna 함수 활용)
## cancer_copy['texture_mean'] = cancer_copy['texture_mean'].fillna[cancer_copy['texture_mean'].mean())

# 대체된 값과 texture_mean 컬럼의 평균값 비교
# 3번째 record, id 84300903 확인
print(cancer_copy['texture_mean'].mean())
cancer_copy.head(10)

19.583448275862064


Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,65.0,19.583448,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


### 4-2. `선형 값 대체`
- 데이터 앞 뒤 record 값을 기반으로 결측치 대체 (선형보간법)
- 연속형 변수인 경우에만 가능하다. 

In [23]:
# 데이터의 선형관계를 기반 대체
cancer_copy = cancer_data.copy()
cancer_copy.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [24]:
# 선형보간법
cancer_copy = cancer_copy.interpolate()
cancer_copy.head()

  cancer_copy = cancer_copy.interpolate()


Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,15.995,19.075,105.24,856.05
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [25]:
# 실제 선형값으로 대체되었는지 확인
# radius_mean 컬럼의 id 843009303 경우 ---> 평균값 대체 확인
print((cancer_data.iloc[1, 1] + cancer_data.iloc[3,1])/2)

15.995000000000001


- 선형 값 대체의 경우, 데이터의 연속성을 기반으로 연산되므로 신중히 사용