# 패키지 불러오기

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Encoding

Data Encoding : 머신러닝을 하기 위해서 중요하다고 생각하는 column을 컴퓨터에게 학습 시켜야한다.<br>
컴퓨터는 문자를 알아듣지 못하고 숫자만 알아들을 수 있다.<br>

그래서 문자를 숫자로 변환하는 과정이 필요한데, 이게 바로 Data Encoding이고 대표적으로 One-Hot Encoding과 Label Encoding이 있다.

In [18]:
# 데이터 불러오기
data = pd.read_csv('data/example.csv')

## 주제
#### 번호, 나이, 키, 몸무게, 최종학력, 연봉으로 다음기수를 예측해보자

In [19]:
# 데이터 확인
data.head()

Unnamed: 0,번호,나이,키,몸무게,최종학력,연봉,다음기수
0,1,21,170,70,고등학교,3000,O
1,2,24,175,75,대학교,3200,X
2,3,23,180,80,고등학교,3400,X
3,4,22,185,85,대학교,2800,O
4,5,25,160,60,대학원,4000,O


In [20]:
# column 정보확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
번호      8 non-null int64
나이      8 non-null int64
키       8 non-null int64
몸무게     8 non-null int64
최종학력    8 non-null object
연봉      8 non-null int64
다음기수    8 non-null object
dtypes: int64(5), object(2)
memory usage: 576.0+ bytes


- 가정 : 최종학력이 다음기수에 영향을 미칠 것 같아, 컴퓨터가 알아들을 수 있게 숫자로 바꿔서 학습시켜보자

# One Hot Encoding

<img src="img/OneHot Encoding.PNG" style="width:600px;"/>

#### 첫 번째 방법 : get_dummies()함수 사용

In [21]:
# get_dummies()
df = pd.get_dummies(data['최종학력'])
df

Unnamed: 0,고등학교,대학교,대학원
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,0,0,1
5,0,1,0
6,0,1,0
7,1,0,0


In [22]:
# 기존 data에 변환한 column 합치기
table = pd.concat([data,df], axis=1)

In [23]:
# 기존 문자 column삭제
table.drop('최종학력', axis=1, inplace=True)

In [24]:
# 완성된 데이터 확인
table

Unnamed: 0,번호,나이,키,몸무게,연봉,다음기수,고등학교,대학교,대학원
0,1,21,170,70,3000,O,1,0,0
1,2,24,175,75,3200,X,0,1,0
2,3,23,180,80,3400,X,1,0,0
3,4,22,185,85,2800,O,0,1,0
4,5,25,160,60,4000,O,0,0,1
5,6,23,155,55,3600,X,0,1,0
6,7,22,158,58,3800,O,0,1,0
7,8,26,166,66,3700,O,1,0,0


#### 두 번째 방법 : get_dummies함수 안에서 파라미터 조정하여 한 번에 처리

함수 구조 : pd.get_dummies(data, columns, drop_first)
- data : 데이터 프레임
- columns : one-hot encoding처리하고 싶은 column
- drop_first : True or False -> True이면 새로 생기는 column 중 첫 번째 컬럼을 자동 삭제(범주가 n개일 때 n-1개의 컬럼만으로도 모두 설명 가능하므로 삭제하는 것)

In [25]:
data2 = pd.get_dummies(data, columns=['최종학력'], drop_first=False)
data2

Unnamed: 0,번호,나이,키,몸무게,연봉,다음기수,최종학력_고등학교,최종학력_대학교,최종학력_대학원
0,1,21,170,70,3000,O,1,0,0
1,2,24,175,75,3200,X,0,1,0
2,3,23,180,80,3400,X,1,0,0
3,4,22,185,85,2800,O,0,1,0
4,5,25,160,60,4000,O,0,0,1
5,6,23,155,55,3600,X,0,1,0
6,7,22,158,58,3800,O,0,1,0
7,8,26,166,66,3700,O,1,0,0


#### 세 번째 방법 : sklearn을 활용한 배열적용

In [26]:
# Bring in the encoder class
from sklearn.preprocessing import OneHotEncoder

# Turn the education column into a NumPy array and reshape it into a single
# column (n_rows x 1), the shape handed to the encoder below
temp = np.array(data['최종학력']).reshape(-1, 1)

# Create the encoder and let it learn the categories present in the column
onehot = OneHotEncoder().fit(temp)

# Encode the column and convert the result to a dense array for display
onehot.transform(temp).toarray()


array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

# Label Encoding

<img src="img/Label Encoding2.PNG" style="width:600px;"/>

In [27]:
# Bring in the encoder class
from sklearn.preprocessing import LabelEncoder

# One encoder instance, reused for every column passed through apply()
label = LabelEncoder()

# Encode the education column; selecting with [[...]] keeps a DataFrame, so
# apply() hands the encoder one column at a time and returns a DataFrame
temp2 = data[['최종학력']].apply(lambda column: label.fit_transform(column))

In [47]:
data['최종학력']

0    고등학교
1     대학교
2    고등학교
3     대학교
4     대학원
5     대학교
6     대학교
7    고등학교
Name: 최종학력, dtype: object

In [48]:
data[['최종학력']]

Unnamed: 0,최종학력
0,고등학교
1,대학교
2,고등학교
3,대학교
4,대학원
5,대학교
6,대학교
7,고등학교


In [28]:
temp2

Unnamed: 0,최종학력
0,0
1,1
2,0
3,1
4,2
5,1
6,1
7,0


In [29]:
# 최종 데이터 프레임 만들기
data3 = pd.concat([data, temp2], axis=1)

In [30]:
data3

Unnamed: 0,번호,나이,키,몸무게,최종학력,연봉,다음기수,최종학력.1
0,1,21,170,70,고등학교,3000,O,0
1,2,24,175,75,대학교,3200,X,1
2,3,23,180,80,고등학교,3400,X,0
3,4,22,185,85,대학교,2800,O,1
4,5,25,160,60,대학원,4000,O,2
5,6,23,155,55,대학교,3600,X,1
6,7,22,158,58,대학교,3800,O,1
7,8,26,166,66,고등학교,3700,O,0


In [31]:
# Remove the original text column but keep its label-encoded replacement.
# data3 was built by concatenating data and temp2, so it holds TWO columns
# labelled '최종학력' (the original strings and the encoded integers).
# data3.drop('최종학력', axis=1) removes every column carrying that label,
# which would discard the encoded column too. Instead, keep only the last
# occurrence of any duplicated column label (the encoded one comes last).
data3 = data3.loc[:, ~data3.columns.duplicated(keep='last')]

# K-NN

머신러닝은 지도학습과 비지도학습으로 나누어지는데, KNN은 지도학습의 대표적인 알고리즘이다.

<img src="img/KNN.PNG" style="width:600px;"/>

K값 : 가장 가깝게 접하는 요소들을 몇개까지 볼것인지 정해주는 것 <br>
위 사진을 예로 들자면 K=3으로 생각하면 A집단 2개, B집단 1개가 별에 대해 최근접 이웃이다. 결과적으로 A집단 2개 > B집단 1개이므로 파란색 별은 A집단이라고 예측하여 분류하게 된다.

In [32]:
# 데이터 불러오기
wine = pd.read_csv('data/wine.csv')

## 주제
#### 와인의 다른 정보들을 바탕으로 quality(품질)를 예측해보자

In [33]:
wine

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


와인의 품질이 3~6사이면 Good, 아니면 Bad라는 정보를 얻었다고 가정

In [34]:
# quality
def get_grade(data):
    """Map a single wine quality score to a grade label.

    Args:
        data: One numeric quality score (a scalar such as a single element
            of wine['quality']).

    Returns:
        'Good' when the score lies in the inclusive range 3..6,
        otherwise 'Bad'.
    """
    # A chained comparison expresses the range check directly; the bitwise
    # `&` operator is meant for element-wise/bit operations, not scalar logic.
    if 3 <= data <= 6:
        return 'Good'
    return 'Bad'

In [35]:
wine['grade'] = wine['quality'].apply(get_grade)

- 위와 같은 코드(list comprehension으로도 가능)

In [36]:
# Same rule as get_grade, written as a list comprehension: a score is 'Good'
# only inside the inclusive range 3..6. The lower bound is required for the
# two versions to agree on scores below 3.
['Good' if 3 <= x <= 6 else 'Bad' for x in wine['quality']]

['Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Bad',
 'Good',
 'Bad',
 'Good',
 'Good',
 'Bad',
 'Bad',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Bad',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Bad',
 'Bad',
 'Bad',
 'Good',
 'Good',
 'Bad',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 'Good',
 '

In [37]:
wine

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,grade
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,Good
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,Good
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,Good
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,Good
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,Good
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,Good
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,Good
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,Bad


### 종속변수, 독립변수 분할

In [38]:
# Independent variables (features): every column except the grade label and
# the raw quality score the label was computed from
X = wine.drop(columns=['grade', 'quality'])
# Dependent variable (target): the grade label to predict
y = wine['grade']

In [39]:
X

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


### StandardScaling
각 변수의 평균이 0, 표준편차가 1이 되도록 값의 크기(scale)를 맞춰준다 (분포의 모양 자체를 정규분포로 바꿔주는 것은 아니다)

In [41]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and learn the scaling statistics of every column of X.
# NOTE(review): the scaler is fitted on all of X, while the train/test split
# is performed only afterwards, so rows that end up in the test set also
# contribute to these statistics — confirm this is intended.
scaler = StandardScaler().fit(X)

# Apply the learned scaling to X
temp = scaler.transform(X)

# Put the scaled values back into a DataFrame under the original column names
X_scaled = pd.DataFrame(data=temp, columns=X.columns)

In [42]:
X_scaled

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,0.172097,-0.081770,0.213280,2.821349,-0.035355,0.569932,0.744565,2.331512,-1.246921,-0.349184,-1.393152
1,-0.657501,0.215896,0.048001,-0.944765,0.147747,-1.253019,-0.149685,-0.009154,0.740029,0.001342,-0.824276
2,1.475751,0.017452,0.543838,0.100282,0.193523,-0.312141,-0.973336,0.358665,0.475102,-0.436816,-0.336667
3,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.011480,-0.787342,-0.499203
4,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.011480,-0.787342,-0.499203
...,...,...,...,...,...,...,...,...,...,...,...
4893,-0.776015,-0.677101,-0.365197,-0.944765,-0.310008,-0.664970,-1.091000,-0.965483,0.541334,0.088973,0.557282
4894,-0.301959,0.414339,0.213280,0.317179,0.056196,1.275590,0.697499,0.291789,-0.253446,-0.261553,-0.743008
4895,-0.420473,-0.379435,-1.191592,-1.023637,-0.218457,-0.312141,-0.643875,-0.497350,-1.313153,-0.261553,-0.905544
4896,-1.605613,0.116674,-0.282557,-1.043355,-1.088192,-0.900190,-0.667408,-1.784717,1.004955,-0.962605,1.857572


### train(학습), test(검증) 분할

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state = 321)

In [46]:
X_train

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
2385,0.883181,-0.776323,2.857745,1.283355,-0.172682,1.216785,0.509236,1.465466,-0.915763,-0.699710,-1.474421
4190,0.172097,-0.577879,-0.282557,-0.984201,-0.264233,-1.253019,-1.773454,-1.396835,0.077712,-1.400763,1.207427
3985,-0.064931,-1.669319,-0.530476,0.435486,-0.081131,0.569932,-0.808605,0.171412,0.077712,-1.050236,-0.499203
1427,0.646153,-0.280214,1.287594,-1.043355,-0.172682,-0.018117,0.415105,-0.778230,-0.385910,0.527131,0.638550
804,0.527639,0.414339,0.048001,0.041129,-0.630437,-0.664970,-0.620342,0.341946,0.210175,-0.261553,-0.580471
...,...,...,...,...,...,...,...,...,...,...,...
168,0.764667,0.116674,-0.199917,0.504499,0.422400,-0.900190,0.297440,0.927112,-0.717068,0.439499,-0.986812
1425,-0.064931,-0.577879,1.287594,-1.082790,0.285074,-0.547361,-0.243816,-1.012297,0.408870,-1.225500,0.719818
3784,-0.420473,-0.280214,-0.530476,2.170659,0.834380,-0.370946,0.038578,1.248118,0.077712,0.001342,-0.336667
2847,-1.250071,-0.776323,0.048001,-1.063073,-0.493110,0.275907,-1.232197,-1.370084,2.064662,-0.612079,0.963623


In [76]:
# 데이터 크기 확인 : 8:2로 쪼개짐
X_train.shape, X_test.shape

((3918, 11), (980, 11))

### K-NN모델 적용

- n_neighbors : k의 개수
- p(거리계산방법) : 1(맨해튼), 2(유클리디안)
- weights(가중치) : uniform(동일), distance(거리기반 가중치)
- n_jobs : 병렬로 실행할 작업 수, -1을 사용하면 현재 컴퓨터의 모든 프로세서(CPU 코어) 사용

In [77]:
from sklearn.neighbors import KNeighborsClassifier

# K=3의 유클리디안 거리 방법으로 거리기반 가중치를 부여하여 모델 생성
model = KNeighborsClassifier(n_neighbors=3, p=2, weights='distance')
model

KNeighborsClassifier(n_neighbors=3, weights='distance')

In [78]:
# knn 모델 학습
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3, weights='distance')

In [79]:
# 예측한 확률 출력
model.predict_proba(X_test)

array([[0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       ...,
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.66679817, 0.33320183]])

#### 모델 정확도 파악

방법1 : score함수를 사용해서 한 번에

In [80]:
model.score(X_test, y_test)

0.8673469387755102

방법2 : 올바르게 예측한 개수 / 총 개수

In [81]:
# 학습한 모델을 바탕으로 class 예측
y_pred = model.predict(X_test)

In [82]:
# Accuracy: the fraction of test samples whose predicted class matches the
# true class (mean of the element-wise boolean comparison)
np.mean(y_pred == y_test)

0.8673469387755102