# 19기 KNN 정규세션 과제

## KNN 구현해보기
### 1. Preprocessing / EDA
지금까지 배운 내용을 토대로 해당 데이터에 대해 자유롭게 전처리와 EDA를 진행해주세요.
### 2. KNN 구현 & 파라미터 튜닝
수업 내용 및 실습 자료를 참고하여 KNN을 구현하고 파라미터 튜닝을 하며 결과를 비교해주세요.
### 3. Evaluation
결과에 대한 평가를 진행하고, 나름의 해석을 달아주세요.

**데이터:** [blackfriday | Kaggle](https://www.kaggle.com/llopesolivei/blackfriday)

---

## 0. 데이터 불러오기

In [79]:
import pandas as pd
# Load the Black Friday dataset, using the first CSV column as the row index.
df = pd.read_csv("blackfriday.csv", index_col = 0)
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1001088,P00046042,F,0-17,10,A,3,0,5,17.0,,2010
1,1004493,P00347742,F,0-17,10,A,1,0,7,,,4483
2,1005302,P00048942,F,0-17,10,A,1,0,1,4.0,,7696
3,1001348,P00145242,F,0-17,10,A,3,0,2,4.0,,16429
4,1001348,P00106742,F,0-17,10,A,3,0,3,5.0,,5780


## 1. PreProcessing: 결측치 처리, 이상치 제거, X/y 구분, 원핫인코딩

In [80]:
# Convert inf and -inf to NaN so they are caught by the missing-value checks below.
import numpy as np

df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [81]:
# Report, for every column, whether it contains at least one missing value.
for column_name in df.columns:
    has_missing = df[column_name].isna().to_numpy().any()
    print(column_name, has_missing)

# Only Product_Category_2 and Product_Category_3 contain missing values.

User_ID False
Product_ID False
Gender False
Age False
Occupation False
City_Category False
Stay_In_Current_City_Years False
Marital_Status False
Product_Category_1 False
Product_Category_2 True
Product_Category_3 True
Purchase False


In [82]:
# Both columns hold categorical codes, so missing entries are replaced with
# the most frequent value (mode) of the respective column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that chained in-place form operates on an
# intermediate object, emits a FutureWarning in pandas 2.x and leaves df
# unchanged when Copy-on-Write is active.
for col in ['Product_Category_2', 'Product_Category_3']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Re-run the per-column check; every column should now print False.
for col in df:
    print(col, df[col].isnull().values.any())

User_ID False
Product_ID False
Gender False
Age False
Occupation False
City_Category False
Stay_In_Current_City_Years False
Marital_Status False
Product_Category_1 False
Product_Category_2 False
Product_Category_3 False
Purchase False


In [83]:
# Goal: a KNN regressor that predicts Purchase from all the other data.
# The two ID columns and the target itself are left out of the feature frame.
X = df.drop(labels=['User_ID', 'Product_ID', 'Purchase'], axis='columns')
X.describe()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
count,4998.0,4998.0,4998.0,4998.0,4998.0
mean,8.036815,0.421369,5.320128,9.229692,15.0012
std,6.442697,0.493828,3.912281,4.32584,2.689077
min,0.0,0.0,1.0,2.0,3.0
25%,2.0,0.0,1.0,8.0,16.0
50%,7.0,0.0,5.0,8.0,16.0
75%,14.0,1.0,8.0,14.0,16.0
max,20.0,1.0,20.0,18.0,18.0


In [84]:
# Target variable: the purchase amount.
y_data = df['Purchase']
y_data.describe()
# The max is not far from Q3 + IQR, so outlier removal is skipped.
# NOTE(review): with the summary shown for this data (Q1=5497.75, Q3=12039),
# the conventional fence Q3 + 1.5*IQR is about 21851 while max is 23913, so
# some values do lie beyond it -- confirm that skipping removal is intended.

count     4998.000000
mean      9234.655462
std       4999.225081
min         13.000000
25%       5497.750000
50%       8049.000000
75%      12039.000000
max      23913.000000
Name: Purchase, dtype: float64

In [85]:
# Print the distinct values of every feature to spot irregular entries.
for feature in X.columns:
    distinct_values = X[feature].unique()
    print(distinct_values)

# The unique values of Gender, Age, City_Category, Stay_In_Current_City_Years
# and the Product_Category columns show no irregular entries.
# One-hot encoding is therefore applied directly, without extra cleaning.
# Every variable in X is categorical, so all features must be one-hot encoded.

['F' 'M']
['0-17' '18-25' '26-35' '36-45' '46-50' '51-55' '55+']
[10  2  0 19  1  7 12  8  4  9  3 20 15 14 16 17  5  6 11 18 13]
['A' 'B' 'C']
['3' '1' '2' '4+' '0']
[0 1]
[ 5  7  1  2  3  8 16 14 10 19 11 12  4 17 15  6 13 20 18  9]
[17.  8.  4.  5. 15. 12. 11.  2. 14.  3. 18. 16.  6. 13. 10.  9.  7.]
[16.  9.  5. 15.  8. 14. 17.  4. 18. 12. 13.  6. 10. 11.  3.]


In [86]:
# Cast every feature to str so the numerically coded columns are also
# treated as categories by get_dummies.
X = X.astype(str)

# One-hot encode all features.
X_data = pd.get_dummies(X)


In [87]:
# Lift the column display limit so every dummy column is shown.
pd.set_option('display.max_columns', None)
X_data.head()
# One-hot encoding has been applied to all of X.

Unnamed: 0,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_0,Occupation_1,Occupation_10,Occupation_11,Occupation_12,Occupation_13,Occupation_14,Occupation_15,Occupation_16,Occupation_17,Occupation_18,Occupation_19,Occupation_2,Occupation_20,Occupation_3,Occupation_4,Occupation_5,Occupation_6,Occupation_7,Occupation_8,Occupation_9,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Marital_Status_0,Marital_Status_1,Product_Category_1_1,Product_Category_1_10,Product_Category_1_11,Product_Category_1_12,Product_Category_1_13,Product_Category_1_14,Product_Category_1_15,Product_Category_1_16,Product_Category_1_17,Product_Category_1_18,Product_Category_1_19,Product_Category_1_2,Product_Category_1_20,Product_Category_1_3,Product_Category_1_4,Product_Category_1_5,Product_Category_1_6,Product_Category_1_7,Product_Category_1_8,Product_Category_1_9,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0,Product_Category_2_2.0,Product_Category_2_3.0,Product_Category_2_4.0,Product_Category_2_5.0,Product_Category_2_6.0,Product_Category_2_7.0,Product_Category_2_8.0,Product_Category_2_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0,Product_Category_3_3.0,Product_Category_3_4.0,Product_Category_3_5.0,Product_Category_3_6.0,Product_Category_3_8.0,Product_Category_3_9.0
0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


## 2. KNN Model 만들기

In [88]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator with default settings; hyperparameters are chosen by the grid search below.
knn = KNeighborsRegressor()

In [89]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardize the target; StandardScaler needs a 2-D array, hence the
# reshape to a single column.
# NOTE(review): the scaler is fit on all of y before the train/test split in
# the next cell, so test-set statistics enter the scaling -- consider
# fitting on the training targets only.
y_data = y_data.to_numpy().reshape(-1, 1)
scaler.fit(y_data)
y_data = scaler.transform(y_data)

In [90]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 42)

In [91]:
# Search grid: odd neighbor counts from 1 to 39, Minkowski exponent p of
# 1 (Manhattan) or 2 (Euclidean), and both neighbor-weighting schemes.
params_1 = dict(
    n_neighbors=list(range(1, 40, 2)),
    p=[1, 2],
    weights=['uniform', 'distance'],
)

In [92]:
# Exhaustive search over params_1 with 5-fold cross-validation.
grid_cv = GridSearchCV(knn, param_grid = params_1, cv = 5)

In [93]:
# Run the search on the training split only, then show the winning combination.
grid_cv.fit(X_train, Y_train)
grid_cv.best_params_

{'n_neighbors': 19, 'p': 1, 'weights': 'distance'}

In [94]:
# Build the final regressor from the grid-search winner rather than from
# hand-copied values, so this cell cannot go stale when the data or the
# parameter grid change.
knn_searched = KNeighborsRegressor(**grid_cv.best_params_)
knn_searched.fit(X_train, Y_train)

In [104]:
# Predict on the held-out test set (still in standardized target units).
test_y_pred = knn_searched.predict(X_test)
# Undo the target scaling on both predictions and ground truth before scoring.
descaled_test_y_pred = scaler.inverse_transform(test_y_pred)
descaled_y_test = scaler.inverse_transform(Y_test)

In [106]:
from sklearn.metrics import mean_squared_error
# Test-set MSE, computed on the inverse-transformed (de-scaled) values.
print(mean_squared_error(descaled_y_test, descaled_test_y_pred))
# The MSE comes out fairly large. When many of the predictors are categorical, a KNN regressor may not be an appropriate choice.
# However, the principle behind KNN regression can still be useful for imputing missing values.

15580483.311560461
