# 최근접 이웃(K-Nearest Neighbor)

* 특별한 예측 모델 없이 가장 가까운 데이터 포인트를 기반으로 예측을 수행하는 방법
* 분류와 회귀 모두 지원
* Non-parametric model

![k nearest neighbor](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/KnnClassification.svg/220px-KnnClassification.svg.png)

In [None]:
import pandas as pd
import numpy as np
import multiprocessing
import matplotlib.pyplot as plt
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old alias was later removed, so use whichever name this install has.
_whitegrid = 'seaborn-v0_8-whitegrid'
if _whitegrid not in plt.style.available:
    _whitegrid = 'seaborn-whitegrid'
plt.style.use([_whitegrid])

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.manifold import TSNE
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.datasets import load_boston, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline

## K 최근접 이웃 분류

* 입력 데이터 포인트와 가장 가까운 k개의 훈련 데이터 포인트가 출력
* k개의 데이터 포인트 중 가장 많은 클래스가 예측 결과

### 붓꽃 데이터

In [None]:
iris = load_iris()

In [None]:
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df

In [None]:
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
scaler = StandardScaler()
# scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

In [None]:
model = KNeighborsClassifier()
model.fit(X_train_scale, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train_scale, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test_scale, y_test)))

In [None]:
# 5-fold cross-validation of `model` on the full X, y,
# with one worker per CPU reported by multiprocessing.
cross_validator = cross_validate(
    model, X, y,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
    verbose=True,
)
cross_validator

In [None]:
param_grid = [{ 'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'algorithm': ['ball_tree', 'kd_tree', 'brute']}]


In [None]:
# from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(
          estimator=KNeighborsClassifier(),
          param_grid=param_grid,
          n_jobs=multiprocessing.cpu_count(),
          verbose=True
          )

In [None]:
gs.fit(X, y)

In [None]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=param_grid,
            n_jobs=multiprocessing.cpu_count(),
            verbose=True
            )

In [None]:
gs.fit(X, y)

In [None]:
gs.best_estimator_.get_params()

In [None]:
print('GridSearchCV best score:', gs.best_score_)

In [None]:
def make_meshgrid(x, y, h=.02, margin=1):
    """Build a regular 2-D grid covering the range of ``x`` and ``y``.

    Args:
        x: Array of coordinates for the horizontal axis.
        y: Array of coordinates for the vertical axis.
        h: Grid step used along both axes.
        margin: Padding subtracted from each minimum and added to each
            maximum so the grid extends past the outermost points.

    Returns:
        Tuple ``(xx, yy)`` of coordinate matrices from ``np.meshgrid``.
    """
    x_min, x_max = x.min() - margin, x.max() + margin
    y_min, y_max = y.min() - margin, y.max() + margin
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    return xx, yy

def plot_contours(clf, xx, yy, **params):
    """Draw filled contours of ``clf``'s predictions over a mesh grid.

    Args:
        clf: Fitted estimator exposing ``predict``.
        xx: Grid x-coordinates (as returned by ``np.meshgrid``).
        yy: Grid y-coordinates, same shape as ``xx``.
        **params: Extra keyword arguments forwarded to ``plt.contourf``.

    Returns:
        The object returned by ``plt.contourf``.
    """
    # Flatten the grid into one (x, y) row per grid node for predict().
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    # Fold the flat predictions back onto the grid layout.
    labels = clf.predict(grid_points).reshape(xx.shape)
    return plt.contourf(xx, yy, labels, **params)

In [None]:
tsne = TSNE(n_components=2)
X_comp = tsne.fit_transform(X)



In [None]:
iris_comp_df = pd.DataFrame(data=X_comp)
iris_comp_df['Target'] = y
iris_comp_df

In [None]:
plt.scatter(X_comp[:, 0], X_comp[:, 1],
            c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k');

In [None]:
model =  KNeighborsClassifier()

model.fit(X_comp, y)
predict = model.predict(X_comp)

In [None]:
xx, yy = make_meshgrid(X_comp[:, 0], X_comp[:, 1])
plot_contours(model, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_comp[:, 0], X_comp[:, 1], c=y, 
            cmap=plt.cm.coolwarm, s=20, edgecolors='k');

### 유방암 데이터

In [None]:
cancer = load_breast_cancer()

In [None]:
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_df['target'] = cancer.target
cancer_df

In [None]:
cancer_df.target.value_counts()

In [None]:
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2)


In [None]:
cancer_train_df = pd.DataFrame(data=X_train, columns=cancer.feature_names)
cancer_train_df['target'] = y_train
cancer_train_df

In [None]:
cancer_test_df = pd.DataFrame(data=X_test, columns=cancer.feature_names)
cancer_test_df['target'] = y_test
cancer_test_df

In [None]:
cancer_train_df.target.value_counts()

In [None]:
cancer_test_df.target.value_counts()

In [None]:
scaler = StandardScaler()

X_train_data = scaler.fit_transform(X_train)
X_test_data = scaler.transform(X_test)

#### 표준화 안 된 자료로 모델 구성

In [None]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

#### 표준화 후 모델 구성

In [None]:
model = KNeighborsClassifier()
model.fit(X_train_data, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train_data, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test_data, y_test)))

In [None]:
estimator = make_pipeline(
            StandardScaler(),
            KNeighborsClassifier())


In [None]:
cross_validate( estimator = estimator,
               X=X, y=y,
               cv=5, 
               n_jobs = multiprocessing.cpu_count(),
               verbose=True)


In [None]:
pipe = Pipeline( 
    [('scaler', StandardScaler()),
     ('model', KNeighborsClassifier())])



In [None]:
param_grid = [{'model__n_neighbors': [3, 5, 7],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]


In [None]:
gs = GridSearchCV(estimator= pipe,
                  param_grid=param_grid,
                  n_jobs=multiprocessing.cpu_count(),
                  verbose=True)

gs.fit(X, y)

In [None]:
gs.best_estimator_.get_params()

In [None]:
print('GridSearchCV best score {}'.format(gs.best_score_))

In [None]:
tsne = TSNE(n_components=2)
X_comp = tsne.fit_transform(X)

In [None]:
cancer_comp_df = pd.DataFrame(data=X_comp)
cancer_comp_df['target'] = y
cancer_comp_df

In [None]:
plt.scatter(X_comp[:, 0], X_comp[:, 1], c=y, cmap=plt.cm.coolwarm, 
            s= 20 , edgecolors='k');

In [None]:
model = KNeighborsClassifier()
model.fit(X_comp, y)
predict = model.predict(X_comp)

In [None]:
xx, yy = make_meshgrid(X_comp[:, 0], X_comp[:, 1])
plot_contours(model, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_comp[:, 0], X_comp[:, 1], c=y, cmap=plt.cm.coolwarm, 
            s= 20 , edgecolors='k');

### 와인 데이터

In [None]:
wine = load_wine()

In [None]:
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
# Labels must come from the wine dataset itself, not from the breast-cancer
# data loaded in the previous section.
wine_df['target'] = wine.target
wine_df

In [None]:
wine_df.target.value_counts()

In [None]:
X, y = wine.data, wine.target
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2)


In [None]:
wine_train_df = pd.DataFrame(data=X_train, columns=wine.feature_names)
wine_train_df['target'] = y_train
wine_train_df

In [None]:
# Column names must come from the wine dataset, matching wine_train_df.
wine_test_df = pd.DataFrame(data=X_test, columns=wine.feature_names)
wine_test_df['target'] = y_test
wine_test_df

In [None]:
wine_train_df.target.value_counts()

In [None]:
wine_test_df.target.value_counts()

In [None]:
scaler = StandardScaler()

X_train_data = scaler.fit_transform(X_train)
X_test_data = scaler.transform(X_test)

#### 표준화 안 된 자료로 모델 구성

In [None]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

#### 표준화 후 모델 구성

In [None]:
model = KNeighborsClassifier()
model.fit(X_train_data, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train_data, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test_data, y_test)))

In [None]:
estimator = make_pipeline(
            StandardScaler(),
            KNeighborsClassifier())


In [None]:
cross_validate( estimator = estimator,
               X=X, y=y,
               cv=5, 
               n_jobs = multiprocessing.cpu_count(),
               verbose=True)


In [None]:
pipe = Pipeline( 
    [('scaler', StandardScaler()),
     ('model', KNeighborsClassifier())])



In [None]:
param_grid = [{'model__n_neighbors': [3, 5, 7],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]


In [None]:
gs = GridSearchCV(estimator= pipe,
                  param_grid=param_grid,
                  n_jobs=multiprocessing.cpu_count(),
                  verbose=True)

gs.fit(X, y)

In [None]:
gs.best_estimator_.get_params()

In [None]:
print('GridSearchCV best score {}'.format(gs.best_score_))

In [None]:
tsne = TSNE(n_components=2)
X_comp = tsne.fit_transform(X)

In [None]:
wine_comp_df = pd.DataFrame(data=X_comp)
wine_comp_df['target'] = y
wine_comp_df

In [None]:
plt.scatter(X_comp[:, 0], X_comp[:, 1], c=y, cmap=plt.cm.coolwarm, 
            s= 20 , edgecolors='k');

In [None]:
model = KNeighborsClassifier()
model.fit(X_comp, y)
predict = model.predict(X_comp)

In [None]:
xx, yy = make_meshgrid(X_comp[:, 0], X_comp[:, 1])
plot_contours(model, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_comp[:, 0], X_comp[:, 1], c=y, cmap=plt.cm.coolwarm, 
            s= 20 , edgecolors='k');

## k 최근접 이웃 회귀

* k 최근접 이웃 분류와 마찬가지로 예측에 이웃 데이터 포인트 사용
* 이웃 데이터 포인트의 평균이 예측 결과

### 보스턴 주택 가격 데이터

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — this cell presumably needs an older scikit-learn or another data
# source; confirm against the installed version.
boston = load_boston()

In [None]:
boston_df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
boston_df['Target'] = boston.target
boston_df

In [None]:
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [None]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale= scaler.transform(X_test)

In [None]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
# This score is computed on the held-out split, so label it as the
# evaluation score rather than a second training score.
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))


In [None]:
model = KNeighborsRegressor()
model.fit(X_train_scale, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train_scale, y_train)))
# This score is computed on the held-out split, so label it as the
# evaluation score rather than a second training score.
print("평가 데이터 점수: {}".format(model.score(X_test_scale, y_test)))


In [None]:
estimator = make_pipeline(
            StandardScaler(),
            KNeighborsRegressor())


In [None]:
cross_validate( estimator = estimator,
               X=X, y=y,
               cv=5, 
               n_jobs = multiprocessing.cpu_count(),
               verbose=True)


In [None]:
pipe = Pipeline( 
    [('scaler', StandardScaler()),
     ('model', KNeighborsRegressor())])



In [None]:
param_grid = [{'model__n_neighbors': [3, 5, 7],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]


In [None]:
gs = GridSearchCV(estimator= pipe,
                  param_grid=param_grid,
                  n_jobs=multiprocessing.cpu_count(),
                  verbose=True)

gs.fit(X, y)

In [None]:
gs.best_estimator_.get_params()

In [None]:
print('GridSearchCV best score {}'.format(gs.best_score_))

In [None]:
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)

In [None]:
boston_comp_df = pd.DataFrame(data=X_comp)
boston_comp_df['target'] = y
boston_comp_df

In [None]:
# c is a fixed colour, so there is nothing to colour-map; the cmap argument
# was ignored (with a matplotlib warning) and is dropped here.
plt.scatter(X_comp, y, c='b', s=20, edgecolors='k');

In [None]:
model = KNeighborsRegressor()
model.fit(X_comp, y)
predict = model.predict(X_comp)

In [None]:
# Blue: actual targets; red: KNN predictions on the same 1-D embedding.
# Both use fixed colours, so the previously passed cmap (ignored with a
# matplotlib warning) is dropped.
plt.scatter(X_comp, y, c='b', s=20, edgecolors='k')
plt.scatter(X_comp, predict, c='r', s=20, edgecolors='k');

### 캘리포니아 주택 가격 데이터

In [None]:
california = fetch_california_housing()

In [None]:
california_df = pd.DataFrame(data=california.data, columns=california.feature_names)
california_df['Target'] = california.target
california_df

In [None]:
X, y = california.data, california.target
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [None]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale= scaler.transform(X_test)

In [None]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
# This score is computed on the held-out split, so label it as the
# evaluation score rather than a second training score.
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))


In [None]:
model = KNeighborsRegressor()
model.fit(X_train_scale, y_train)

In [None]:
print("학습 데이터 점수: {}".format(model.score(X_train_scale, y_train)))
# This score is computed on the held-out split, so label it as the
# evaluation score rather than a second training score.
print("평가 데이터 점수: {}".format(model.score(X_test_scale, y_test)))


In [None]:
estimator = make_pipeline(
            StandardScaler(),
            KNeighborsRegressor())


In [None]:
cross_validate( estimator = estimator,
               X=X, y=y,
               cv=5, 
               n_jobs = multiprocessing.cpu_count(),
               verbose=True)


In [None]:
pipe = Pipeline( 
    [('scaler', StandardScaler()),
     ('model', KNeighborsRegressor())])



In [None]:
param_grid = [{'model__n_neighbors': [ 9, 11, 13, 15],
               'model__weights': ['uniform', 'distance'],
               'model__algorithm': ['ball_tree', 'kd_tree', 'brute']}]


In [None]:
gs = GridSearchCV(estimator= pipe,
                  param_grid=param_grid,
                  n_jobs=multiprocessing.cpu_count(),
                  verbose=True)

gs.fit(X, y)


In [None]:
gs.best_estimator_.get_params()

In [None]:
print('GridSearchCV best score {}'.format(gs.best_score_))

In [None]:
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)

In [None]:
california_comp_df = pd.DataFrame(data=X_comp)
california_comp_df['target'] = y
california_comp_df

In [None]:
# c is a fixed colour, so there is nothing to colour-map; the cmap argument
# was ignored (with a matplotlib warning) and is dropped here.
plt.scatter(X_comp, y, c='b', s=20, edgecolors='k');

In [None]:
model = KNeighborsRegressor()
model.fit(X_comp, y)
predict = model.predict(X_comp)

In [None]:
# Blue: actual targets; red: KNN predictions on the same 1-D embedding.
# Both use fixed colours, so the previously passed cmap (ignored with a
# matplotlib warning) is dropped.
plt.scatter(X_comp, y, c='b', s=20, edgecolors='k')
plt.scatter(X_comp, predict, c='r', s=20, edgecolors='k');