In [33]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler, normalize
from sklearn.metrics import roc_auc_score,log_loss,accuracy_score
from sklearn.pipeline import Pipeline

In [34]:
# Load the Kyphosis data set and split it into predictors and an encoded target.
# The CSV location is machine-specific, so it can be overridden through the
# KYPHOSIS_CSV environment variable; when the variable is unset the original
# path is used, which keeps the notebook's previous behaviour.
KYPHOSIS_CSV = os.environ.get(
    "KYPHOSIS_CSV",
    r"C:\Users\Administrator.DAI-PC2\Downloads\Shubham\Practical Machine Learning\ClassWork\Cases\Kyphosis\Kyphosis.csv",
)
kyp = pd.read_csv(KYPHOSIS_CSV)

# Encode the labels of the "Kyphosis" column as integers for the metrics below.
le = LabelEncoder()
y = le.fit_transform(kyp["Kyphosis"])

# Every remaining column is used as a predictor.
X = kyp.drop(columns="Kyphosis")

In [35]:
# Stratified 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [4]:
## without scaling

In [5]:
# Baseline: 3-nearest-neighbours trained directly on the raw training features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = knn.predict(X_test), knn.predict_proba(X_test)

In [6]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.84
0.78
0.37040121807881654


In [None]:
### with normalizing (sklearn.preprocessing.normalize)

In [39]:
# normalize() is a plain function with nothing to fit, so the train and test
# matrices are each rescaled on their own.
X_norm_trn, X_norm_tst = normalize(X_train), normalize(X_test)

# 8-nearest-neighbours on the normalized training features.
knn = KNeighborsClassifier(n_neighbors=8).fit(X_norm_trn, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = knn.predict(X_norm_tst), knn.predict_proba(X_norm_tst)

In [40]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.76
0.74
0.46669084720049014


In [7]:
### with standard scaler

In [8]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both partitions so nothing from the test set leaks in.
scl_std = StandardScaler().fit(X_train)
X_scl_trn, X_scl_tst = scl_std.transform(X_train), scl_std.transform(X_test)

# 8-nearest-neighbours on the standardized training features.
knn = KNeighborsClassifier(n_neighbors=8).fit(X_scl_trn, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = knn.predict(X_scl_tst), knn.predict_proba(X_scl_tst)

In [9]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.76
0.84
0.36779148200891243


In [10]:
### with MinMax scaler

In [11]:
# Learn the per-feature min/max from the training rows only, then apply the
# same fitted scaler to both partitions.
scl_mm = MinMaxScaler().fit(X_train)
X_mm_trn, X_mm_tst = scl_mm.transform(X_train), scl_mm.transform(X_test)

# 3-nearest-neighbours on the min-max scaled training features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_mm_trn, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = knn.predict(X_mm_tst), knn.predict_proba(X_mm_tst)

In [12]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.76
0.845
1.7425616149199414


In [13]:
### KNN using a pipeline with standard scaling

In [14]:
# Same StandardScaler + 8-NN model as the manual version, but wrapped in a
# Pipeline so scaling is fitted on the training rows and reapplied at predict time.
scl_std = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=8)
pipe_std = Pipeline(steps=[("StdScalling", scl_std), ("Knn", knn)]).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = pipe_std.predict(X_test), pipe_std.predict_proba(X_test)

In [15]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.76
0.84
0.36779148200891243


In [16]:
### KNN using a pipeline with MinMax scaling

In [17]:
# Same MinMaxScaler + 3-NN model as the manual version, but wrapped in a
# Pipeline so scaling is fitted on the training rows and reapplied at predict time.
scl_mm = MinMaxScaler()
knn = KNeighborsClassifier(n_neighbors=3)
pipe_mm = Pipeline(steps=[("MinMaxScalling", scl_mm), ("Knn", knn)]).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the hold-out rows.
y_pred, y_pred_proba = pipe_mm.predict(X_test), pipe_mm.predict_proba(X_test)

In [18]:
# Hold-out metrics, one per line: accuracy, ROC AUC (from the column-1
# probabilities) and log loss.
print(
    accuracy_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba[:, 1]),
    log_loss(y_test, y_pred_proba),
    sep="\n",
)

0.76
0.845
1.7425616149199414


In [19]:
### KNN using a pipeline with standard scaling and grid search

In [21]:
# Unfitted StandardScaler -> KNN pipeline with default hyper-parameters; the
# number of neighbours is left for the grid search to choose.
scl_std = StandardScaler()
knn = KNeighborsClassifier()
pipe_std = Pipeline(steps=[("StdScalling", scl_std), ("Knn", knn)])

# Print the parameter names, which are the keys a param_grid can use.
print(pipe_std.get_params(deep=True))

{'memory': None, 'steps': [('StdScalling', StandardScaler()), ('Knn', KNeighborsClassifier())], 'verbose': False, 'StdScalling': StandardScaler(), 'Knn': KNeighborsClassifier(), 'StdScalling__copy': True, 'StdScalling__with_mean': True, 'StdScalling__with_std': True, 'Knn__algorithm': 'auto', 'Knn__leaf_size': 30, 'Knn__metric': 'minkowski', 'Knn__metric_params': None, 'Knn__n_jobs': None, 'Knn__n_neighbors': 5, 'Knn__p': 2, 'Knn__weights': 'uniform'}


In [22]:
# Tune k over 1..10 for the StandardScaler pipeline with shuffled, stratified
# 5-fold cross-validation, scored by negative log loss.
# The search is fitted on the full X, y rather than on the earlier training split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = dict(Knn__n_neighbors=np.arange(1, 11))
gcv = GridSearchCV(
    estimator=pipe_std,
    param_grid=params,
    scoring="neg_log_loss",
    cv=kfold,
)
gcv.fit(X, y)


In [23]:
# Winning parameter combination and its mean cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'Knn__n_neighbors': 10}
-0.3545562027841026


In [None]:
### KNN using a pipeline with MinMax scaling and grid search

In [25]:
# Unfitted MinMaxScaler -> KNN pipeline with default hyper-parameters; the
# number of neighbours is left for the grid search to choose.
scl_mm = MinMaxScaler()
knn = KNeighborsClassifier()
pipe_mm = Pipeline(steps=[("MinMaxScalling", scl_mm), ("Knn", knn)])

# Print the parameter names, which are the keys a param_grid can use.
print(pipe_mm.get_params(deep=True))

{'memory': None, 'steps': [('MinMaxScalling', MinMaxScaler()), ('Knn', KNeighborsClassifier())], 'verbose': False, 'MinMaxScalling': MinMaxScaler(), 'Knn': KNeighborsClassifier(), 'MinMaxScalling__clip': False, 'MinMaxScalling__copy': True, 'MinMaxScalling__feature_range': (0, 1), 'Knn__algorithm': 'auto', 'Knn__leaf_size': 30, 'Knn__metric': 'minkowski', 'Knn__metric_params': None, 'Knn__n_jobs': None, 'Knn__n_neighbors': 5, 'Knn__p': 2, 'Knn__weights': 'uniform'}


In [26]:
# Tune k over 1..10 for the MinMaxScaler pipeline with shuffled, stratified
# 5-fold cross-validation, scored by negative log loss.
# The search is fitted on the full X, y rather than on the earlier training split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = dict(Knn__n_neighbors=np.arange(1, 11))
gcv = GridSearchCV(
    estimator=pipe_mm,
    param_grid=params,
    scoring="neg_log_loss",
    cv=kfold,
)
gcv.fit(X, y)

In [27]:
# Winning parameter combination and its mean cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'Knn__n_neighbors': 9}
-0.3541342613432673


In [28]:
### KNN using a pipeline with different scalers and grid search

In [29]:
# Candidate scalers that the grid search can swap into the "Scl" step.
scl_std = StandardScaler()
scl_mm = MinMaxScaler()

# The pipeline starts with no scaler (None) in the "Scl" slot, followed by KNN.
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[("Scl", None), ("Knn", knn)])

# Print the parameter names, which are the keys a param_grid can use.
print(pipe.get_params(deep=True))

{'memory': None, 'steps': [('Scl', None), ('Knn', KNeighborsClassifier())], 'verbose': False, 'Scl': None, 'Knn': KNeighborsClassifier(), 'Knn__algorithm': 'auto', 'Knn__leaf_size': 30, 'Knn__metric': 'minkowski', 'Knn__metric_params': None, 'Knn__n_jobs': None, 'Knn__n_neighbors': 5, 'Knn__p': 2, 'Knn__weights': 'uniform'}


In [30]:
# Search jointly over k = 1..10 and the scaler choice (MinMax, Standard, or
# none) with shuffled, stratified 5-fold cross-validation, scored by negative
# log loss.
# The search is fitted on the full X, y rather than on the earlier training split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = dict(
    Knn__n_neighbors=np.arange(1, 11),
    Scl=[scl_mm, scl_std, None],
)
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="neg_log_loss",
    cv=kfold,
)
gcv.fit(X, y)

In [31]:
# Winning parameter combination and its mean cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'Knn__n_neighbors': 9, 'Scl': MinMaxScaler()}
-0.3541342613432673
