In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
# Public Google Drive share link for the dataset (CSV).
share_link = 'https://drive.google.com/file/d/1oXrF8KHsElHXI_u-BGuijMe4F83U7dvj/view?usp=drive_link'
# The file id is the second-to-last path segment of the share link; the
# `uc?id=<file id>` form of the URL is what gets handed to pandas.
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(url)

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [45]:
# Separate the label column from the remaining feature columns.
y = df['Cancer Present']
X = df.drop(columns='Cancer Present')

In [46]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [47]:
# Standardization step; it is not fitted here, only placed into the pipeline below.
scaler = StandardScaler()

In [48]:
# KNN classifier left at its defaults; n_neighbors is chosen later by the grid search.
knn = KNeighborsClassifier()

In [49]:
# Ordered (step name, estimator) pairs for the pipeline. The step names become
# the prefixes of grid-search parameter keys (e.g. 'knn' -> 'knn__n_neighbors').
operations = [
    ('scaler', scaler),
    ('knn', knn),
]

In [50]:
from sklearn.pipeline import Pipeline

In [51]:
# Chain scaling and KNN so both are fitted together on whatever data the pipeline sees.
pipe = Pipeline(steps=operations)

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
# Candidate neighbor counts for the grid search: 1 through 19 inclusive.
# NOTE(review): the recorded best estimator uses n_neighbors=19, the top of this
# range, so a wider range may be worth trying.
k_values = [k for k in range(1, 20)]

---
**Note: If your parameter grid is going inside a Pipeline, your parameter name needs to be specified in the following manner:**

* chosen_string_name + **two** underscores + parameter key name
* model_name + __ + parameter name
* knn_model + __ + n_neighbors
* knn_model__n_neighbors (in this notebook the pipeline step is named `knn`, so the key is `knn__n_neighbors`)

[StackOverflow on this](https://stackoverflow.com/questions/41899132/invalid-parameter-for-sklearn-estimator-pipeline)

The reason we have to do this is that it lets scikit-learn know which operation in the pipeline these parameters belong to (otherwise it might think n_neighbors was a parameter of the scaler).

---

In [63]:
# Grid over the KNN step only: the '<step name>__<parameter>' key routes
# n_neighbors to the pipeline step named 'knn'.
param_grid = {
    'knn__n_neighbors': k_values,
}

In [64]:
# 5-fold cross-validated search over the pipeline, ranking candidates by accuracy.
full_cv_classifier = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [65]:
# Fit on the training split only, so (X_test, y_test) stays available as a hold-out set.
# To cross-validate on all the data with no hold-out set, pass the full X and y instead.
full_cv_classifier.fit(X=X_train, y=y_train)

In [66]:
# Show every parameter of the best pipeline found by the search; the chosen
# neighbor count appears under the 'knn__n_neighbors' key.
full_cv_classifier.best_estimator_.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('knn', KNeighborsClassifier(n_neighbors=19))],
 'verbose': False,
 'scaler': StandardScaler(),
 'knn': KNeighborsClassifier(n_neighbors=19),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 19,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [67]:
# Mean cross-validated accuracy per candidate, in the same order as k_values
# (first entry is k=1, last entry is k=19).
full_cv_classifier.cv_results_['mean_test_score']

array([0.89666667, 0.90333333, 0.91714286, 0.91761905, 0.92428571,
       0.92333333, 0.92666667, 0.92714286, 0.92666667, 0.92714286,
       0.9247619 , 0.92714286, 0.92857143, 0.92809524, 0.92571429,
       0.92952381, 0.92666667, 0.93      , 0.93047619])

In [71]:
# Take one held-out row (position 20 of X_test) as a Series to demonstrate
# prediction on a single sample.
single_sample = X_test.iloc[20]

In [None]:
# Predict the class of the single held-out sample. The row is passed as a
# one-row DataFrame rather than a bare NumPy array so the column names the
# pipeline was fitted with are preserved; a name-less array makes scikit-learn
# warn that X does not have valid feature names.
full_cv_classifier.predict(single_sample.to_frame().T)

In [75]:
# Class-membership probabilities for the single held-out sample (one row, one
# column per class). The row is passed as a one-row DataFrame rather than a bare
# NumPy array so the column names the pipeline was fitted with are preserved;
# a name-less array makes scikit-learn warn that X does not have valid feature names.
full_cv_classifier.predict_proba(single_sample.to_frame().T)



array([[0., 1.]])