In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# K Nearest Neighbors

In [None]:
# Load the gene-expression table from disk and preview its first five rows.
gene = pd.read_csv(filepath_or_buffer='../../DATA/gene_expression.csv')
gene.head(n=5)

In [None]:
# Scatter the two gene columns against each other, coloured by the label;
# alpha=0.5 keeps overlapping points visible.
sns.scatterplot(
    data=gene,
    x='Gene One',
    y='Gene Two',
    hue='Cancer Present',
    alpha=0.5,
)

In [None]:
# Pairwise plots of every numeric column, split by the 'Cancer Present' label.
sns.pairplot(gene, hue='Cancer Present')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the predictors (everything except the label) from the label column.
y = gene['Cancer Present']
X = gene.drop(columns='Cancer Present')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Feature scaler; it is fitted on the training split in the next cell.
scalar = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test set never influences the fit.
scalar.fit(X_train)
scaled_X_train = scalar.transform(X_train)
scaled_X_test = scalar.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline classifier that looks at a single nearest neighbour (k=1).
knn_model = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Train the baseline model on the standardized training features.
knn_model.fit(scaled_X_train,y_train)

In [None]:
# Predict labels for the standardized test features; the bare name on the
# last line displays the predictions as the cell output.
y_pred = knn_model.predict(scaled_X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Confusion matrix comparing the held-out labels with the baseline predictions.
confusion_matrix(y_test,y_pred)

In [None]:
# Text summary of the baseline predictions against the held-out labels;
# print() is used so the report's line breaks render.
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Sweep k = 1..29 and record the hold-out error rate (in percent) for each k.
test_error_rates = []

for k in range(1,30):
    # knn_model is rebound on every pass, so after the loop it holds the
    # model for the last k tried.
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(scaled_X_train,y_train)

    y_pred_test = knn_model.predict(scaled_X_test)

    # Score this iteration's own predictions. Scoring the earlier `y_pred`
    # would record one fixed model's accuracy for every k.
    acc = accuracy_score(y_test,y_pred_test)

    # Keep the rate numeric (percent, two decimals) so it plots on a
    # continuous axis instead of being treated as a text category.
    error = round(100 * (1 - acc),2)
    test_error_rates.append(error)

In [None]:
# Show the error rates collected by the k sweep, one entry per k.
test_error_rates

In [None]:
# Line chart of the recorded error rate against each k tried (1 through 29).
fig, ax = plt.subplots()
ax.plot(range(1,30), test_error_rates)
ax.set_xlabel('K Neighbors')
ax.set_ylabel('ERROR RATE')
plt.show()

In [None]:
# Display the scaler object created earlier; it is reused as a pipeline step below.
scalar

In [None]:
# Fresh classifier with default settings; n_neighbors is chosen later by the grid search.
knn = KNeighborsClassifier()

In [None]:
# List the classifier's parameter names (these are the names a search grid can refer to).
knn.get_params().keys()

In [None]:
# Ordered (name, estimator) steps for the pipeline: scale first, then classify.
# The step names are what the grid-search keys are prefixed with.
operations = [
    ('scalar', scalar),
    ('knn', knn),
]

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the scaling and KNN steps into a single estimator.
pipe = Pipeline(operations)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts for the grid search: 1 through 19.
k_values = list(range(1,20))
k_values

In [None]:
# Grid key is '<step name>__<parameter>': it targets n_neighbors on the 'knn' step.
param_grid = {'knn__n_neighbors':k_values}

In [None]:
# 5-fold cross-validated search over the candidate k values, ranked by accuracy.
full_cv_classifier = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

In [None]:
# Fit on the unscaled training features: the pipeline's own scaler step handles scaling.
full_cv_classifier.fit(X_train,y_train)

In [None]:
# Inspect the parameters of the best pipeline found by the search.
full_cv_classifier.best_estimator_.get_params()

In [None]:
# Predict the held-out set with the tuned search object (raw features go in;
# the pipeline scales them). The bare name displays the predictions.
prediction = full_cv_classifier.predict(X_test)
prediction

In [None]:
# Text summary of the tuned model's predictions against the held-out labels.
print(classification_report(y_test,prediction))

In [None]:
# One unseen sample, with values in the same column order as X.
# It is wrapped in a DataFrame carrying X's column names because the pipeline
# was fitted on a DataFrame; a bare nested list makes scikit-learn warn that
# the input does not have valid feature names.
new_patient = pd.DataFrame([[3.8,7.3]], columns=X.columns)
full_cv_classifier.predict(new_patient)

In [None]:
# Class-membership probabilities for the same sample.
full_cv_classifier.predict_proba(new_patient)