<img src="../images/topcover.jpg" width="1000" height="50">

## Colorectal cancer dataset

In [3]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [4]:
# Parse the CSV once: `colorectaly` keeps every column (including 'label'),
# while `colorectalx` is a separate copy with the 'label' column removed.
# Using the non-mutating drop keeps this cell safe to re-run.
colorectaly = pd.read_csv('../data/hmnist_28_28_L.csv')
colorectalx = colorectaly.drop(columns='label')

In [5]:
# Feature matrix (every non-label column) and target vector (the 'label'
# column) used to fit the KNN model.
X, y = colorectalx, colorectaly['label']

In [6]:
# Check the class distribution since this is a classification problem.
# normalize=True reports each class as a fraction of the samples rather than a
# raw count, so the result doubles as the majority-class baseline score.

y.value_counts(normalize = True)

7    0.125
3    0.125
6    0.125
2    0.125
5    0.125
1    0.125
8    0.125
4    0.125
Name: label, dtype: float64

In [7]:
# Split the data into training and validation partitions: 33% of the rows are
# held out, stratified on y, with a fixed random_state so the split is
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [8]:
# Standardise the features. The scaler is fitted on the training partition
# only, and that same fitted scaler is then applied to the validation
# partition.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)


## K Nearest Neighbour Model

In [9]:
# Candidate hyper-parameters for the KNN grid search:
# neighbour counts 1 through 9, crossed with both weighting schemes.
k_range = [*range(1, 10)]
weight_options = ["uniform", "distance"]

param_grid = {
    "n_neighbors": k_range,
    "weights": weight_options,
}

In [10]:
# Grid search over param_grid with 5-fold cross-validation on the training
# partition to pick the KNN hyper-parameters.
knn = KNeighborsClassifier()
opt_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
opt_knn.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'weights': ['uniform', 'distance']})

In [11]:
# Check which parameter combination from param_grid the grid search
# selected as best.

opt_knn.best_params_

{'n_neighbors': 2, 'weights': 'uniform'}

In [12]:
# Generate class predictions for the validation features using the fitted
# grid search.
predictions1 = opt_knn.predict(X_val)

In [13]:
# Score on the training partition; compare it with the validation score below
# to gauge how much the model overfits.
opt_knn.score(X_train, y_train)

0.5713432835820895

In [14]:
# Score on the held-out validation partition; this is the KNN figure reported
# in the summary table.
opt_knn.score(X_val, y_val)

0.4315151515151515

#### The KNN model scores lower than Random Forests (0.432 vs. 0.684 accuracy)

**Summary table for Colorectal Cancer Classification models for 8 Tissue type classes:**

| Model| Test Accuracy|Baseline score|
|:---------:|:---:|:--------:|
|  Random Forests |    0.684 |  0.125  |
|KNN| 0.432| 0.125|
