# KNN for TCGA cancer-type detection based on miRNA-seq data

## Dataset: isoforms from 17 different classes of cancer from TCGA

## Labels:
* BLCA: bladder 
* BRCA: breast 
* CHOL: bile duct 
* COAD: colon
* ESCA: esophagus 
* HNSC: head and neck
* KICH: kidney chromophobe
* KIRC: kidney renal clear cell
* LICH: liver (official TCGA code is LIHC; the dataset's label file spells it `lich`)
* LUAD: lung
* PRAD: prostate
* STAD: stomach
* THCA: thyroid
* UCEC: uterus
* PAAD: pancreas
* SKCM: skin melanoma
* OV: ovary

## Reference: https://github.com/programmingprincess/tumor-origin

In [None]:
import numpy
import pandas as pd 

from numpy import random

from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import pickle
import bz2

In [None]:
# Fetch the three input files from the tumor-origin repository into the
# current working directory. `-N` turns on wget's timestamping, which asks it
# to skip the download when the local copy is not older than the remote one.
# raw.txt           -> loaded below as `data`
# types-numeric.txt -> loaded below as `types`
# types-labels.txt  -> loaded below as `labels`
!wget "-N" "https://github.com/programmingprincess/tumor-origin/raw/master/mirna/raw.txt"
!wget "-N" "https://github.com/programmingprincess/tumor-origin/raw/master/mirna/types-numeric.txt"
!wget "-N" "https://github.com/programmingprincess/tumor-origin/raw/master/mirna/types-labels.txt"

--2023-02-24 23:07:05--  https://github.com/programmingprincess/tumor-origin/raw/master/mirna/raw.txt
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/programmingprincess/tumor-origin/master/mirna/raw.txt [following]
--2023-02-24 23:07:05--  https://raw.githubusercontent.com/programmingprincess/tumor-origin/master/mirna/raw.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55798810 (53M) [text/plain]
Saving to: ‘raw.txt’


Last-modified header missing -- time-stamps turned off.
2023-02-24 23:07:06 (95.6 MB/s) - ‘raw.txt’ saved [55798810/55798810]

--2023-02-24 23:07:06--  https://github.com/pro

In [None]:
# Load the three tab-separated files downloaded above, in the order
# feature table, numeric class ids, string class labels.
# NOTE(review): read_csv treats the first line of each file as a header by
# default -- confirm the files really start with a header row.
data, types, labels = (
    pd.read_csv(filename, sep='\t')
    for filename in ('raw.txt', 'types-numeric.txt', 'types-labels.txt')
)

In [None]:
# Random train/test partition (roughly 70% / 30% of the rows).
# `random` is numpy.random, so seeding it makes the mask below reproducible.
random.seed(69)
# One uniform draw in [0, 1) per row of `data`; True marks a training row.
ii = random.rand(len(data)) < 0.7

# Plain NumPy views of the three tables.
np_data, np_types, np_labels = data.values, types.values, labels.values

# Features: rows selected by the mask train the model, the rest are held out.
train, test = np_data[ii], np_data[~ii]

# The same partition kept as DataFrames.
pand_train, pand_test = data[ii], data[~ii]

# Numeric class ids (presumably 0-16, one per cancer type -- verify against the file).
train_types, test_types = np_types[ii], np_types[~ii]

# String class labels.
train_labels, test_labels = np_labels[ii], np_labels[~ii]

# Flatten the target arrays to 1-D.
r_train_types, r_test_types = train_types.ravel(), test_types.ravel()
r_train_labels, r_test_labels = train_labels.ravel(), test_labels.ravel()

In [None]:
# k-nearest-neighbours classifier using the 3 closest training samples.
knnmodel = KNeighborsClassifier(n_neighbors=3)

# fit() returns the estimator itself, so `knn` and `knnmodel` are the same object.
knn = knnmodel.fit(X=train, y=r_train_labels)

# Predicted string label for every held-out sample.
knn_pred = knn.predict(X=test)

In [None]:
# Fraction of held-out samples whose predicted label equals the true label.
print("Accuracy: ", accuracy_score(r_test_labels, knn_pred))

# Confusion matrix as a raw array.
knn_cm = confusion_matrix(y_true=r_test_labels, y_pred=knn_pred)

# Wrap both label vectors as Series so crosstab can tabulate them.
y_true, knn_pred = pd.Series(r_test_labels), pd.Series(knn_pred)

# True-vs-predicted count table with row/column totals ("All"); as the last
# expression of the cell it is what the notebook displays.
pd.crosstab(
    index=y_true,
    columns=knn_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Accuracy:  0.8320707070707071


Predicted,blca,brca,chol,coad,esca,hnsc,kich,kirc,lich,luad,ov,paad,prad,skcm,stad,thca,ucec,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
blca,100,15,0,3,13,1,0,0,0,0,0,0,0,0,0,0,0,132
brca,10,358,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,373
chol,1,1,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,12
coad,4,3,1,145,1,0,0,0,0,0,0,0,0,0,0,0,0,154
esca,31,3,0,6,25,0,0,0,0,0,0,0,0,0,1,0,0,66
hnsc,2,0,0,0,2,136,0,1,2,9,0,0,2,0,3,0,0,157
kich,0,0,0,0,0,0,27,3,0,0,0,0,0,0,0,0,0,30
kirc,1,2,0,0,0,1,2,170,0,1,0,0,0,0,0,0,1,178
lich,1,1,0,0,0,4,0,2,92,7,0,1,3,0,5,3,1,120
luad,3,7,0,1,0,8,0,2,4,114,1,11,3,1,7,6,5,173


In [None]:
# Per-class precision / recall / F1 / support for the KNN predictions.
knn_report = classification_report(y_true, knn_pred)
print(knn_report)

# Overall accuracy as a fraction of correctly classified samples.
print(accuracy_score(y_true, knn_pred))

              precision    recall  f1-score   support

        blca       0.60      0.76      0.67       132
        brca       0.87      0.96      0.91       373
        chol       0.90      0.75      0.82        12
        coad       0.90      0.94      0.92       154
        esca       0.58      0.38      0.46        66
        hnsc       0.81      0.87      0.84       157
        kich       0.90      0.90      0.90        30
        kirc       0.87      0.96      0.91       178
        lich       0.81      0.77      0.79       120
        luad       0.66      0.66      0.66       173
          ov       0.99      0.96      0.97       144
        paad       0.64      0.54      0.59        50
        prad       0.85      0.93      0.89       170
        skcm       0.97      0.76      0.85       123
        stad       0.79      0.66      0.72       146
        thca       0.93      0.89      0.91       169
        ucec       0.89      0.78      0.83       179

    accuracy              

In [None]:
# Save the trained model as a bz2-compressed pickle.
# The `with` block closes the file even if pickle.dump raises, so a failed
# dump no longer leaves the handle open.
# NOTE: unpickling executes arbitrary code -- only load this file from a
# trusted source.
with bz2.BZ2File("knn_model.pk.bz2", 'wb') as ofile:
    pickle.dump(knn, ofile)