In [1]:
#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import MinMaxScaler

#feature selection
from sklearn.feature_selection import mutual_info_classif

#classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# performance metrics
from sklearn.metrics import f1_score,precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score
# remember confusion matrices lets import that and execute it later

In [4]:
# read datasets
df = pd.read_csv("/kaggle/input/dataset/input_data1.csv", sep=',', index_col=[0])
labels = pd.read_csv("/kaggle/input/dataset/labels.csv", sep=',', index_col=[0])

In [5]:
labels.shape

(9285, 1)

In [6]:
df.shape

(984210, 106)

In [9]:
data = df.head(9285)

In [10]:
data.shape

(9285, 106)

In [11]:
# Place the DataFrames side by side
data1 = pd.concat([data, labels], axis=1)

In [12]:
data1.shape

(9285, 107)

In [15]:
data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,97,98,99,100,101,102,103,104,105,labels
0,0.019117,0.016245,0.028272,0.006216,0.018647,0.029473,0.0,0.013027,0.011765,0.050682,...,0.026166,0.0,0.020727,0.0,0.037617,0.031778,0.003922,0.044494,0.04072,ACC
1,0.003922,0.035371,0.034437,0.043107,0.034786,0.016659,0.019243,0.007843,0.003922,0.006216,...,0.031031,0.0,0.039287,0.0,0.0,0.026111,0.025135,0.039395,0.003922,ACC
2,0.033487,0.003922,0.003922,0.0,0.0,0.021279,0.0,0.0,0.037683,0.041041,...,0.006216,0.030088,0.0,0.0,0.059308,0.029596,0.028234,0.035647,0.0,ACC
3,0.0,0.0,0.03096,0.019608,0.044759,0.023164,0.013566,0.043396,0.028678,0.047617,...,0.0,0.0,0.032019,0.034462,0.0,0.029745,0.031054,0.006216,0.0,ACC
4,0.0,0.003922,0.003922,0.025644,0.003922,0.0,0.014059,0.015321,0.028713,0.040769,...,0.066211,0.003922,0.0,0.044695,0.033798,0.020115,0.0,0.033238,0.038278,ACC


In [16]:
print(data1['labels'].value_counts())

BRCA    1095
KIRC     533
HNSC     520
LGG      516
LUAD     515
THCA     505
LUSC     502
PRAD     497
STAD     415
BLCA     408
LIHC     371
CESC     304
OV       304
KIRP     290
COAD     285
SARC     259
ESCA     184
PCPG     179
PAAD     178
UCEC     176
LAML     173
GBM      156
TGCT     150
THYM     120
SKCM     103
READ      94
MESO      87
UVM       80
ACC       79
KICH      66
UCS       57
DLBC      48
CHOL      36
Name: labels, dtype: int64


In [20]:
X=data1.iloc[:,0:-1]
y=data1.iloc[:,-1]

In [22]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,0.019117,0.016245,0.028272,0.006216,0.018647,0.029473,0.0,0.013027,0.011765,0.050682,...,0.019951,0.026166,0.0,0.020727,0.0,0.037617,0.031778,0.003922,0.044494,0.04072
1,0.003922,0.035371,0.034437,0.043107,0.034786,0.016659,0.019243,0.007843,0.003922,0.006216,...,0.046022,0.031031,0.0,0.039287,0.0,0.0,0.026111,0.025135,0.039395,0.003922
2,0.033487,0.003922,0.003922,0.0,0.0,0.021279,0.0,0.0,0.037683,0.041041,...,0.023258,0.006216,0.030088,0.0,0.0,0.059308,0.029596,0.028234,0.035647,0.0
3,0.0,0.0,0.03096,0.019608,0.044759,0.023164,0.013566,0.043396,0.028678,0.047617,...,0.013027,0.0,0.0,0.032019,0.034462,0.0,0.029745,0.031054,0.006216,0.0
4,0.0,0.003922,0.003922,0.025644,0.003922,0.0,0.014059,0.015321,0.028713,0.040769,...,0.043624,0.066211,0.003922,0.0,0.044695,0.033798,0.020115,0.0,0.033238,0.038278


In [23]:
#let's encode target labels (y) with values between 0 and n_classes-1.
#encoding will be done using the LabelEncoder
label_encoder=LabelEncoder()
label_encoder.fit(y)
y_encoded=label_encoder.transform(y)
labels=label_encoder.classes_
classes=np.unique(y_encoded)

In [24]:
labels

array(['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA',
       'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC',
       'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ',
       'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS',
       'UVM'], dtype=object)

In [25]:
classes

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])

In [26]:
#split data into training and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y_encoded,test_size=0.2,random_state=42)

In [28]:
 #scale data between 0 and 1
min_max_scaler=MinMaxScaler()
X_train_norm=min_max_scaler.fit_transform(X_train)
X_test_norm=min_max_scaler.fit_transform(X_test)

In [29]:
MI=mutual_info_classif(X_train_norm,y_train)

In [30]:
#select top n features. lets say 300.
#you can modify the value and see how the performance of the model changes

n_features=300
selected_scores_indices=np.argsort(MI)[::-1][0:n_features]

In [31]:
X_train_selected=X_train_norm[:,selected_scores_indices]
X_test_selected=X_test_norm[:,selected_scores_indices]

In [32]:
X_train_selected.shape

(7428, 106)

In [33]:
X_test_selected.shape

(1857, 106)

In [34]:
#Random Forest Classifier
#because we are dealing with multiclass data, the one versus rest strategy is used.
#learn to predict each class against the other.

RF=OneVsRestClassifier(RandomForestClassifier(max_features=0.2))
RF.fit(X_train_selected,y_train)
y_pred =RF.predict(X_test_selected)
pred_prob = RF.predict_proba(X_test_selected)

In [35]:
#precision
precision=np.round(precision_score(y_test,y_pred,average = 'weighted'),4)
print('precision:%0.4f'%precision)

#recall
recall=np.round(recall_score(y_test,y_pred,average = 'weighted'),4)
print('recall:%0.4f'%recall)

#f1score
f1score=np.round(f1_score(y_test,y_pred,average = 'weighted'),4)
print('f1score:%0.4f'%f1score)


report=classification_report(y_test,y_pred, target_names=labels)
print('\n')
print('classification report\n\n')
print(report)

precision:0.0535
recall:0.0845
f1score:0.0587


classification report


              precision    recall  f1-score   support

         ACC       0.00      0.00      0.00        18
        BLCA       0.04      0.04      0.04        82
        BRCA       0.14      0.38      0.21       220
        CESC       0.04      0.02      0.02        62
        CHOL       0.00      0.00      0.00         5
        COAD       0.00      0.00      0.00        54
        DLBC       0.00      0.00      0.00        10
        ESCA       0.00      0.00      0.00        30
         GBM       0.00      0.00      0.00        30
        HNSC       0.05      0.03      0.03       120
        KICH       0.00      0.00      0.00        15
        KIRC       0.06      0.04      0.05       109
        KIRP       0.00      0.00      0.00        64
        LAML       0.00      0.00      0.00        29
         LGG       0.04      0.03      0.04        97
        LIHC       0.00      0.00      0.00        83
        L

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
from sklearn.metrics import confusion_matrix
cm_df=confusion_matrix(y_test, y_pred)

In [37]:
cm_df

array([[ 0,  1,  3, ...,  0,  0,  0],
       [ 0,  3, 16, ...,  0,  0,  0],
       [ 0, 11, 84, ...,  2,  1,  0],
       ...,
       [ 0,  0,  2, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  1,  1,  0]])

In [41]:
import pickle
pickle.dump(RF, open('model.pkl', 'wb'))