In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import seaborn as sns
import pandas as pd

In [5]:
# Loading the dataset
glioma_df = pd.read_csv("./TCGA_InfoWithGrade.csv")

In [6]:
glioma_df.nunique()

Unnamed: 0           839
Gender                 2
Age_at_diagnosis     766
IDH1                   2
TP53                   2
ATRX                   2
PTEN                   2
EGFR                   2
CIC                    2
MUC16                  2
PIK3CA                 2
NF1                    2
PIK3R1                 2
FUBP1                  2
RB1                    2
NOTCH1                 2
BCOR                   2
CSMD3                  2
SMARCA4                2
GRIN2A                 2
IDH2                   2
FAT4                   2
PDGFRA                 2
Primary Diagnosis      6
Grade                  2
dtype: int64

In [7]:
glioma_df = glioma_df.drop("Unnamed: 0", axis=1)
glioma_df.head()

Unnamed: 0,Gender,Age_at_diagnosis,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,PIK3CA,...,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA,Primary Diagnosis,Grade
0,0,51.3,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,4,0
1,0,38.72,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,3,0
2,0,35.17,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32.78,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,0,31.51,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# Using k = 10 as recommended by dataset authors
kfold = KFold(n_splits=10, shuffle=True)

In [9]:
feature_df = glioma_df[glioma_df.columns[:-2]]
multi_class_df = glioma_df["Primary Diagnosis"]
binary_class_df = glioma_df["Grade"]

In [10]:
feature_df.head()

Unnamed: 0,Gender,Age_at_diagnosis,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,PIK3CA,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,51.3,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,0,38.72,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,35.17,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32.78,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,31.51,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
multi_class_df.shape

(839,)

In [12]:
binary_class_df.shape

(839,)

In [13]:
kfold.get_n_splits(feature_df)

10

In [11]:
print(kfold)

KFold(n_splits=10, random_state=None, shuffle=True)


In [None]:
print([(i,j) for i,j in kfold.split(feature_df)])

In [14]:
#initializing Multi Layer Perceptron model with parameters
glioma_classifier_MLP = MLPClassifier(hidden_layer_sizes=(44, 44, 44,44), activation="relu", solver="adam",learning_rate="invscaling", verbose=False, max_iter=500)
glioma_classifier_RF = RandomForestClassifier(n_estimators=2000, criterion="gini", verbose=True)
glioma_classifier_ada = AdaBoostClassifier(n_estimators=15)

In [15]:
MLP_scores = []
RF_scores = []
ADA_scores = []
Fold_No = 1
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    
    # Multi-Layered-Perceptron
    glioma_classifier_MLP.fit(feature_train, label_train)
    predictions = glioma_classifier_MLP.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))
    
    # Random Forest
    glioma_classifier_RF.fit(feature_train, label_train)
    predictions = glioma_classifier_RF.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    RF_scores.append((accuracy_pct, other_metrics))

    # Ada Boost
    glioma_classifier_ada.fit(feature_train, label_train)
    predictions = glioma_classifier_ada.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    ADA_scores.append((accuracy_pct, other_metrics))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    2.5s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    3.9s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]

In [20]:
print(MLP_scores[1])
print(f"Average Accuracy Of MLP: {(sum([i[0] for i in MLP_scores])/ len(MLP_scores))*100}%")
print(f"Precision Of MLP: {(sum([i[1][0] for i in MLP_scores])/ len(MLP_scores))*100}%")
print(f"Recall Of MLP: {(sum([i[1][1] for i in MLP_scores])/ len(MLP_scores))*100}%")
print(f"F1_score Of MLP: {(sum([i[1][2] for i in MLP_scores])/ len(MLP_scores))*100}%")
print("\n\n")
print(f"Average Accuracy Of RF: {(sum([i[0] for i in RF_scores])/ len(MLP_scores))*100}%")
print(f"Precision Of RF: {(sum([i[1][0] for i in RF_scores])/ len(MLP_scores))*100}%")
print(f"Recall Of RF: {(sum([i[1][1] for i in RF_scores])/ len(MLP_scores))*100}%")
print(f"F1_score Of RF: {(sum([i[1][2] for i in RF_scores])/ len(MLP_scores))*100}%")
print("\n\n")
print(f"Average Accuracy Of ADA: {(sum([i[0] for i in ADA_scores])/ len(MLP_scores))*100}%")
print(f"Precision Of ADA: {(sum([i[1][0] for i in ADA_scores])/ len(MLP_scores))*100}%")
print(f"Recall Of ADA: {(sum([i[1][1] for i in ADA_scores])/ len(MLP_scores))*100}%")
print(f"F1_score Of ADA: {(sum([i[1][2] for i in ADA_scores])/ len(MLP_scores))*100}%")

#print(f"Average Accuracy Of Random Forest: {(sum(i[0] for i in RF_scores)/ len(RF_scores))*100}%")
#print(f"Average Accuracy Of AdaBoost: {(sum([i[0] for i in ADA_scores])/ len(ADA_scores))*100}%")


(0.5952380952380952, (np.float64(0.5952380952380952), np.float64(0.5952380952380952), np.float64(0.5952380952380952), None))
Average Accuracy Of MLP: 55.071715433161216%
Precision Of MLP: 55.07171543316123%
Recall Of MLP: 55.07171543316123%
F1_score Of MLP: 55.07171543316123%



Average Accuracy Of RF: 48.99741824440619%
Precision Of RF: 48.99741824440619%
Recall Of RF: 48.99741824440619%
F1_score Of RF: 48.99741824440619%



Average Accuracy Of ADA: 53.641709695926565%
Precision Of ADA: 53.641709695926565%
Recall Of ADA: 53.641709695926565%
F1_score Of ADA: 53.641709695926565%


## Using KNN and SVM

In [14]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [15]:
glioma_classifier_SVM = SVC(C=0.60, kernel="poly",degree=5, verbose=True)
glioma_classifier_KNN = KNeighborsClassifier(n_neighbors=17, weights="distance")
glioma_classifier_KNN

In [16]:
SVM_scores, KNN_Scores = [],[]
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    # SVM
    glioma_classifier_SVM.fit(feature_train, label_train)
    predictions = glioma_classifier_SVM.predict(feature_test)
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    SVM_scores.append((accuracy_pct, other_metrics))
    print("starting KNN")
    # KNN
    glioma_classifier_KNN.fit(feature_train, label_train)
    predictions = glioma_classifier_KNN.predict(feature_test)
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    KNN_Scores.append((accuracy_pct, other_metrics))

[LibSVM]*....
*.
*..
*
optimization finished, #iter = 847
obj = -59.999334, rho = 1.000041
nSV = 115, nBSV = 89
*.
*.
*
optimization finished, #iter = 505
obj = -59.980055, rho = 0.998217
nSV = 108, nBSV = 97
.
*.
*
optimization finished, #iter = 366
obj = -59.999040, rho = 1.000354
nSV = 113, nBSV = 91
..
*.............
*
optimization finished, #iter = 2276
obj = -59.991485, rho = 0.999285
nSV = 104, nBSV = 93
*
optimization finished, #iter = 73
obj = -52.973565, rho = -1.038183
nSV = 90, nBSV = 88
..
**....
*.
*
optimization finished, #iter = 2718
obj = -137.816990, rho = 0.994557
nSV = 239, nBSV = 223
*
optimization finished, #iter = 150
obj = -132.654002, rho = 1.007309
nSV = 224, nBSV = 222
...
*........*
optimization finished, #iter = 2445
obj = -114.473893, rho = -1.000083
nSV = 196, nBSV = 191
.
*
optimization finished, #iter = 294
obj = -76.678012, rho = -1.011479
nSV = 131, nBSV = 128
*
optimization finished, #iter = 188
obj = -136.481041, rho = 1.001741
nSV = 232, nBSV = 229

In [17]:
print(f"Average Accuracy Of SVM: {(sum([i[0] for i in SVM_scores])/ len(SVM_scores))*100}%")
print(f"Average Accuracy Of KNN: {(sum(i[0] for i in KNN_Scores)/ len(KNN_Scores))*100}%")


Average Accuracy Of SVM: 41.95065978198508%
Average Accuracy Of KNN: 49.3459552495697%


# fine tuning MLP due to highest accuracy

In [5]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [6]:
param_grid = {
    'hidden_layer_sizes': [(24,24,24), (30, 30, 30), (40, 40, 40), (40, 40, 40, 40),(43,44, 45, 46)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.0005, 0.00095, 0.001, 0.0025, 0.005],
    'learning_rate_init': [0.001, 0.005,0.0075, 0.01, 0.025, 0.05],
    'batch_size': ['auto', 16, 32],
}
glioma_classifier_MLP_finetuned = MLPClassifier(max_iter=600, random_state=42)

In [7]:
gridsearch = GridSearchCV(estimator=glioma_classifier_MLP_finetuned, param_grid=param_grid, scoring="accuracy", cv=5, n_jobs=-1)

In [None]:
    feature_train, feature_test, label_train, label_test = train_test_split(feature_df, multi_class_df, shuffle=True, test_size=0.2)
    # Multi-Layered-Perceptron
    gridsearch.fit(feature_train, label_train)

""" 
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    glioma_classifier_MLP.fit()
    predictions = glioma_classifier_MLP.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))
"""

In [201]:
best_glioma_mlp = gridsearch.best_estimator_

In [202]:
MLP_scores = []
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    best_glioma_mlp.fit(feature_train, label_train)
    predictions = best_glioma_mlp.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))



In [203]:
print(f"Average Accuracy: {(sum(i[0] for i in MLP_scores)/ len(MLP_scores))*100}%")

Average Accuracy: 55.05880665519219%


In [8]:
#Saving model

In [205]:
import pickle
with open('model.pkl','wb') as f:
    pickle.dump(best_glioma_mlp,f)

In [18]:
# testing Binary classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [19]:
glioma_grade_Logistic = LogisticRegression(max_iter=500)
glioma_grade_SVM = SVC()

In [55]:
LR_scores = []
SVM_scores = []
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = binary_class_df.iloc[train_indices], binary_class_df.iloc[test_indices]
    glioma_grade_Logistic.fit(feature_train, label_train)
    predictions = glioma_grade_Logistic.predict(feature_test)
    other_scores = precision_recall_fscore_support(label_test, predictions)
    print(other_scores)
    LR_scores.append((accuracy_score(label_test, predictions)*100, other_scores))
    glioma_grade_SVM.fit(feature_train, label_train)
    predictions = glioma_grade_SVM.predict(feature_test)
    other_score = precision_recall_fscore_support(label_test, predictions)
    SVM_scores.append((accuracy_score(label_test, predictions)*100, other_score))



(array([0.9047619 , 0.88095238]), array([0.88372093, 0.90243902]), array([0.89411765, 0.89156627]), array([43, 41]))
(array([0.92857143, 0.83333333]), array([0.84782609, 0.92105263]), array([0.88636364, 0.875     ]), array([46, 38]))
(array([0.97560976, 0.8372093 ]), array([0.85106383, 0.97297297]), array([0.90909091, 0.9       ]), array([47, 37]))
(array([0.925     , 0.70454545]), array([0.74      , 0.91176471]), array([0.82222222, 0.79487179]), array([50, 34]))
(array([1.        , 0.72916667]), array([0.73469388, 1.        ]), array([0.84705882, 0.84337349]), array([49, 35]))
(array([0.96      , 0.88235294]), array([0.92307692, 0.9375    ]), array([0.94117647, 0.90909091]), array([52, 32]))
(array([0.93617021, 0.78378378]), array([0.84615385, 0.90625   ]), array([0.88888889, 0.84057971]), array([52, 32]))
(array([0.83333333, 0.88095238]), array([0.875     , 0.84090909]), array([0.85365854, 0.86046512]), array([40, 44]))
(array([0.88636364, 0.75      ]), array([0.79591837, 0.85714286]

In [53]:
import statistics

# Calculate average accuracy
print(f"Average Accuracy Of LR: {statistics.mean([i[0] for i in LR_scores])} %")

# Calculate average precision
precision_scores = [i[1][0].mean() for i in LR_scores]  # Calculate mean precision for each fold
print(f"Precision Of LR: {statistics.mean(precision_scores) * 100} %")

# Calculate average recall
recall_scores = [i[1][1].mean() for i in LR_scores]  # Calculate mean recall for each fold
print(f"Recall Of LR: {statistics.mean(recall_scores) * 100} %")

# Calculate average F1-score
f1_scores = [i[1][2].mean() for i in LR_scores]  # Calculate mean F1-score for each fold
print(f"F1_score Of LR: {statistics.mean(f1_scores) * 100} %")


Average Accuracy Of LR: 87.24756167527252 %
Precision Of LR: 86.74875990936081 %
Recall Of LR: 87.58556171279714 %
F1_score Of LR: 86.87068452712869 %


In [56]:
import statistics

# Calculate average accuracy
print(f"Average Accuracy Of SVM: {statistics.mean([i[0] for i in SVM_scores])} %")

# Calculate average precision
precision_scores = [i[1][0].mean() for i in SVM_scores]  # Calculate mean precision for each fold
print(f"Precision Of SVM: {statistics.mean(precision_scores) * 100} %")

# Calculate average recall
recall_scores = [i[1][1].mean() for i in SVM_scores]  # Calculate mean recall for each fold
print(f"Recall Of SVM: {statistics.mean(recall_scores) * 100} %")

# Calculate average F1-score
f1_scores = [i[1][2].mean() for i in SVM_scores]  # Calculate mean F1-score for each fold
print(f"F1_score Of SVM: {statistics.mean(f1_scores) * 100} %")


Average Accuracy Of SVM: 74.14371772805508 %
Precision Of SVM: 73.98331785362224 %
Recall Of SVM: 72.99328736754177 %
F1_score Of SVM: 72.88176731514565 %


In [30]:
import pickle


In [31]:
with open('binary_classifier.pkl','wb') as f:
    pickle.dump(glioma_grade_Logistic,f)

In [32]:
input_data = pd.DataFrame([[
    0,         # Gender: 0 (Male)
    51.30,     # Age_at_diagnosis
    1,         # IDH1
    0,         # TP53
    0,         # ATRX
    0,         # PTEN
    0,         # EGFR
    0,         # CIC
    0,         # MUC16
    1,         # PIK3CA
    0,         # NF1
    0,         # PIK3R1
    1,         # FUBP1
    0,         # RB1
    0,         # NOTCH1
    0,         # BCOR
    0,         # CSMD3
    0,         # SMARCA4
    0,         # GRIN2A
    0,         # IDH2
    0,         # FAT4
    0          # PDGFRA
]], columns=[
    "Gender", "Age_at_diagnosis", "IDH1", "TP53", "ATRX", "PTEN", "EGFR", 
    "CIC", "MUC16", "PIK3CA", "NF1", "PIK3R1", "FUBP1", "RB1", "NOTCH1", 
    "BCOR", "CSMD3", "SMARCA4", "GRIN2A", "IDH2", "FAT4", "PDGFRA"
])


In [35]:
print(glioma_grade_Logistic.predict_proba(input_data))

[[0.94724338 0.05275662]]


In [34]:
print("Classes in the logistic regression model:", glioma_grade_Logistic.classes_)

Classes in the logistic regression model: [0 1]
