In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import seaborn as sns
import pandas as pd

In [4]:
# Loading the dataset
glioma_df = pd.read_csv("./TCGA_InfoWithGrade.csv")

In [5]:
glioma_df.nunique()

Unnamed: 0           839
Gender                 2
Age_at_diagnosis     766
IDH1                   2
TP53                   2
ATRX                   2
PTEN                   2
EGFR                   2
CIC                    2
MUC16                  2
PIK3CA                 2
NF1                    2
PIK3R1                 2
FUBP1                  2
RB1                    2
NOTCH1                 2
BCOR                   2
CSMD3                  2
SMARCA4                2
GRIN2A                 2
IDH2                   2
FAT4                   2
PDGFRA                 2
Primary Diagnosis      6
Grade                  2
dtype: int64

In [6]:
glioma_df = glioma_df.drop("Unnamed: 0", axis=1)
glioma_df.head()

Unnamed: 0,Gender,Age_at_diagnosis,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,PIK3CA,NF1,PIK3R1,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA,Primary Diagnosis,Grade
0,0,51.3,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,4,0
1,0,38.72,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0
2,0,35.17,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,32.78,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0
4,0,31.51,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [7]:
# Using k = 10 as recommended by dataset authors
kfold = KFold(n_splits=10, shuffle=True)

In [8]:
feature_df = glioma_df[glioma_df.columns[:-2]]
multi_class_df = glioma_df["Primary Diagnosis"]
binary_class_df = glioma_df["Grade"]

In [9]:
feature_df.head()

Unnamed: 0,Gender,Age_at_diagnosis,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,PIK3CA,NF1,PIK3R1,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,51.3,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,38.72,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,35.17,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,32.78,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0
4,0,31.51,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [175]:
multi_class_df.shape

(839,)

In [176]:
binary_class_df.shape

(839,)

In [177]:
kfold.get_n_splits(feature_df)

10

In [178]:
print(kfold)

KFold(n_splits=10, random_state=None, shuffle=True)


In [None]:
print([(i,j) for i,j in kfold.split(feature_df)])

In [180]:
#initializing Multi Layer Perceptron model with parameters
glioma_classifier_MLP = MLPClassifier(hidden_layer_sizes=(44, 44, 44,44), activation="relu", solver="adam",learning_rate="invscaling", verbose=True, max_iter=500)
glioma_classifier_RF = RandomForestClassifier(n_estimators=2000, criterion="gini", verbose=True)
glioma_classifier_ada = AdaBoostClassifier(n_estimators=15)

In [None]:
MLP_scores = []
RF_scores = []
ADA_scores = []
Fold_No = 1
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    
    # Multi-Layered-Perceptron
    glioma_classifier_MLP.fit(feature_train, label_train)
    predictions = glioma_classifier_MLP.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))
    
    # Random Forest
    glioma_classifier_RF.fit(feature_train, label_train)
    predictions = glioma_classifier_RF.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    RF_scores.append((accuracy_pct, other_metrics))

    # Ada Boost
    glioma_classifier_ada.fit(feature_train, label_train)
    predictions = glioma_classifier_ada.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    ADA_scores.append((accuracy_pct, other_metrics))

In [183]:
print(f"Average Accuracy Of MLP: {(sum([i[0] for i in MLP_scores])/ len(MLP_scores))*100}%")
print(f"Average Accuracy Of Random Forest: {(sum(i[0] for i in RF_scores)/ len(RF_scores))*100}%")
print(f"Average Accuracy Of AdaBoost: {(sum([i[0] for i in ADA_scores])/ len(ADA_scores))*100}%")


Average Accuracy Of MLP: 56.50889271371199%
Average Accuracy Of Random Forest: 48.75645438898451%
Average Accuracy Of AdaBoost: 54.84366035570856%


## Using KNN and SVM

In [184]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [185]:
glioma_classifier_SVM = SVC(C=0.70, kernel="poly",degree=3, verbose=True)
glioma_classifier_KNN = KNeighborsClassifier(n_neighbors=18, weights="distance")
glioma_classifier_KNN

In [None]:
SVM_scores, KNN_Scores = [],[]
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    # SVM
    glioma_classifier_SVM.fit(feature_train, label_train)
    predictions = glioma_classifier_SVM.predict(feature_test)
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    SVM_scores.append((accuracy_pct, other_metrics))
    print("starting KNN")
    # KNN
    glioma_classifier_KNN.fit(feature_train, label_train)
    predictions = glioma_classifier_KNN.predict(feature_test)
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    KNN_Scores.append((accuracy_pct, other_metrics))

In [187]:
print(f"Average Accuracy Of SVM: {(sum([i[0] for i in SVM_scores])/ len(SVM_scores))*100}%")
print(f"Average Accuracy Of KNN: {(sum(i[0] for i in KNN_Scores)/ len(KNN_Scores))*100}%")


Average Accuracy Of SVM: 47.307802639127935%
Average Accuracy Of KNN: 47.90877796901893%


# fine tuning MLP due to highest accuracy

In [188]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [198]:
param_grid = {
    'hidden_layer_sizes': [(24,24,24), (30, 30, 30), (40, 40, 40), (40, 40, 40, 40),(43,44, 45, 46)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.0005, 0.00095, 0.001, 0.0025, 0.005],
    'learning_rate_init': [0.001, 0.005,0.0075, 0.01, 0.025, 0.05],
    'batch_size': ['auto', 16, 32],
}
glioma_classifier_MLP_finetuned = MLPClassifier(max_iter=600, random_state=42)

In [199]:
gridsearch = GridSearchCV(estimator=glioma_classifier_MLP_finetuned, param_grid=param_grid, scoring="accuracy", cv=5, n_jobs=-1)

In [None]:
    feature_train, feature_test, label_train, label_test = train_test_split(feature_df, multi_class_df, shuffle=True, test_size=0.2)
    # Multi-Layered-Perceptron
    gridsearch.fit(feature_train, label_train)

""" 
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    glioma_classifier_MLP.fit()
    predictions = glioma_classifier_MLP.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))
"""

In [201]:
best_glioma_mlp = gridsearch.best_estimator_

In [202]:
MLP_scores = []
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = multi_class_df.iloc[train_indices], multi_class_df.iloc[test_indices]
    best_glioma_mlp.fit(feature_train, label_train)
    predictions = best_glioma_mlp.predict(feature_test) 
    accuracy_pct = accuracy_score(label_test, predictions)
    other_metrics = precision_recall_fscore_support(label_test, predictions, average="micro")
    MLP_scores.append((accuracy_pct, other_metrics))



In [203]:
print(f"Average Accuracy: {(sum(i[0] for i in MLP_scores)/ len(MLP_scores))*100}%")

Average Accuracy: 55.05880665519219%


In [204]:
#Saving model

In [205]:
import pickle
with open('model.pkl','wb') as f:
    pickle.dump(best_glioma_mlp,f)

In [21]:
# testing Binary classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [22]:
glioma_grade_Logistic = LogisticRegression(max_iter=500)
glioma_grade_SVM = SVC()

In [25]:
LR_scores = []
SVM_scores = []
for train_indices, test_indices in kfold.split(feature_df):
    feature_train, feature_test = feature_df.iloc[train_indices], feature_df.iloc[test_indices]
    label_train, label_test = binary_class_df.iloc[train_indices], binary_class_df.iloc[test_indices]
    glioma_grade_Logistic.fit(feature_train, label_train)
    predictions = glioma_grade_Logistic.predict(feature_test)
    LR_scores.append(accuracy_score(label_test, predictions)*100)
    
    glioma_grade_SVM.fit(feature_train, label_train)
    predictions = glioma_grade_SVM.predict(feature_test)
    SVM_scores.append(accuracy_score(label_test, predictions)*100)



In [26]:
print(f"Average Accuracy Of LR:{sum(LR_scores)/len(LR_scores)}%")
print(f"Maximum Accuracy Of LR: {max(LR_scores)}%")

Average Accuracy Of LR:86.89041881812966%
Maximum Accuracy Of LR: 91.66666666666666%


In [27]:
print(f"Average Accuracy Of SVM:{sum(SVM_scores)/len(LR_scores)}%")
print(f"Maximum Accuracy Of SVM: {max(SVM_scores)}%")

Average Accuracy Of SVM:74.49225473321859%
Maximum Accuracy Of SVM: 83.33333333333334%


In [28]:
import pickle


In [29]:
with open('binary_classifier.pkl','wb') as f:
    pickle.dump(glioma_grade_Logistic,f)

In [30]:
input_data = pd.DataFrame([[
    0,         # Gender: 0 (Male)
    51.30,     # Age_at_diagnosis
    1,         # IDH1
    0,         # TP53
    0,         # ATRX
    0,         # PTEN
    0,         # EGFR
    0,         # CIC
    0,         # MUC16
    1,         # PIK3CA
    0,         # NF1
    0,         # PIK3R1
    1,         # FUBP1
    0,         # RB1
    0,         # NOTCH1
    0,         # BCOR
    0,         # CSMD3
    0,         # SMARCA4
    0,         # GRIN2A
    0,         # IDH2
    0,         # FAT4
    0          # PDGFRA
]], columns=[
    "Gender", "Age_at_diagnosis", "IDH1", "TP53", "ATRX", "PTEN", "EGFR", 
    "CIC", "MUC16", "PIK3CA", "NF1", "PIK3R1", "FUBP1", "RB1", "NOTCH1", 
    "BCOR", "CSMD3", "SMARCA4", "GRIN2A", "IDH2", "FAT4", "PDGFRA"
])


In [32]:
print(glioma_grade_Logistic.predict(input_data))

[0]


In [34]:
print("Classes in the logistic regression model:", glioma_grade_Logistic.classes_)

Classes in the logistic regression model: [0 1]
