In [1]:
import os
import pandas as pd
import numpy as np
from combat.pycombat import pycombat
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

In [2]:
os.chdir("../../Dataset/Merged")

In [3]:
dataset = pd.read_csv('MergedDataset.csv', index_col=0)

In [4]:
sampleID = dataset['SampleID']
datasetID = dataset['SampleID'].apply(lambda x: x.split('-')[0]).values
indicator = dataset['Label']
dataset = dataset.drop(columns=['SampleID', 'Label'])

dataset = pycombat(dataset.transpose(), datasetID).transpose()
dataset.insert(0, 'SampleID', sampleID)
dataset.insert(1, 'Label', indicator)

Found 7 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


In [5]:
def getPatientID(sampleID):
    return sampleID.split('-')[0] + '-' + sampleID.split('-')[1].split('_', 1)[1]

dataset.insert(1, 'PatientID', dataset['SampleID'].apply(getPatientID))

print(dataset)

gruppi = dataset.groupby('PatientID')

def sanity_check(gruppi):
    for group_name, group_data in gruppi:
        if 'Control' in group_data['SampleID'].iloc[0]:
            for e in group_data['SampleID']:
                if not 'Control' in e:
                    print("Errore in gruppo:", group_name)
                    break
        else:
            for e in group_data['SampleID']:
                if 'Control' in e:
                    print("Errore in gruppo:", group_name)
                    break

sanity_check(gruppi)

                                         SampleID      PatientID  Label  \
0                        0-GSM1026056_600009.0001  0-600009.0001      1   
1             0-GSM1026057_600009.0001-FollowUp_1  0-600009.0001      1   
2                         0-GSM1026058_41461.0001   0-41461.0001      1   
3                         0-GSM1026059_41462.0001   0-41462.0001      1   
4                        0-GSM1026060_600029.0001  0-600029.0001      1   
...                                           ...            ...    ...   
2108  6-GSM2347715_NT142_W18D2-Control-FollowUp_1  6-NT142_W18D2      0   
2109  6-GSM2347717_NT041_W18D2-Control-FollowUp_2  6-NT041_W18D2      0   
2110  6-GSM2347719_NT142_W18D2-Control-FollowUp_2  6-NT142_W18D2      0   
2111  6-GSM2347721_NT041_W18D2-Control-FollowUp_3  6-NT041_W18D2      0   
2112  6-GSM2347723_NT142_W18D2-Control-FollowUp_3  6-NT142_W18D2      0   

       SEC14L1     YIPF5    SLC1A5        C2      NOL6      TPM3    PSMD11  \
0     8.196784  7.331

In [6]:
#n_splits number of re-shuffling & splitting iterations.
#test_size represents the proportion of the dataset to include in the test split.
#random_state is the seed used by the random number generator.
splitter = GroupShuffleSplit(n_splits=2, test_size=0.25, random_state = 42)
split = splitter.split(dataset, groups=dataset['PatientID'])
train_inds, test_inds = next(split)

train = dataset.iloc[train_inds].sample(frac=1, random_state=42)
test = dataset.iloc[test_inds].sample(frac=1, random_state=42)

print("Dataset di train:")
print(train.shape)
print("I malati sono: ", sum(train['Label'] == 1))
print("I sani sono: ", sum(train['Label'] == 0))

print("\nDataset di test:")
print(test.shape)
print("I malati sono: ", sum(test['Label'] == 1))
print("I sani sono: ", sum(test['Label'] == 0))

#malati train 45.12 perc
#malati test 42.49 perc
#malati sul totale 44.49 perc

y_train = train['Label']
x_train = train.drop(columns=['SampleID', 'Label', 'PatientID'])

y_test = test['Label']
x_test = test.drop(columns=['SampleID', 'Label', 'PatientID'])

Dataset di train:
(1593, 5588)
I malati sono:  695
I sani sono:  898

Dataset di test:
(520, 5588)
I malati sono:  245
I sani sono:  275


In [7]:
def trainModel(model, hyperparameters, x_train, y_train, printTrain, njobs): 
    pipeline = Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', model)])
    gridSearch = GridSearchCV(pipeline, param_grid=hyperparameters, cv=5, return_train_score=True, refit=True, n_jobs=njobs, verbose=10, error_score='raise') if printTrain == True else GridSearchCV(pipeline, param_grid=hyperparameters, cv=5, return_train_score=False, refit=True, n_jobs=njobs, verbose=10, error_score='raise')
    gridSearch.fit(x_train, y_train)
    print("Best model:", gridSearch.best_estimator_, gridSearch.best_params_)
    print("Best score:", np.max(gridSearch.cv_results_['mean_test_score']))
    print("All scores:", gridSearch.cv_results_['mean_test_score'])
    return gridSearch

In [None]:
svc = trainModel(SVC(), {
    'classifier__kernel': ['linear', 'poly', 'rbf'], 
    'classifier__probability': [True],
    'classifier__C': [2**-5, 2**-3, 2**-1, 2**0, 2**1, 2**3, 2**7, 2**9, 2**11, 2**13, 2**15],
    'classifier__gamma': [2**-15, 2**-13, 2**-11, 2**-9, 2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 'scale', 'auto']}, x_train, y_train, False, -1)

In [8]:
svc = joblib.load('../../Modelli/DatasetFull/svc.pkl')
print("Best model:", svc.best_estimator_, svc.best_params_)
print("Best score:", np.max(svc.cv_results_['mean_test_score']))
print("All scores:", svc.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier', SVC(C=128, gamma=0.0078125))]) {'classifier__C': 128, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf'}
Best score: 0.9447526665483725
All scores: [0.5813036  0.56371523 0.56371523 0.5813036  0.56371523 0.56371523
 0.5813036  0.56371523 0.56371523 0.5813036  0.56371523 0.56371523
 0.5813036  0.85562587 0.56371523 0.5813036  0.85939552 0.56371523
 0.5813036  0.85939552 0.56371523 0.5813036  0.85939552 0.56371523
 0.5813036  0.85939552 0.56371523 0.5813036  0.85939552 0.56371523
 0.5813036  0.85939552 0.56371523 0.5813036  0.86002248 0.56371523
 0.5813036  0.56371523 0.56371523 0.56180675 0.56371523 0.56371523
 0.56180675 0.56371523 0.56371523 0.56180675 0.56371523 0.56371523
 0.56180675 0.57753002 0.56371523 0.56180675 0.85939552 0.56371523
 0.56180675 0.85939552 0.78465724 0.56180675 0.85939552 0.56371523
 0.56180675 0.85939552 0.56371523 0.56180675 0.85939552 0.56371523
 0.56180675 0.859

In [None]:
randomForest = trainModel(RandomForestClassifier(), {
    'classifier__n_estimators': [i for i in range(150, 251, 25)],
    'classifier__max_depth': [i for i in range(4, 13)]}, x_train, y_train, True, -1)

In [9]:
randomForest = joblib.load('../../Modelli/DatasetFull/randomForest.pkl')
print("Best model:", randomForest.best_estimator_, randomForest.best_params_)
print("Best score:", np.max(randomForest.cv_results_['mean_test_score']))
print("All scores:", randomForest.cv_results_['mean_test_score'])

y_pred = randomForest.predict(x_test)
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=8, n_estimators=175))]) {'classifier__max_depth': 8, 'classifier__n_estimators': 175}
Best score: 0.9384791309319611
All scores: [0.91964669 0.9209085  0.91399815 0.92153546 0.92154334 0.92216833
 0.92153152 0.92592812 0.92404921 0.92153546 0.92843595 0.92780702
 0.92781491 0.92654522 0.92844384 0.92969579 0.92404921 0.93220757
 0.92906883 0.92781688 0.93282664 0.93847913 0.92968987 0.93157666
 0.93283847 0.93157864 0.93094773 0.92593009 0.9290708  0.9284399
 0.93157469 0.9347154  0.93408253 0.93597129 0.92906883 0.92906488
 0.93095365 0.92717809 0.93032275 0.93471146 0.92969382 0.93409042
 0.93220757 0.93220362 0.93408253]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.95       275
           1       0.91      0.98      0.94       245

    accuracy                           0.9

In [None]:
elasticNet = trainModel(LogisticRegression(), {
    'classifier__C': [2**-7, 2**-5, 2**-3, 2**-1, 2**0, 2**1, 2**3, 2**7, 2**9, 2**11],
    'classifier__penalty': ['elasticnet'],
    'classifier__l1_ratio': [0, 0.0001, 0.1, 0.25, 0.5, 0.75, 1],
    'classifier__solver': ['saga']}, x_train, y_train, True, -1)

In [10]:
elasticNet = joblib.load('../../Modelli/DatasetFull/elasticNet.pkl')
print("Best model:", elasticNet.best_estimator_, elasticNet.best_params_)
print("Best score:", np.max(elasticNet.cv_results_['mean_test_score']))
print("All scores:", elasticNet.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 LogisticRegression(C=0.5, l1_ratio=1, penalty='elasticnet',
                                    solver='saga'))]) {'classifier__C': 0.5, 'classifier__l1_ratio': 1, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
Best score: 0.7061907296780425
All scores: [0.55494568 0.5561996  0.56371523 0.56371523 0.56371523 0.56371523
 0.56371523 0.54801759 0.54801759 0.57501429 0.5655961  0.56371523
 0.56371523 0.56371523 0.56620532 0.56494943 0.62899194 0.68548136
 0.64027326 0.56997496 0.55616609 0.5611995  0.56245736 0.58442263
 0.62021056 0.6610122  0.68736224 0.70619073 0.56182843 0.56057452
 0.57061178 0.59007512 0.62083555 0.64281067 0.66352004 0.55931863
 0.56120147 0.56559413 0.57249463 0.58944816 0.60703259 0.62398218
 0.56120147 0.55994756 0.56371128 0.56496914 0.56747895 0.57187161
 0.57374855 0.56246131 0.55868969 0.55868969 0.56057649 0.56057254
 0.56120739 0.5624

In [None]:
knn = trainModel(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'classifier__weights': ['uniform', 'distance'] }, x_train, y_train, True, 3)

In [11]:
knn = joblib.load('../../Modelli/DatasetFull/knn.pkl')
print("Best model:", knn.best_estimator_, knn.best_params_)
print("Best score:", np.max(knn.cv_results_['mean_test_score']))
print("All scores:", knn.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=3))]) {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}
Best score: 0.8920328857869521
All scores: [0.89203289 0.89203289 0.87570237 0.87570237 0.84555904 0.84555904
 0.81730841 0.81730841 0.78278031 0.78340924 0.75578557 0.7601802
 0.73319138 0.73883796]


In [None]:
HistGradientBoosting = trainModel(HistGradientBoostingClassifier(), {
    'classifier__max_depth': [3, 5, 7, 9, 11],
    'classifier__max_iter': [100, 150, 200, 250],
    'classifier__learning_rate': [0.1, 0.01, 0.001]}, x_train, y_train, False, 4)

In [12]:
HistGradientBoosting = joblib.load('../../Modelli/DatasetFull/HistGradientBoostingWithFold.pkl')
print("Best model:", HistGradientBoosting.best_estimator_, HistGradientBoosting.best_params_)
print("Best score:", np.max(HistGradientBoosting.cv_results_['mean_test_score']))
print("All scores:", HistGradientBoosting.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 HistGradientBoostingClassifier(max_depth=9, max_iter=200))]) {'classifier__learning_rate': 0.1, 'classifier__max_depth': 9, 'classifier__max_iter': 200}
Best score: 0.9491433528518758
All scores: [0.94099091 0.94789338 0.9453816  0.94476055 0.94664143 0.94726839
 0.9453816  0.94537765 0.94789535 0.94537568 0.94600659 0.94914335
 0.90333984 0.91338105 0.92154137 0.91211924 0.92279135 0.93157469
 0.90396088 0.91964867 0.92968987 0.90458784 0.9202776  0.93094773
 0.56371523 0.66482325 0.83742237 0.56371523 0.82926007 0.84871552
 0.56371523 0.82110566 0.84997536 0.56371523 0.82110566 0.84997536]


In [None]:
GradientBoosting = trainModel(GradientBoostingClassifier(), {
    'classifier__max_depth': [3, 5, 7, 9, 11],
    'classifier__max_iter': [100, 150, 200, 250],
    'classifier__learning_rate': [0.1, 0.01, 0.001]}, x_train, y_train, False, 4)

In [23]:
GradientBoosting = joblib.load('../../Modelli/DatasetFull/GradientBoostingWithFold.pkl')
print("Best params: ", GradientBoosting)
print("Best score: ",GradientBoosting.score(x_train, y_train))

Best params:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Best score:  1.0


In [None]:
naiveBayes = Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GaussianNB())])
print(naiveBayes.fit(x_train, y_train).score(x_train, y_train))

In [15]:
naiveBayes = joblib.load('../../Modelli/DatasetFull/naiveBayes.pkl')
print(naiveBayes.score(x_train, y_train))

0.9152542372881356


Performance dei vari modelli

In [24]:
def prettyPrint(model, name):
    if isinstance(model, Pipeline):
        print(name + ":")
        print("Iperparametri: ", model)
        print("Training accuracy: ", model.score(x_train, y_train))
        print("Test accuracy: ", model.score(x_test, y_test), "\n")
        print(classification_report(y_test, model.predict(x_test)), "\n\n")
    else:
        print(name + ":")
        print("Iperparametri: ", model.best_params_)
        print("Training accuracy: ", model.best_score_)
        print("Test accuracy: ", model.score(x_test, y_test), "\n")
        print(classification_report(y_test, model.predict(x_test)), "\n\n")

for model, name in [(svc, "SVC"), (randomForest, "Random Forest"), (elasticNet, "Elastic Net"), (knn, "KNN"), (HistGradientBoosting, "Hist Gradient Boosting"), (GradientBoosting, "Gradient Boosting"), (naiveBayes, "Naive Bayes")]:
    prettyPrint(model, name)

SVC:
Iperparametri:  {'classifier__C': 128, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf'}
Training accuracy:  0.9447526665483725
Test accuracy:  0.95 

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       275
           1       0.94      0.96      0.95       245

    accuracy                           0.95       520
   macro avg       0.95      0.95      0.95       520
weighted avg       0.95      0.95      0.95       520
 


Random Forest:
Iperparametri:  {'classifier__max_depth': 8, 'classifier__n_estimators': 175}
Training accuracy:  0.9384791309319611
Test accuracy:  0.9442307692307692 

              precision    recall  f1-score   support

           0       0.98      0.91      0.95       275
           1       0.91      0.98      0.94       245

    accuracy                           0.94       520
   macro avg       0.94      0.95      0.94       520
weighted avg       0.95      0.94      0.94       520
 


Elas