In [1]:
import os
import pandas as pd
import numpy as np
from combat.pycombat import pycombat
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

In [2]:
os.chdir("../../Dataset/Merged")

In [3]:
dataset = pd.read_csv('MergedDatasetZeroes.csv', index_col=0)

In [4]:
sampleID = dataset['SampleID']
datasetID = dataset['SampleID'].apply(lambda x: x.split('-')[0]).values
indicator = dataset['Label']
dataset = dataset.drop(columns=['SampleID', 'Label'])

dataset = pycombat(dataset.transpose(), datasetID).transpose()

dataset.insert(0, 'SampleID', sampleID)
dataset.insert(1, 'Label', indicator)

Found 7 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.


  np.absolute(d_new-d_old)/d_old))  # maximum difference between new and old estimate


Adjusting the Data


In [5]:
def getPatientID(sampleID):
    return sampleID.split('-')[0] + '-' + sampleID.split('-')[1].split('_', 1)[1]

dataset.insert(1, 'PatientID', dataset['SampleID'].apply(getPatientID))

print(dataset)

gruppi = dataset.groupby('PatientID')

def sanity_check(gruppi):
    for group_name, group_data in gruppi:
        if 'Control' in group_data['SampleID'].iloc[0]:
            for e in group_data['SampleID']:
                if not 'Control' in e:
                    print("Errore in gruppo:", group_name)
                    break
        else:
            for e in group_data['SampleID']:
                if 'Control' in e:
                    print("Errore in gruppo:", group_name)
                    break

sanity_check(gruppi)

                                         SampleID      PatientID  Label  \
0                        0-GSM1026056_600009.0001  0-600009.0001      1   
1             0-GSM1026057_600009.0001-FollowUp_1  0-600009.0001      1   
2                         0-GSM1026058_41461.0001   0-41461.0001      1   
3                         0-GSM1026059_41462.0001   0-41462.0001      1   
4                        0-GSM1026060_600029.0001  0-600029.0001      1   
...                                           ...            ...    ...   
2108  6-GSM2347715_NT142_W18D2-Control-FollowUp_1  6-NT142_W18D2      0   
2109  6-GSM2347717_NT041_W18D2-Control-FollowUp_2  6-NT041_W18D2      0   
2110  6-GSM2347719_NT142_W18D2-Control-FollowUp_2  6-NT142_W18D2      0   
2111  6-GSM2347721_NT041_W18D2-Control-FollowUp_3  6-NT041_W18D2      0   
2112  6-GSM2347723_NT142_W18D2-Control-FollowUp_3  6-NT142_W18D2      0   

        STEEP1   SEC14L1     YIPF5    SLC1A5        C2      NOL6     SPRR3  \
0     4.370838  8.721

In [6]:
#n_splits number of re-shuffling & splitting iterations.
#test_size represents the proportion of the dataset to include in the test split.
#random_state is the seed used by the random number generator.
splitter = GroupShuffleSplit(n_splits=2, test_size=0.25, random_state = 42)
split = splitter.split(dataset, groups=dataset['PatientID'])
train_inds, test_inds = next(split)

train = dataset.iloc[train_inds].sample(frac=1, random_state=42)
test = dataset.iloc[test_inds].sample(frac=1, random_state=42)

print("Dataset di train:")
print(train.shape)
print("I malati sono: ", sum(train['Label'] == 1))
print("I sani sono: ", sum(train['Label'] == 0))

print("\nDataset di test:")
print(test.shape)
print("I malati sono: ", sum(test['Label'] == 1))
print("I sani sono: ", sum(test['Label'] == 0))

#malati train 45.12 perc
#malati test 42.49 perc
#malati sul totale 44.49 perc

y_train = train['Label']
x_train = train.drop(columns=['SampleID', 'Label', 'PatientID'])

y_test = test['Label']
x_test = test.drop(columns=['SampleID', 'Label', 'PatientID'])

Dataset di train:
(1593, 12094)
I malati sono:  695
I sani sono:  898

Dataset di test:
(520, 12094)
I malati sono:  245
I sani sono:  275


In [8]:
def trainModel(model, hyperparameters, x_train, y_train, printTrain, njobs): 
    pipeline = Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', model)])
    gridSearch = GridSearchCV(pipeline, param_grid=hyperparameters, cv=5, return_train_score=True, refit=True, n_jobs=njobs, verbose=10, error_score='raise') if printTrain == True else GridSearchCV(pipeline, param_grid=hyperparameters, cv=5, return_train_score=False, refit=True, n_jobs=njobs, verbose=10, error_score='raise')
    gridSearch.fit(x_train, y_train)
    print("Best model:", gridSearch.best_estimator_, gridSearch.best_params_)
    print("Best score:", np.max(gridSearch.cv_results_['mean_test_score']))
    print("All scores:", gridSearch.cv_results_['mean_test_score'])
    return gridSearch

In [None]:
svc = trainModel(SVC(), {
    'classifier__kernel': ['linear', 'poly', 'rbf'], 
    'classifier__probability': [True],
    'classifier__C': [2**-5, 2**-3, 2**-1, 2**0, 2**1, 2**3, 2**7, 2**9, 2**11, 2**13, 2**15],
    'classifier__gamma': [2**-15, 2**-13, 2**-11, 2**-9, 2**-7, 2**-5, 2**-3, 2**-1, 2**1, 2**3, 2**5, 'scale', 'auto']}, x_train, y_train, False, -1)

In [7]:
svc = joblib.load('../../Modelli/DatasetZeroes/svc.pkl')
print("Best model:", svc.best_estimator_, svc.best_params_)
print("Best score:", np.max(svc.cv_results_['mean_test_score']))
print("All scores:", svc.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier', SVC(C=8, gamma=0.0078125, probability=True))]) {'classifier__C': 8, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf', 'classifier__probability': True}
Best score: 0.9278011080223182
All scores: [0.48842886 0.56371523 0.56371523 0.48842886 0.56371523 0.56371523
 0.48842886 0.56371523 0.56371523 0.48842886 0.62397035 0.56371523
 0.48842886 0.77840934 0.56371523 0.48842886 0.74450425 0.64468563
 0.48842886 0.74450425 0.64280081 0.48842886 0.74450425 0.56371523
 0.48842886 0.74450425 0.56371523 0.48842886 0.74450425 0.56371523
 0.48842886 0.74450425 0.56371523 0.48842886 0.81294139 0.56371523
 0.48842886 0.56371523 0.56371523 0.45385146 0.56371523 0.56371523
 0.45385146 0.56371523 0.56371523 0.45385146 0.56371523 0.56371523
 0.45385146 0.76209854 0.56371523 0.45385146 0.74889691 0.57689517
 0.45385146 0.74450425 0.64280673 0.45385146 0.74450425 0.64468563
 0.45385146 0.74450425 0.57688926 0.4

In [None]:
randomForest = trainModel(RandomForestClassifier(), {
    'classifier__n_estimators': [i for i in range(150, 251, 25)],
    'classifier__max_depth': [i for i in range(4, 13)]}, x_train, y_train, True, -1)

In [8]:
randomForest = joblib.load('../../Modelli/DatasetZeroes/randomForest.pkl')
print("Best model:", randomForest.best_estimator_, randomForest.best_params_)
print("Best score:", np.max(randomForest.cv_results_['mean_test_score']))
print("All scores:", randomForest.cv_results_['mean_test_score'])

y_pred = randomForest.predict(x_test)
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=12, n_estimators=225))]) {'classifier__max_depth': 12, 'classifier__n_estimators': 225}
Best score: 0.935338419983833
All scores: [0.92340845 0.92215059 0.9246604  0.92026774 0.92215059 0.92215059
 0.92277755 0.92277952 0.92152166 0.92026774 0.92403147 0.92277755
 0.92340648 0.92591629 0.92277755 0.92341239 0.92654325 0.92780111
 0.92654127 0.9246604  0.9284261  0.92905305 0.92716823 0.92968396
 0.92403344 0.92968001 0.92842807 0.92717415 0.92779716 0.93156878
 0.93031486 0.92968987 0.93156878 0.92843595 0.93407662 0.92843398
 0.93031092 0.93219771 0.93408253 0.93345557 0.93407464 0.929057
 0.93094773 0.93533842 0.93156878]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       275
           1       0.90      0.98      0.94       245

    accuracy                           0.9

In [None]:
elasticNet = trainModel(LogisticRegression(), {
    'classifier__C': [2**-7, 2**-5, 2**-3, 2**-1, 2**0, 2**1, 2**3, 2**7, 2**9, 2**11],
    'classifier__penalty': ['elasticnet'],
    'classifier__l1_ratio': [0, 0.0001, 0.1, 0.25, 0.5, 0.75, 1],
    'classifier__solver': ['saga']}, x_train, y_train, True, -1)

In [9]:
elasticNet = joblib.load('../../Modelli/DatasetZeroes/elasticNet.pkl')
print("Best model:", elasticNet.best_estimator_, elasticNet.best_params_)
print("Best score:", np.max(elasticNet.cv_results_['mean_test_score']))
print("All scores:", elasticNet.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 LogisticRegression(C=0.5, l1_ratio=1, penalty='elasticnet',
                                    solver='saga'))]) {'classifier__C': 0.5, 'classifier__l1_ratio': 1, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}
Best score: 0.6296267818063525
All scores: [0.51917943 0.52106031 0.56371523 0.56371523 0.56371523 0.56371523
 0.56371523 0.47145167 0.47019578 0.57438733 0.56371523 0.56371523
 0.56371523 0.56371523 0.45763096 0.45825595 0.54865243 0.6233493
 0.60137813 0.56497309 0.56371523 0.45888094 0.46013683 0.47395359
 0.51664005 0.56624081 0.60703259 0.62962678 0.45700006 0.45888094
 0.46578932 0.48149287 0.51789594 0.54677353 0.57000848 0.45825595
 0.45825595 0.4639045  0.46641825 0.48211983 0.49593068 0.51789594
 0.45637507 0.45825398 0.45888291 0.46076379 0.46453146 0.46767217
 0.46704521 0.45825595 0.45825792 0.45888291 0.45762702 0.45700203
 0.46139272 0.45888

In [None]:
knn = trainModel(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'classifier__weights': ['uniform', 'distance'] }, x_train, y_train, True, 3)

In [10]:
knn = joblib.load('../../Modelli/DatasetZeroes/knn.pkl')
print("Best model:", knn.best_estimator_, knn.best_params_)
print("Best score:", np.max(knn.cv_results_['mean_test_score']))
print("All scores:", knn.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=3))]) {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}
Best score: 0.6691863330770292
All scores: [0.66918633 0.66918633 0.63715818 0.63715818 0.60953648 0.60953648
 0.57564125 0.57564125 0.55995347 0.55995347 0.54235918 0.54298811
 0.52666351 0.52666351]


In [None]:
HistGradientBoosting = trainModel(HistGradientBoostingClassifier(), {
    'classifier__max_depth': [3, 5, 7, 9, 11],
    'classifier__max_iter': [100, 150, 200, 250],
    'classifier__learning_rate': [0.1, 0.01, 0.001]}, x_train, y_train, False, 4)

In [11]:
HistGradientBoosting = joblib.load('../../Modelli/DatasetZeroes/HistGradientBoosting.pkl')
print("Best model:", HistGradientBoosting.best_estimator_, HistGradientBoosting.best_params_)
print("Best score:", np.max(HistGradientBoosting.cv_results_['mean_test_score']))
print("All scores:", HistGradientBoosting.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 HistGradientBoostingClassifier(max_depth=5, max_iter=200))]) {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__max_iter': 200}
Best score: 0.9510301453047061
All scores: [0.93722324 0.94161787 0.94789535 0.9491473  0.94977426 0.94977031
 0.95103015 0.95040121 0.94474872 0.94788746 0.94788943 0.94851639
 0.94600264 0.94725853 0.94851639 0.94789141 0.9472605  0.9466296
 0.94788943 0.9491473  0.92340648 0.92152954 0.92654916 0.9296879
 0.9328365  0.93973502 0.94099288 0.94099485 0.93033063 0.93221348
 0.93911003 0.93911003 0.93158455 0.93158455 0.93785611 0.93848307
 0.93033063 0.93158455 0.93722718 0.93722718 0.56371523 0.87761085
 0.8926559  0.89453875 0.56371523 0.8637862  0.88448769 0.89389996
 0.56371523 0.86941898 0.8882593  0.89642357 0.56371523 0.87004594
 0.88763037 0.89704856 0.56371523 0.86941898 0.88763037 0.89704856]


In [None]:
GradientBoosting = trainModel(GradientBoostingClassifier(), {
    'classifier__max_depth': [3, 5, 7, 9, 11],
    'classifier__max_iter': [100, 150, 200, 250],
    'classifier__learning_rate': [0.1, 0.01, 0.001]}, x_train, y_train, False, 4)

In [12]:
GradientBoosting = joblib.load('../../Modelli/DatasetZeroes/GradientBoosting.pkl')
print("Best model:", GradientBoosting.best_estimator_, GradientBoosting.best_params_)
print("Best score:", np.max(GradientBoosting.cv_results_['mean_test_score']))
print("All scores:", GradientBoosting.cv_results_['mean_test_score'])

Best model: Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('classifier',
                 GradientBoostingClassifier(max_depth=5, n_estimators=150))]) {'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__n_estimators': 150}
Best score: 0.9485242798840717
All scores: [0.93595552 0.93784231 0.94160801 0.94162576 0.94852428 0.94727233
 0.9303188  0.93597721 0.93596144 0.92466434 0.91963684 0.92152363
 0.93408253 0.93220362 0.93471343 0.92026774 0.92654719 0.93031683
 0.56371523 0.88011672 0.89892549 0.56371523 0.87760494 0.90834566
 0.56371523 0.90521875 0.90772658]


In [None]:
naiveBayes = Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GaussianNB())])
print(naiveBayes.fit(x_train, y_train).score(x_train, y_train))

In [13]:
naiveBayes = joblib.load('../../Modelli/DatasetZeroes/naiveBayes.pkl')
print(naiveBayes.score(x_train, y_train))

0.9183929692404269


Performance dei vari modelli

In [14]:
def prettyPrint(model, name):
    if isinstance(model, Pipeline):
        print(name + ":")
        print("Iperparametri: ", model)
        print("Training accuracy: ", model.score(x_train, y_train))
        print("Test accuracy: ", model.score(x_test, y_test), "\n")
        print(classification_report(y_test, model.predict(x_test)), "\n\n")
    else:
        print(name + ":")
        print("Iperparametri: ", model.best_params_)
        print("Training accuracy: ", model.best_score_)
        print("Test accuracy: ", model.score(x_test, y_test), "\n")
        print(classification_report(y_test, model.predict(x_test)), "\n\n")

for model, name in [(svc, "SVC"), (randomForest, "Random Forest"), (elasticNet, "Elastic Net"), (knn, "KNN"), (HistGradientBoosting, "Hist Gradient Boosting"), (GradientBoosting, "Gradient Boosting"), (naiveBayes, "Naive Bayes")]:
    prettyPrint(model, name)

SVC:
Iperparametri:  {'classifier__C': 8, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf', 'classifier__probability': True}
Training accuracy:  0.9278011080223182
Test accuracy:  0.9461538461538461 

              precision    recall  f1-score   support

           0       0.97      0.92      0.95       275
           1       0.92      0.97      0.94       245

    accuracy                           0.95       520
   macro avg       0.95      0.95      0.95       520
weighted avg       0.95      0.95      0.95       520
 


Random Forest:
Iperparametri:  {'classifier__max_depth': 12, 'classifier__n_estimators': 225}
Training accuracy:  0.935338419983833
Test accuracy:  0.9365384615384615 

              precision    recall  f1-score   support

           0       0.98      0.90      0.94       275
           1       0.90      0.98      0.94       245

    accuracy                           0.94       520
   macro avg       0.94      0.94      0.94       520
weighted avg     