In [None]:
import os
import pandas as pd
import numpy as np
from combat.pycombat import pycombat
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import joblib
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier

In [None]:
# Make the merged-dataset folder the working directory: the relative paths used
# by the later cells ('MergedDataset.csv', '../../Modelli/...') are resolved from it.
# NOTE(review): the target is itself relative to the current directory, so running
# this cell a second time in the same kernel resolves it from the new location.
os.chdir("../Dataset/Merged")

Creazione Training e Test

In [None]:
# Read the merged table; the first CSV column is used as the row index.
dataset = pd.read_csv('MergedDataset.csv', index_col=0)

# Keep the two metadata columns aside so that only the remaining columns are
# handed to pycombat.
sampleID = dataset['SampleID']
indicator = dataset['Label']

# Batch label of every row: the text of its SampleID before the first '-'.
datasetID = sampleID.map(lambda sid: sid.split('-', 1)[0]).values

# pycombat receives the transposed table together with the batch labels; its
# result is transposed back so that rows line up with the original samples.
# NOTE(review): this runs on all rows, before the train/test split made later
# in this cell — confirm that correcting train and test together is intended.
dataset = pycombat(dataset.drop(columns=['SampleID', 'Label']).T, datasetID).T

# Put the metadata back in front of the corrected columns.
dataset.insert(0, 'SampleID', sampleID)
dataset.insert(1, 'Label', indicator)

def getPatientID(sampleID):
    """Derive the patient key from a sample identifier.

    The key is '<prefix>-<suffix>', where <prefix> is the text before the first
    '-' and <suffix> is whatever follows the first '_' inside the second
    '-'-separated field, e.g. 'A-Control_7' -> 'A-7'.

    Raises IndexError when the identifier has no '-' or when its second field
    has no '_'.
    """
    fields = sampleID.split('-')
    prefix, second = fields[0], fields[1]
    return prefix + '-' + second.split('_', 1)[1]

# Tag every row with its patient key; position 1 puts the new column right
# after 'SampleID'.
dataset.insert(1, 'PatientID', dataset['SampleID'].map(getPatientID))
print(dataset)

# One group per patient key.
gruppi = dataset.groupby(by='PatientID')

def sanity_check(gruppi):
    """Print the name of every group whose SampleIDs disagree about 'Control'.

    `gruppi` is iterated as (group name, group frame) pairs. The first SampleID
    of a group sets the reference: either it contains the substring 'Control'
    or it does not. The group is reported, once, as soon as another SampleID of
    the same group gives the opposite answer. Nothing is returned.
    """
    for group_name, group_data in gruppi:
        sample_ids = group_data['SampleID']
        first_is_control = 'Control' in sample_ids.iloc[0]
        # any() stops at the first mismatch, so each group is printed at most once.
        if any(('Control' in sid) != first_is_control for sid in sample_ids):
            print("Errore in gruppo:", group_name)

# Print every patient whose samples mix 'Control' and non-'Control' identifiers.
sanity_check(gruppi)

# Hold-out split driven by PatientID; only the first generated split is consumed.
splitter = GroupShuffleSplit(n_splits=2, test_size=0.25, random_state=42)
split = splitter.split(dataset, groups=dataset['PatientID'])
train_inds, test_inds = next(split)

# Shuffle the rows of each partition with a fixed seed.
train = dataset.iloc[train_inds].sample(frac=1, random_state=42)
test = dataset.iloc[test_inds].sample(frac=1, random_state=42)

# Size and class balance of both partitions (Label 1 = "malati", 0 = "sani").
for heading, part in (("Dataset di train:", train), ("\nDataset di test:", test)):
    print(heading)
    print(part.shape)
    print("I malati sono: ", (part['Label'] == 1).sum())
    print("I sani sono: ", (part['Label'] == 0).sum())

# Separate the target from the features; the identifier columns are not features.
metaColumns = ['SampleID', 'Label', 'PatientID']
x_train, y_train = train.drop(columns=metaColumns), train['Label']
x_test, y_test = test.drop(columns=metaColumns), test['Label']

Caricamento modelli

In [None]:
# Folder with the previously saved models; a single place to edit when the
# notebook is pointed at another model folder.
MODEL_DIR = '../../Modelli/DatasetZeroes'

# NOTE(review): joblib.load unpickles the files, which can run arbitrary code —
# load only model files produced by this project.
svc = joblib.load(f'{MODEL_DIR}/svc.pkl')
randomForest = joblib.load(f'{MODEL_DIR}/randomForest.pkl')
elasticNet = joblib.load(f'{MODEL_DIR}/elasticNet.pkl')
knn = joblib.load(f'{MODEL_DIR}/knn.pkl')
HistgradientBoosting = joblib.load(f'{MODEL_DIR}/HistgradientBoostingWithFold.pkl')
# Bound as `gradientBoosting` (lowercase g): that is the name every later cell
# reads. The previous `GradientBoosting` binding left those cells with a NameError.
gradientBoosting = joblib.load(f'{MODEL_DIR}/GradientBoostingWithFold.pkl')
naiveBayes = joblib.load(f'{MODEL_DIR}/naiveBayes.pkl')

Modello Ensemble

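Nota:
<br>
Nel caso datasetFull, la gridSearch con svc non ha probability = True
<br>
HistGradientBoosting si chiama histGradientBoostingWithFold
<br>
Sempre nel caso datasetFull, gradientBoosting non ha la gridSearch (nella tabella DatasetFull compare come Pipeline con parametri fissi): in quel caso non esiste `best_params_` e va usata la riga commentata con `GradientBoostingClassifier(max_depth=5, n_estimators=150)`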

In [None]:
# Seed passed to every estimator that accepts `random_state`, so that refitting
# the ensembles gives the same result on every run (same value used for the
# train/test split).
RANDOM_STATE = 42


def tunedParams(search, *names):
    """Return the tuned values of `names` from a fitted search, as estimator kwargs.

    The keys of `search.best_params_` carry the 'classifier__' prefix; the
    prefix is stripped so the result can be unpacked straight into the
    classifier constructor.

    Raises KeyError if one of `names` was not part of the search.
    """
    return {name: search.best_params_['classifier__' + name] for name in names}


# Fresh estimators configured with the hyper-parameters found by the loaded searches.
# probability=True is required for the soft-voting ensemble.
newSvc = SVC(
    **tunedParams(svc, 'C', 'kernel', 'gamma'),
    probability=True,
    random_state=RANDOM_STATE,
)
newRandomForest = RandomForestClassifier(
    **tunedParams(randomForest, 'n_estimators', 'max_depth'),
    random_state=RANDOM_STATE,
)
newElasticNet = LogisticRegression(
    **tunedParams(elasticNet, 'penalty', 'C', 'l1_ratio', 'solver'),
    random_state=RANDOM_STATE,
)
newKnn = KNeighborsClassifier(**tunedParams(knn, 'n_neighbors', 'weights'))
newHistGradientBoosting = HistGradientBoostingClassifier(
    **tunedParams(HistgradientBoosting, 'learning_rate', 'max_iter', 'max_depth'),
    random_state=RANDOM_STATE,
)
newGradientBoosting = GradientBoostingClassifier(
    **tunedParams(gradientBoosting, 'learning_rate', 'n_estimators', 'max_depth'),
    random_state=RANDOM_STATE,
)
# Alternative for a gradientBoosting model saved without a grid search (no best_params_):
# newGradientBoosting = GradientBoostingClassifier(max_depth=5, n_estimators=150, random_state=RANDOM_STATE)
newNaiveBayes = GaussianNB()

In [None]:
# Majority-vote ensemble over five of the newly constructed classifiers.
hardVotingMembers = [
    ('svc', newSvc),
    ('randomForest', newRandomForest),
    ('HistGradientBoosting', newHistGradientBoosting),
    ('gradientBoosting', newGradientBoosting),
    ('naiveBayes', newNaiveBayes),
]
ensembleModel = VotingClassifier(estimators=hardVotingMembers, voting='hard')

# Min-max scaling is a pipeline step, so it is fitted together with the
# ensemble on the training split.
ensemblePipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('classifier', ensembleModel),
])
ensemblePipeline.fit(x_train, y_train)

In [None]:
# Hard-voting ensemble: configuration, accuracy on both partitions, then the
# per-class report on the test partition.
print("Ensemble model:")
hardVoter = ensemblePipeline.named_steps['classifier']
print("Iperparametri: ", hardVoter)
for caption, splitX, splitY in (
    ("Training accuracy: ", x_train, y_train),
    ("Test accuracy: ", x_test, y_test),
):
    print(caption, ensemblePipeline.score(splitX, splitY))
print(classification_report(y_test, ensemblePipeline.predict(x_test)))

In [None]:
def memberScore(model):
    """Return the score used to weight one ensemble member.

    Objects exposing `best_score_` (the fitted searches) contribute that value;
    any other fitted model is scored with `.score` on the training split.
    This covers both kinds of saved model without editing the cell.

    NOTE(review): the two kinds of score are not measured on the same data, so
    the weights of search-based and plain members are only roughly comparable.
    """
    if hasattr(model, 'best_score_'):
        return model.best_score_
    return model.score(x_train, y_train)


# Same order as the estimators list below: the i-th weight belongs to the i-th estimator.
weightedMembers = [svc, randomForest, HistgradientBoosting, gradientBoosting, naiveBayes]
memberScores = [memberScore(model) for model in weightedMembers]  # every model is scored once
weightSum = sum(memberScores)
weights = [score / weightSum for score in memberScores]

# Soft voting averages the members' predicted probabilities using `weights`.
ensembleModelWeighted = VotingClassifier(
    estimators=[
        ('svc', newSvc),
        ('randomForest', newRandomForest),
        ('HistgradientBoosting', newHistGradientBoosting),
        ('gradientBoosting', newGradientBoosting),
        ('naiveBayes', newNaiveBayes),
    ],
    voting='soft',
    weights=weights,
    n_jobs=-1,
)
ensemblePipeline2 = Pipeline(steps=[('scaler', MinMaxScaler()), ('classifier', ensembleModelWeighted)])
ensemblePipeline2.fit(x_train, y_train)

In [None]:
# Weighted soft-voting ensemble: configuration, accuracy on both partitions,
# then the per-class report on the test partition.
print("Ensemble weighted model:")
softVoter = ensemblePipeline2.named_steps['classifier']
print("Iperparametri: ", softVoter)
for caption, splitX, splitY in (
    ("Training accuracy: ", x_train, y_train),
    ("Test accuracy: ", x_test, y_test),
):
    print(caption, ensemblePipeline2.score(splitX, splitY))
print(classification_report(y_test, ensemblePipeline2.predict(x_test)))

In [None]:
def applyEnsemble(x, models, threshold=0.5):
    """Predict labels by averaging the class-1 probabilities of several models.

    Parameters
    ----------
    x : sized collection of samples accepted by each model's `predict_proba`.
    models : non-empty sequence of fitted objects exposing `predict_proba`;
        column 1 of each result is used as the probability of label 1.
    threshold : cut-off applied to the averaged probability (default 0.5,
        the value that was previously hard-coded).

    Returns
    -------
    numpy.ndarray with one 0/1 integer per sample: 1 only where the mean
    probability is strictly greater than `threshold`.

    Raises
    ------
    ValueError
        If `models` is empty. The average would be 0/0, which used to come back
        silently as an all-zero prediction.
    """
    if len(models) == 0:
        raise ValueError("applyEnsemble needs at least one model")

    # Accumulate model by model, in the given order, on whole arrays at once.
    summed = np.zeros(len(x))
    for model in models:
        summed = summed + np.asarray(model.predict_proba(x))[:, 1]

    meanProba = summed / len(models)
    # bool * 1 turns the comparison mask into 0/1 integers.
    return (meanProba > threshold) * 1

In [None]:
# Manual ensemble: average the probabilities of the loaded models themselves
# (not the re-created estimators), first on the training split, then on the test split.
models = [svc, randomForest, HistgradientBoosting, gradientBoosting, naiveBayes]
trainResult, testResult = (applyEnsemble(part, models) for part in (x_train, x_test))

In [None]:
# Manual ensemble: accuracy is the share of predictions equal to the true label.
print("Manual ensemble model:")
print("Models: svc, randomForest, HistgradientBoosting, gradientBoosting, naiveBayes")
trainHits = (trainResult == y_train).sum()
print("Training accuracy: ", trainHits / len(y_train))
testHits = (testResult == y_test).sum()
print("Test accuracy: ", testHits / len(y_test))
print(classification_report(y_test, testResult))

<h1> DatasetFull </h1>

| Modello                     | Iperparametri                                                                                                               | Training Accuracy | Test Accuracy | Precision | Recall | F1-score | Support |
|-----------------------------|-----------------------------------------------------------------------------------------------------------------------------|-------------------|---------------|-----------|--------|----------|---------|
| SVC                         | {'classifier__C': 128, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf'}                                      | 0.9448            | 0.95          | 0.95      | 0.95   | 0.95     | 520     |
| Random Forest               | {'classifier__max_depth': 9, 'classifier__n_estimators': 200}                                                             | 0.9347            | 0.9423        | 0.94      | 0.94   | 0.94     | 520     |
| Elastic Net                 | {'classifier__C': 0.5, 'classifier__l1_ratio': 1, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'} | 0.7062            | 0.6865        | 0.68      | 0.69   | 0.68     | 520     |
| KNN                         | {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}                                                          | 0.8920            | 0.8827        | 0.88      | 0.88   | 0.88     | 520     |
| Hist Gradient Boosting      | {'classifier__learning_rate': 0.1, 'classifier__max_depth': 9, 'classifier__max_iter': 200}                               | 0.9491            | 0.9538        | 0.95      | 0.95   | 0.95     | 520     |
| Gradient Boosting           | Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GradientBoostingClassifier(max_depth=5, n_estimators=150))])  | 1.0               | 0.9442        | 0.94      | 0.95   | 0.94     | 520     |
| Naive Bayes                 | Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GaussianNB())])                                               | 0.9153            | 0.9404        | 0.94      | 0.94   | 0.94     | 520     |
| Ensemble model              | VotingClassifier(estimators=[('svc', SVC(C=128, gamma=0.0078125, probability=True)), ...                                  | 1.0               | 0.95          | 0.95      | 0.95   | 0.95     | 520     |
| Ensemble weighted model     | VotingClassifier(estimators=[('svc', SVC(C=128, gamma=0.0078125, probability=True)), ...                                  | 1.0               | 0.9462        | 0.95      | 0.95   | 0.95     | 520     |
| Manual ensemble model       | Models: svc, randomForest, knn, HistgradientBoosting, gradientBoosting, naiveBayes                                         | 1.0               | 0.95          | 0.95      | 0.95   | 0.95     | 520     |


<h1> DatasetZeroes </h1>

| Modello                     | Iperparametri                                                                                                                               | Training Accuracy | Test Accuracy | Precision | Recall | F1-score | Support |
|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|-------------------|---------------|-----------|--------|----------|---------|
| SVC                         | {'classifier__C': 8, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf', 'classifier__probability': True}                        | 0.9278            | 0.9462        | 0.95      | 0.95   | 0.95     | 520     |
| Random Forest               | {'classifier__max_depth': 12, 'classifier__n_estimators': 225}                                                                           | 0.9353            | 0.9365        | 0.94      | 0.94   | 0.94     | 520     |
| Elastic Net                 | {'classifier__C': 0.5, 'classifier__l1_ratio': 1, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}                   | 0.6296            | 0.6404        | 0.67      | 0.63   | 0.61     | 520     |
| KNN                         | {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}                                                                          | 0.6692            | 0.6769        | 0.73      | 0.69   | 0.67     | 520     |
| Hist Gradient Boosting      | {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__max_iter': 200}                                             | 0.9510            | 0.9558        | 0.96      | 0.96   | 0.96     | 520     |
| Gradient Boosting           | {'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__n_estimators': 150}       | 0.9485            | 0.95          | 0.95      | 0.95   | 0.95     | 520     |
| Naive Bayes                 | Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GaussianNB())])                                                             | 0.9184            | 0.9404        | 0.94      | 0.94   | 0.94     | 520     |
| Ensemble model              | VotingClassifier(estimators=[('svc', SVC(C=8, gamma=0.0078125, probability=True)), ...                                                     | 0.9711            | 0.9462        | 0.95      | 0.95   | 0.95     | 520     |
| Ensemble weighted model     | VotingClassifier(estimators=[('svc', SVC(C=8, gamma=0.0078125, probability=True)), ...                                                     | 0.9912            | 0.9481        | 0.95      | 0.95   | 0.95     | 520     |
| Manual ensemble model       | Models: svc, randomForest, HistgradientBoosting, gradientBoosting, naiveBayes                                                            | 0.9887            | 0.9481        | 0.95      | 0.95   | 0.95     | 520     |


<h1> DatasetReduced </h1>

| Modello                     | Iperparametri                                                                                                                    | Training Accuracy | Test Accuracy | Precision | Recall | F1-score | Support |
|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------|-------------------|---------------|-----------|--------|----------|---------|
| SVC                         | {'classifier__C': 128, 'classifier__gamma': 0.0078125, 'classifier__kernel': 'rbf', 'classifier__probability': True}          | 0.9784            | 0.9526        | 0.95      | 0.95   | 0.95     | 464     |
| Random Forest               | {'classifier__max_depth': 12, 'classifier__n_estimators': 225}                                                                  | 0.9724            | 0.9353        | 0.94      | 0.94   | 0.94     | 464     |
| Elastic Net                 | {'classifier__C': 0.5, 'classifier__l1_ratio': 1, 'classifier__penalty': 'elasticnet', 'classifier__solver': 'saga'}        | 0.7284            | 0.7672        | 0.76      | 0.73   | 0.74     | 464     |
| KNN                         | {'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}                                                                 | 0.7970            | 0.7522        | 0.76      | 0.70   | 0.71     | 464     |
| Hist Gradient Boosting      | {'classifier__learning_rate': 0.1, 'classifier__max_depth': 9, 'classifier__max_iter': 100}                                    | 0.9784            | 0.9504        | 0.94      | 0.95   | 0.95     | 464     |
| Gradient Boosting           | {'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 3, 'classifier__n_estimators': 200} | 0.9754            | 0.9418        | 0.94      | 0.94   | 0.94     | 464     |
| Naive Bayes                 | Pipeline(steps=[('Scaling', MinMaxScaler()), ('classifier', GaussianNB())])                                                    | 0.9239            | 0.9052        | 0.93      | 0.88   | 0.89     | 464     |
| Ensemble model              | VotingClassifier(estimators=[('svc', SVC(C=128, gamma=0.0078125, probability=True)), ...                                          | 1.0               | 0.9461        | 0.95      | 0.94   | 0.94     | 464     |
| Ensemble weighted model     | VotingClassifier(estimators=[('svc', SVC(C=128, gamma=0.0078125, probability=True)), ...                                          | 1.0               | 0.9483        | 0.95      | 0.95   | 0.95     | 464     |
| Manual ensemble model       | Models: svc, randomForest, HistgradientBoosting, gradientBoosting, naiveBayes                                                   | 1.0               | 0.9461        | 0.94      | 0.94   | 0.94     | 464     |
