In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Optional Google Colab setup: mount Google Drive and point `stats_path` at the
# study folder stored there. Outside Colab the `google.colab` package cannot be
# imported, so the cell is skipped instead of aborting a "Run All".
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available: skipping the Google Drive mount")
else:
    drive.mount('/content/drive/')
    # Relative path: presumably resolved from Colab's default working
    # directory /content -- TODO confirm.
    stats_path = "./drive/MyDrive/Epilepsy"

In [116]:
# Local layout: the statistics live in the "stats" sub-folder of the study folder.
folder_path = "../study"
stats_path = f"{folder_path}/stats"

In [117]:
# Load the complete dataset, using the "ID" column as the row index.
df = pd.read_csv(f"{stats_path}/dataset.csv", index_col="ID")

# Split the predictors from the two response columns; "resp" is the target.
y = df["resp"]
X = df.drop(columns=["resp", "respPart"])

# Column groups. The dMRI statistics and the tract counts are matched by name
# pattern; the remaining continuous and discrete variables are listed by hand.
col_dMRI = X.filter(regex=r'mean|std|skew|kurt').columns
col_nTract = X.filter(regex=r'nTracts').columns
col_cont = [
    "age",
    "therapy_duration",
    "epilepsy_onset_age",
    "epilepsy_duration",
    *col_nTract,
    *col_dMRI,
]
col_disc = ["sex", "AEDs", "benzo", "epilepsy_type"]

# Quick sanity check on the resulting shapes.
print("X:", X.shape)
print("y:", y.shape)

X: (19, 2042)
y: (19,)


In [118]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
# One seed shared by both splitters so the partitions are reproducible.
seed = 9

# Stratified 5-fold CV, and 5 stratified random splits holding out 20% each.
skf = StratifiedKFold(random_state=seed, shuffle=True, n_splits=5)
sss = StratifiedShuffleSplit(random_state=seed, test_size=0.2, n_splits=5)

# Print the row positions assigned to the train and test side of every split.
for train, test in sss.split(X, y):
    print(train, test)

[15  8  6 16 10  9 14  4 12  2  3 11  0 13 17] [ 1 18  5  7]
[15 14  2 11  8  5  1 17  0  6 10 18 16  9  3] [ 4 12  7 13]
[13  6 11 12 16  8  1 18  3 14  9  2  4 10 17] [ 7  5 15  0]
[11 13  5 18 12 15  4  0 14  9  3  2 17  6 10] [ 8 16  1  7]
[17  3 11  4 13  0  7  2 18  6 16 10  5 14  8] [ 1 12 15  9]


In [119]:
# Splitters for the nested cross-validation. Each level gets its own seed:
# the outer splitter estimates the score, the inner one drives the
# hyper-parameter search. Previously both were built with `seed`, which left
# `seedOuter` and `seedInner` unused.
seedOuter = 7
seedInner = 13
sssOuter = StratifiedShuffleSplit(n_splits=5, test_size=1/5, random_state=seedOuter)
sssInner = StratifiedShuffleSplit(n_splits=5, test_size=1/5, random_state=seedInner)

# Building a model

## Histogram-based Gradient Boosting Classification Tree

This is a classifier similar to the **Gradient Boosting Classifier**, but it can also work with features that have NaN values. The implementation is inspired by [LightGBM](https://github.com/Microsoft/LightGBM). It is much faster than the standard Gradient Boosting implementation.

For binary classification, the ```log_loss``` is used as the classification loss.
The number of bins is controlled by ```max_bins```. Using fewer bins acts as a form of regularization. It is generally recommended to use as many bins as possible, which is the default.
The ```l2_regularization``` parameter is a regularization term, and corresponds to $\lambda$.
Early stopping is controlled by ```early_stopping```, ```scoring```, ```validation_fraction```, ```n_iter_no_change```, and ```tol```.

The algorithm has native support for categorical features. To enable this support, a boolean mask can be passed to ```categorical_features```, indicating which features are categorical. Alternatively, one can pass a list of integers with the indices of the categorical features.

Some constraints can also be declared, which restrict the relationships the model is allowed to learn: *monotonic constraints* and *interaction constraints*.

In [127]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import f_classif, mutual_info_classif, VarianceThreshold, SelectPercentile, SequentialFeatureSelector, RFECV, SelectKBest
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

def myScoreFunc(X, y, random_state=0):
    """Score every feature as mutual information plus the ANOVA F-statistic.

    Intended as a ``score_func`` for univariate selectors such as
    ``SelectKBest``, which call it as ``score_func(X, y)``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed on to both scoring functions.
    y : array-like
        Class labels.
    random_state : int or None, default=0
        Seed forwarded to ``mutual_info_classif``. Its estimate is randomized,
        so without a fixed seed the feature ranking can change between runs.
        Pass ``None`` to restore the unseeded behaviour.

    Returns
    -------
    The element-wise sum of the two score arrays (one value per feature).
    """
    mi_score = mutual_info_classif(X, y, random_state=random_state)
    # f_classif returns (F-statistics, p-values); only the statistics are used.
    f_score = f_classif(X, y)[0]
    # NOTE(review): the two scores are on different scales, so one of them may
    # dominate the sum -- confirm that an unweighted sum is intended.
    return mi_score + f_score

def estimate_score_NestedCV(estimator, X, y, hyperPars, cvOuter, cvInner):
    """Estimate the ROC-AUC of ``estimator`` with nested cross-validation.

    The inner loop (``cvInner``) is a grid search over ``hyperPars``; the outer
    loop (``cvOuter``) scores the refitted best model of every outer split, so
    the hyper-parameter search never sees the outer test rows.

    Parameters
    ----------
    estimator : estimator or Pipeline to tune.
    X, y : data and targets passed to ``cross_val_score``.
    hyperPars : dict or list of dicts, forwarded as ``param_grid``.
    cvOuter : cross-validation splitter used for the score estimate.
    cvInner : cross-validation splitter used inside the grid search.

    Returns
    -------
    The array of outer-fold ROC-AUC scores. The mean and standard deviation
    are also printed, as before; the scores used to be discarded.
    """
    # Inner cross-validation: hyper-parameter selection
    model = GridSearchCV(
        estimator=estimator,
        param_grid=hyperPars,
        scoring="roc_auc",
        n_jobs=-1,
        cv=cvInner,
        refit=True
    )

    # Outer cross-validation: estimation of the score
    scores = cross_val_score(
        model, X, y,
        scoring="roc_auc",
        cv=cvOuter,
        n_jobs=-1
    )

    print('ROC-AUC: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    print("----------------------------")

    return scores

# Grid searched for the final classifier (pipeline step "hist").
hyperPars = {
    "hist__learning_rate" : [0.01, 0.1],
    "hist__max_iter" : [100, 200],
    "hist__max_depth" : [8, 16, 32],
    "hist__l2_regularization" : [0, 0.1, 1],
}

models = []

# Without feature Selection
for scaler in [StandardScaler(), RobustScaler()]:
    print("Scaler:", scaler.__class__.__name__, ", filter: No")

    # Scaling: only the continuous columns are scaled; every column that is
    # not listed in col_cont is forwarded unchanged.
    pre = ColumnTransformer(
        [("scaling", scaler, col_cont)],
        remainder="passthrough", # one hot or other stuff
    )

    # ColumnTransformer outputs the transformed columns first and the
    # passthrough remainder last, so the categorical columns are the trailing
    # ones. Their count comes from col_disc instead of a hard-coded 4.
    # NOTE(review): this assumes col_cont + col_disc cover X exactly, and that
    # the discrete columns are already encoded the way the classifier expects
    # -- confirm.
    mask_cat = np.array([False,]*X.shape[1])
    mask_cat[-len(col_disc):] = True

    # min_samples_leaf defaults to 20. With fewer than 20 training rows per
    # fold no node could ever be split and every tree would be a single leaf,
    # so the value is lowered for this small dataset.
    hist = HistGradientBoostingClassifier(
        categorical_features = mask_cat,
        min_samples_leaf = 2,
    )

    # Pipe
    pipe = Pipeline([
        ("pre", pre),
        ("hist", hist)
    ])

    estimate_score_NestedCV(pipe, X, y, hyperPars, sssOuter, sssInner)

# Extend the shared grid with the number of features kept by the forward selector.
hyperPars["selection__n_features_to_select"] = [5, 10, 15]

# Forward method
for scaler in [StandardScaler(), RobustScaler()]:
    print("Scaler:", scaler.__class__.__name__, ", filter: Forward")

    # Scaling: only the continuous columns are scaled; every column that is
    # not listed in col_cont is forwarded unchanged.
    pre = ColumnTransformer(
            [("scaling", scaler, col_cont)],
            remainder="passthrough", # one hot or other stuff
        )

    # Remove constant features. With so few rows left after the splits, a
    # feature can take a single value inside a training fold; VarianceThreshold
    # drops such columns before the selection step.
    varThres = VarianceThreshold()

    # min_samples_leaf defaults to 20. With fewer than 20 training rows per
    # fold no node could ever be split, which would make both the selector's
    # internal scores and the final model uninformative, so it is lowered.
    hist = HistGradientBoostingClassifier(min_samples_leaf=2)

    # Feature Selection (the "hist__*" grid entries only reach the final
    # pipeline step, not the estimator held by the selector).
    selection = SequentialFeatureSelector(
        estimator = hist,
        direction="forward",
        scoring="roc_auc",
        cv=sss,
    )

    # Pipe
    pipe = Pipeline([
        ("pre", pre),
        ("varThres", varThres),
        ("selection", selection),
        ("hist", hist)
    ])

    estimate_score_NestedCV(pipe, X, y, hyperPars, sssOuter, sssInner)

# RFECV ranks features through `coef_` or `feature_importances_`.
# HistGradientBoostingClassifier exposes neither, so it cannot drive the
# elimination; a GradientBoostingClassifier is used for the ranking only.
from sklearn.ensemble import GradientBoostingClassifier

# Swap the forward-selection grid entry for the one understood by RFECV.
del hyperPars["selection__n_features_to_select"]
hyperPars["selection__min_features_to_select"] = [2]

# Recursive method
for scaler in [StandardScaler(), RobustScaler()]:
    print("Scaler:", scaler.__class__.__name__, ", filter: Recoursive")

    # Scaling: only the continuous columns are scaled; every column that is
    # not listed in col_cont is forwarded unchanged.
    pre = ColumnTransformer(
            [("scaling", scaler, col_cont)],
            remainder="passthrough", # one hot or other stuff
        )

    # Remove constant features. With so few rows left after the splits, a
    # feature can take a single value inside a training fold; VarianceThreshold
    # drops such columns before the selection step.
    varThres = VarianceThreshold()

    # Final classifier. min_samples_leaf defaults to 20; with fewer than 20
    # training rows per fold no node could ever be split, so it is lowered.
    hist = HistGradientBoostingClassifier(min_samples_leaf=2)

    # Feature Selection
    # NOTE(review): unlike the histogram-based model, GradientBoostingClassifier
    # does not accept NaN values -- confirm the features contain none.
    selection = RFECV(
        estimator=GradientBoostingClassifier(random_state=seed),
        step=1,
        scoring="roc_auc",
        cv=sss,
        min_features_to_select=2,
        n_jobs=-1
    )

    # Pipe
    pipe = Pipeline([
        ("pre", pre),
        ("varThres", varThres),
        ("selection", selection),
        ("hist", hist)
    ])

    estimate_score_NestedCV(pipe, X, y, hyperPars, sssOuter, sssInner)


Scaler: StandardScaler , filter: Recoursive


KeyboardInterrupt: 

In [22]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression, ElasticNet, LinearRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, roc_auc_score

# NOTE(review): this notebook defines myScoreFunc in more than one cell; the
# definition executed last is the one in effect.
def myScoreFunc(X, y, random_state=0):
    """Combined univariate score: mutual information + ANOVA F-statistic.

    Usable as ``score_func`` of ``SelectKBest``, which calls it with ``(X, y)``.

    Parameters
    ----------
    X : array-like
        Feature matrix given to both scorers.
    y : array-like
        Class labels.
    random_state : int or None, default=0
        Seed for ``mutual_info_classif``, whose estimate is randomized; fixing
        it keeps the selected features stable across runs. ``None`` gives the
        former unseeded behaviour.

    Returns
    -------
    One score per feature: the element-wise sum of the two score arrays.
    """
    mi_score = mutual_info_classif(X, y, random_state=random_state)
    # f_classif returns (F-statistics, p-values); keep the statistics only.
    f_score = f_classif(X, y)[0]
    # NOTE(review): the two terms are on different scales -- confirm that an
    # unweighted sum is the intended combination.
    return mi_score + f_score

# Candidate estimators: name -> (estimator, parameter grid).
# Grid keys are prefixed with the pipeline step they configure:
# "selection__" for SelectKBest and "classifier__" for the final estimator.
models = {
    # SVM
    "linearSVM" : (
        LinearSVC(), 
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
            "classifier__C" : (1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1),
        }
    ),
    "SVM" : (
        SVC(),
        [
            {
                "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
                "classifier__C" :(1e-3, 1e-2, 1e-1, 1),
                "classifier__kernel" : ['linear'], 
            },
            {
                "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
                "classifier__C" :(1e-3, 1e-2, 1e-1, 1),
                "classifier__kernel" : ['poly'],
                "classifier__degree" : (2, 3, 4),
            },
            {
                "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
                "classifier__C" :(1e-3, 1e-2, 1e-1, 1),
                "classifier__kernel" : ['rbf'], 
                "classifier__gamma" : (1e-3, 1e-2, 1e-1, 1),
            },
        ]
    ),
    # Linear 
    "LogReg" : (
        LogisticRegression(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
            "classifier__penalty" : ["l2"],
            "classifier__dual" : [True],
            # The dual formulation is only implemented by the liblinear
            # solver; the default solver rejects dual=True.
            "classifier__solver" : ["liblinear"],
            "classifier__C" :(1e-3, 1e-2, 1e-1),
        },
    ),
    # NOTE(review): LinearRegression and ElasticNet are regressors; they are
    # scored with "roc_auc" and their predictions are later treated as class
    # labels -- confirm they behave as intended in this classification study.
    "LinReg" : (
        LinearRegression(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13, 17, 19),
        }
    ),
    "ridgeReg" : (
        RidgeClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__alpha" : (0.5, 1, 5, 10, 20, 40)
        }
    ),
    "elasticNet" : (
        ElasticNet(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__alpha" : (0.5, 1, 5, 10, 20, 40),  # 0 == Linear Regression
            "classifier__l1_ratio" : (0.5, 1), # 0 == Ridge Regression, 1 == Lasso Regression
        },
    ),
    # Nearest Neighbors
    "neighbors" : (
        KNeighborsClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__n_neighbors" : (2, 3, 5),
            "classifier__weights" : ("uniform", "distance"),
        }
    ),
    # Naive Bayes
    # NOTE(review): MultinomialNB, ComplementNB and CategoricalNB expect
    # non-negative features, while the scalers used upstream produce negative
    # values -- confirm these fits do not all fail.
    "gaussianNaive" : (
        GaussianNB(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
        }
    ),
    "multinomialNaive" : (
        MultinomialNB(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
        }
    ),
    "complementNaive" : (
        ComplementNB(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
        }
    ),
    "bernulliNaive" : (
        BernoulliNB(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
        }
    ),
    "categoricalNaive" : (
        CategoricalNB(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
        }
    ),
    # Tree
    "tree" : (
        DecisionTreeClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__criterion" : ("gini", "entropy", "log_loss")
            # ccp_alpha is a regularization term (to try)
        }
    ),
    # Ensemble
    "forest" : (
        RandomForestClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__n_estimators" : (100, 500, 1000),
            # Check whether it makes sense to also tune the criterion (see the tree)
            "classifier__bootstrap" : [True],
            "classifier__max_samples" : [0.5],
            "classifier__max_features" : ["log2"],
            # Was misspelled "warm_star", which is not a valid parameter name.
            "classifier__warm_start" : [True],
            "classifier__oob_score" : [True],
            "classifier__max_depth" : [10, 20, 30],
            # ccp_alpha still to be checked
        }
    ),
    "extraForest" : (
        ExtraTreesClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__n_estimators" : (100, 500, 1000),
            # Check whether it makes sense to also tune the criterion (see the tree)
            "classifier__bootstrap" : [True],
            "classifier__max_samples" : [0.5],
            "classifier__max_features" : ["log2"],
            # Was misspelled "warm_star", which is not a valid parameter name.
            "classifier__warm_start" : [True],
            "classifier__oob_score" : [True],
            "classifier__max_depth" : [10, 20, 30],
            # ccp_alpha still to be checked
        }
    ),
    "gradientBoosting": (
        GradientBoostingClassifier(),
        {
            "selection__k" : (2, 3, 5, 7, 9, 11, 13),
            "classifier__n_estimators" : (100, 200, 500, 1000),
            "classifier__learning_rate" : (0.05, 0.1),
            # Check whether it makes sense to also tune the criterion (see the tree)
            # GradientBoostingClassifier has no "max_samples"; the fraction of
            # rows drawn for each tree is set with "subsample".
            "classifier__subsample" : [0.5],
            "classifier__max_features" : ["log2"],
            # Was misspelled "warm_star", which is not a valid parameter name.
            "classifier__warm_start" : [True],
            "classifier__max_depth" : [10, 20, 30],
            "classifier__validation_fraction" : [0.20],
            "classifier__n_iter_no_change" : [50],
            # ccp_alpha still to be checked
        }
    )
}

# It's important to evaluate each algorithm with the same training data and test data
seed = 7
sss = StratifiedShuffleSplit(n_splits=20, test_size=1/3, random_state=seed)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

# NOTE(review): X_train, y_train, X_test and y_test are not created in this
# cell; they must come from an earlier cell -- confirm they survive a
# "Restart & Run All".

# Last fitted search per model name (kept for backward compatibility) ...
fitted_models = {}
# ... and every fitted search, keyed by (model, scaler, score function), so the
# runs of one model no longer overwrite each other.
fitted_runs = {}

for modelName, (classifier, grid) in models.items():
    for scaler in [StandardScaler(), RobustScaler()]:
        # `score_func` rather than `filter`, which shadowed the builtin.
        for score_func in [f_classif, mutual_info_classif, myScoreFunc]:
            scalerName = scaler.__class__.__name__
            print("modelName:", modelName, ", scaler:", scalerName, ", filter:", score_func.__name__)

            # Scaling: only the continuous columns are scaled; every column
            # that is not listed in col_cont is forwarded unchanged.
            pre = ColumnTransformer(
                    [("scaling", scaler, col_cont)],
                    remainder="passthrough", # one hot or other stuff
                    n_jobs=-1
                )

            # Remove constant features. With so few rows left after the splits,
            # a feature can take a single value inside a training fold;
            # VarianceThreshold drops such columns before the selection step.
            varThres = VarianceThreshold()

            # Feature Selection
            selection = SelectKBest(
                    score_func=score_func,
                )

            # Pipe
            pipe = Pipeline([
                ("pre", pre),
                ("varThres", varThres),
                ("selection", selection),
                ("classifier", classifier)
            ])

            model = GridSearchCV(
                estimator=pipe,
                param_grid=grid,
                scoring="roc_auc",
                n_jobs=-1,
                cv=skf,
            )

            model.fit(X_train, y_train)
            fitted_models[modelName] = model
            fitted_runs[(modelName, scalerName, score_func.__name__)] = model

            # Train
            print(modelName, "train score:", model.best_score_)
            print(model.best_params_) 

            # One row per parameter combination, labelled by its string form.
            idx = [str(param) for param in model.cv_results_["params"]]
            results = pd.DataFrame(
                {
                    column: model.cv_results_[column]
                    for column in ("mean_test_score", "std_test_score", "rank_test_score")
                },
                index=idx,
            )
            print(results.sort_values("rank_test_score")[:10])

            # Test
            # NOTE(review): confusion_matrix needs class labels; the regressors
            # listed in `models` return continuous predictions -- confirm.
            y_pred = model.predict(X_test)
            confusionMatrix = confusion_matrix(y_test, y_pred)  
            print(confusionMatrix)  
            print("Balanced Accuracy Score", balanced_accuracy_score(y_test, y_pred))

            # ROC-AUC must be computed from continuous scores; with hard labels
            # the curve collapses to a single operating point. Fall back to the
            # labels only when the estimator offers nothing better.
            if hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            elif hasattr(model, "predict_proba"):
                # Column 1 is the second class in classes_ (assumes a binary target).
                y_score = model.predict_proba(X_test)[:, 1]
            else:
                y_score = y_pred
            print("Area Under ROC", roc_auc_score(y_test, y_score))


modelName: linearSVM , scaler: StandardScaler , filter: f_classif
linearSVM train score: 0.8333333333333334
{'classifier__C': 1e-06, 'selection__k': 17}
                                               mean_test_score   
{'classifier__C': 0.0001, 'selection__k': 19}         0.833333  \
{'classifier__C': 0.0001, 'selection__k': 17}         0.833333   
{'classifier__C': 0.001, 'selection__k': 17}          0.833333   
{'classifier__C': 1e-06, 'selection__k': 17}          0.833333   
{'classifier__C': 1e-06, 'selection__k': 19}          0.833333   
{'classifier__C': 0.001, 'selection__k': 19}          0.833333   
{'classifier__C': 0.01, 'selection__k': 19}           0.833333   
{'classifier__C': 0.01, 'selection__k': 17}           0.833333   
{'classifier__C': 1e-05, 'selection__k': 19}          0.833333   
{'classifier__C': 1e-05, 'selection__k': 17}          0.833333   

                                               std_test_score  rank_test_score  
{'classifier__C': 0.0001, 'selection__k

KeyboardInterrupt: 

In [54]:

# Summarise the grid search held in `model`: one row per parameter combination
# (labelled by its string form) with mean/std test score and rank, then show
# the ten best-ranked rows.
idx = list(map(str, model.cv_results_["params"]))
grid = pd.DataFrame(
    {
        "mean_test_score": model.cv_results_["mean_test_score"],
        "std_test_score": model.cv_results_["std_test_score"],
        "rank_test_score": model.cv_results_["rank_test_score"],
    },
    index=idx,
)
print(grid.sort_values("rank_test_score").head(10))

                                              mean_test_score  std_test_score   
{'classifier__C': 0.001, 'selection__k': 3}          0.766667        0.334996  \
{'classifier__C': 1e-06, 'selection__k': 3}          0.766667        0.334996   
{'classifier__C': 0.01, 'selection__k': 3}           0.766667        0.334996   
{'classifier__C': 0.0001, 'selection__k': 3}         0.766667        0.334996   
{'classifier__C': 1e-05, 'selection__k': 3}          0.766667        0.334996   
{'classifier__C': 0.1, 'selection__k': 3}            0.766667        0.334996   
{'classifier__C': 1, 'selection__k': 5}              0.750000        0.314024   
{'classifier__C': 0.001, 'selection__k': 5}          0.733333        0.326599   
{'classifier__C': 0.01, 'selection__k': 2}           0.733333        0.359011   
{'classifier__C': 0.01, 'selection__k': 5}           0.733333        0.326599   

                                              rank_test_score  
{'classifier__C': 0.001, 'selection__k': 3} 

In [37]:
# String form of every parameter combination tried by the grid search in `model`.
list(map(str, model.cv_results_["params"]))

["{'classifier__C': 1e-06, 'selection__k': 2}",
 "{'classifier__C': 1e-06, 'selection__k': 3}",
 "{'classifier__C': 1e-06, 'selection__k': 5}",
 "{'classifier__C': 1e-06, 'selection__k': 7}",
 "{'classifier__C': 1e-06, 'selection__k': 9}",
 "{'classifier__C': 1e-06, 'selection__k': 11}",
 "{'classifier__C': 1e-06, 'selection__k': 13}",
 "{'classifier__C': 1e-06, 'selection__k': 17}",
 "{'classifier__C': 1e-06, 'selection__k': 19}",
 "{'classifier__C': 1e-05, 'selection__k': 2}",
 "{'classifier__C': 1e-05, 'selection__k': 3}",
 "{'classifier__C': 1e-05, 'selection__k': 5}",
 "{'classifier__C': 1e-05, 'selection__k': 7}",
 "{'classifier__C': 1e-05, 'selection__k': 9}",
 "{'classifier__C': 1e-05, 'selection__k': 11}",
 "{'classifier__C': 1e-05, 'selection__k': 13}",
 "{'classifier__C': 1e-05, 'selection__k': 17}",
 "{'classifier__C': 1e-05, 'selection__k': 19}",
 "{'classifier__C': 0.0001, 'selection__k': 2}",
 "{'classifier__C': 0.0001, 'selection__k': 3}",
 "{'classifier__C': 0.0001, 's