In [3]:
from utils.preprocessing import preprocessingV1
from utils.modelization import saveModel,loadModel,submitModel

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn import preprocessing

from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
#from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from scipy.stats import uniform, randint


In [9]:
df_train = pd.read_csv("datasets/train_radiomics_hipocamp.csv")
df_test = pd.read_csv("datasets/test_radiomics_hipocamp.csv")

X_train, y_train = preprocessingV1(df_train)

X_test = preprocessingV1(df_test,False)

In [45]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


dt_model = DecisionTreeClassifier(max_depth=4,random_state=2022)
bagg_dt_model = BaggingClassifier(estimator= dt_model, bootstrap= True,n_estimators=60)
bagg_rf_model = RandomForestClassifier(bootstrap= True, max_depth= 10,max_features = 'sqrt', min_samples_leaf=4, min_samples_split= 10, n_estimators= 100, random_state=123)

bagg_gb_model = GradientBoostingClassifier( learning_rate = 0.10690382853418266, max_depth = 6,
                                      max_features = None, min_samples_leaf = 8, min_samples_split = 14,
                                      n_estimators = 296, subsample = 0.9503681331729179, random_state=123)

xgb_model = XGBClassifier( colsample_bylevel= 0.7005087783306018, colsample_bytree = 0.8136585046688083, gamma = 0.16207544668977053,
                           learning_rate = 0.17342778308617596, max_depth = 7, min_child_weight = 1, n_estimators = 118, 
                           objective = 'multi:softmax', reg_alpha = 0.6317920176870504, reg_lambda = 0.44025717806407627, 
                           scale_pos_weight = 1.8372648456358434, subsample = 0.8561650906943812, random_state=123 )

lr_pipe = Pipeline(steps=[('anova',SelectKBest(score_func=f_classif, k=8)),  ('scaler', MinMaxScaler()),  ('lr', LogisticRegression(C =100, max_iter=1000,solver='newton-cg'))])

svc_pipe = Pipeline( steps=[('scaler',MinMaxScaler()),  ('svc', SVC(C = 1000, coef0 = 1.0, degree = 1, gamma = 'scale', kernel = 'sigmoid'))])
# KNN scaling


estimators = [ ("bagg_dt", bagg_dt_model), ("bagg_rf",bagg_rf_model), 
               ("bagg_gb" , bagg_gb_model), ( "xgb_model",xgb_model),
         #     ("SVC", SVC(random_state=123)), 
              ("lr_pipe", lr_pipe),
              ("KNN", KNeighborsClassifier(metric='euclidean', n_neighbors=7,weights='uniform')),
            #  ("gnb",  Pipeline( steps=[('scaler',MinMaxScaler()),('gnb', GaussianNB()) ]))
              ]


In [47]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=123)


st_model = StackingClassifier(estimators=estimators,final_estimator=RandomForestClassifier())

# st_model.fit(X_train, y_train)


grid_st = {
    'final_estimator' : [
                          #LogisticRegression(random_state=123),
                          RandomForestClassifier(random_state=123)
                          ]
}


grid_rf = GridSearchCV(estimator= st_model, 
                       param_grid= grid_st,
                       cv = cv,
                       refit=True,return_train_score=True, 
                       scoring= "f1_macro", n_jobs=-1, verbose= 1)

grid_rf.fit(X_train,y_train)

print(grid_rf.best_estimator_)

print('Best Mean Macro F1: %.3f' %  grid_rf.best_score_)
print('Best Config: %s' % grid_rf.best_params_)

Fitting 30 folds for each of 1 candidates, totalling 30 fits


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



StackingClassifier(estimators=[('bagg_dt',
                                BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=4,
                                                                                   random_state=2022),
                                                  n_estimators=60)),
                               ('bagg_rf',
                                RandomForestClassifier(max_depth=10,
                                                       min_samples_leaf=4,
                                                       min_samples_split=10,
                                                       random_state=123)),
                               ('bagg_gb',
                                GradientBoostingClassifier(learning_rate=0.10690382853418266,
                                                           max_depth=6,
                                                           min_samples_leaf=8...
                                              n_estimators=1

In [None]:
# StackingClassifier(estimators=[('bagg_dt',
#                                 BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=4,
#                                                                                    random_state=2022),
#                                                   n_estimators=60)),
#                                ('bagg_rf',
#                                 RandomForestClassifier(max_depth=10,
#                                                        min_samples_leaf=4,
#                                                        min_samples_split=10,
#                                                        random_state=123)),
#                                ('bagg_gb',
#                                 GradientBoostingClassifier(learning_rate=0.10690382853418266,
#                                                            max_depth=6,
#                                                            min_samples_leaf=8...
#                                               n_estimators=118, n_jobs=None,
#                                               num_parallel_tree=None,
#                                               objective='multi:softmax', ...)),
#                                ('lr_pipe',
#                                 Pipeline(steps=[('anova', SelectKBest(k=8)),
#                                                 ('scaler', MinMaxScaler()),
#                                                 ('lr',
#                                                  LogisticRegression(C=100,
#                                                                     max_iter=1000,
#                                                                     solver='newton-cg'))])),
#                                ('KNN',
#                                 KNeighborsClassifier(metric='euclidean',
#                                                      n_neighbors=7))],
#                    final_estimator=RandomForestClassifier(random_state=123))
# Best Mean Macro F1: 0.327
# Best Config: {'final_estimator': RandomForestClassifier(random_state=123)}

In [None]:
#grid_rf.best_estimator_.predict(X_test)

#submitModel(grid_rf.best_estimator_.predict(X_test),'test_predictions_stacking_f1_0327')
#saveModel(grid_rf.best_estimator_,"stacking4_f1_0327")