# Model Tuning -- before count binarization

## Outline

The **MLAging** workflow consists of five sections:

I. Data Preprocessing in Seurat ```preprocessing.R```

II. Model Tuning (hyperparameter selection with ```GridSearchCV```) -- **this notebook: same as the *Model Tuning* notebook, except that the count matrix is not binarized**:

1. [Data Preparation](#1.-prep)
2. [Model Tuning](#2.-tunning)
    - [Lasso](#3.-l1)
    - [Ridge](#4.-l2)
    - [ElasticNet](#5.-eln)
    
    - [Random Forest](#6.-rfc)
    - [XGBoost](#7.-xgbc)
    
    - [Support Vector Machine with rbf kernel](#8.-svc)

III. Model Comparison

IV. Final Model Over 10 Random States

V. Results and Interpretations

In [None]:
import warnings
warnings.filterwarnings('ignore')

from src.data_processing import *
from src.grid_search import *
import os
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import pickle

# dtype name kept as a notebook-level setting.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# it is still needed.
data_type = "float32"

## 1. Data Preparation <a name="1.-prep"></a>
### Load training, testing batch

In [None]:
# CSV files holding the training and testing batches, given relative to the
# notebook's working directory.
input_train = '../data/train_final_group_info.csv'
input_test = '../data/test_final_group_info.csv'

# Cell-type selector that is forwarded to data_prep.
# NOTE(review): 'All' presumably disables cell-type filtering -- confirm in
# src.data_processing.
cell_type = 'All'

In [None]:
# Build the train/test matrices and the CV splitter used by the tuning cells.
# Note the asymmetry: the *test* path is passed first, while the returned
# tuple is train-first. binarization=False keeps the counts un-binarized,
# which is what distinguishes this notebook from the "Model Tuning" one.
train_X, train_y, test_X, test_y, custom_cv = data_prep(
    input_test,
    input_train,
    cell_type,
    binarization=False,
)

In [None]:
# Wrap pr_auc_score as a scikit-learn scorer: it is fed predicted
# probabilities (needs_proba=True) and larger scores are treated as better.
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour
# of `response_method` -- confirm against the version pinned for this project.
pr_auc_scorer = make_scorer(
    pr_auc_score,
    needs_proba=True,
    greater_is_better=True,
)

## 2. Model Tuning <a name="2.-tunning"></a>

### 1) Logistic regression -- l1<a name="3.-l1"></a>

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# L1-penalised logistic regression fitted with the saga solver; max_iter is
# set very high so the iteration cap is not what stops the fit.
l1 = LogisticRegression(penalty='l1', solver='saga', max_iter=10_000_000)

# C candidates: 10 values, log-spaced from 1e-3 to 1e2.
# Presumably earlier hand-picked candidates, kept for reference:
#   0.01, 0.05, 0.1, 0.5, 1, 5, 8, 10, 20, 50, 100
#   12.5, 15, 17.5, 20, 25, 30, 35, 40
param_grid = {
    'logisticregression__C': np.logspace(-3, 2, 10),
}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_l1 = []
for i in tqdm(range(10)):
    grid, test_score = ML_pipeline_GridSearchCV(
        train_X, train_y, test_X, test_y,
        l1, param_grid, i,
        custom_cv, pr_auc_scorer,
    )
    models_l1.append(grid)

    # Per-run report: selected hyperparameters, CV score, held-out score.
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)

In [None]:
# Persist the ten L1 search objects collected in models_l1.
# The context manager guarantees the handle is closed even if pickling fails,
# so a partially written file is never left open.
with open('../results/all_cells_before/l1_models_10.save', 'wb') as file:
    pickle.dump(models_l1, file)

### 2) Logistic regression -- l2<a name="4.-l2"></a>

In [None]:
# L2-penalised (ridge) logistic regression fitted with the saga solver.
l2 = LogisticRegression(penalty='l2', solver='saga', max_iter=10000000)

# C candidates: 10 values, log-spaced from 1e-3 to 1e2.
param_grid = {'logisticregression__C': np.logspace(-3, 2, 10)}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_l2 = []
for i in tqdm(range(10)):
    # custom_cv and pr_auc_scorer are passed explicitly so that this search is
    # tuned with the same CV splitter and scorer as the L1 and ElasticNet
    # searches in this notebook; otherwise the models are not comparable.
    grid, test_score = ML_pipeline_GridSearchCV(train_X, train_y, test_X, test_y,
                                                l2, param_grid, i, custom_cv, pr_auc_scorer)
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_l2.append(grid)

In [None]:
# Persist the ten L2 search objects collected in models_l2.
# The context manager guarantees the handle is closed even if pickling fails.
with open('../results/all_cells_before/l2_models_10.save', 'wb') as file:
    pickle.dump(models_l2, file)

# Scratch expression: displays a coarser, 5-point version of the C range
# (the tuning grids in this notebook use 10 points over the same interval).
np.logspace(-3, 2, 5)

### 3) Logistic regression -- ElasticNet<a name="5.-eln"></a>

In [None]:
# Elastic-net logistic regression (mixed L1/L2 penalty) fitted with saga.
eln = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10_000_000)

# Search space: 10 log-spaced C values (1e-3 .. 1e2) crossed with four
# l1_ratio values, all at or below 0.35.
param_grid = {
    'logisticregression__C': np.logspace(-3, 2, 10),
    'logisticregression__l1_ratio': [0.05, 0.1, 0.2, 0.35],
}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_eln = []
for i in tqdm(range(10)):
    grid, test_score = ML_pipeline_GridSearchCV(
        train_X, train_y, test_X, test_y,
        eln, param_grid, i,
        custom_cv, pr_auc_scorer,
    )
    models_eln.append(grid)

    # Per-run report: selected hyperparameters, CV score, held-out score.
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)

In [None]:
# Persist the ten ElasticNet search objects collected in models_eln.
# The context manager guarantees the handle is closed even if pickling fails.
with open('../results/all_cells_before/eln_models_10_finer.save', 'wb') as file:
    pickle.dump(models_eln, file)

### 4) Random Forest Classifier<a name="6.-rfc"></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default settings; everything tuned is in the grid.
rfc = RandomForestClassifier()

# Search space: features considered per split, tree depth cap, and minimum
# samples required to split a node (None = no limit for the first two).
param_grid = {'randomforestclassifier__max_features': [10, 15, 20, 25, 50, None],
              'randomforestclassifier__max_depth': [10, 20, 30, 50, 100, None],
              'randomforestclassifier__min_samples_split': [2, 5, 10, 20]}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_rfc = []
for i in tqdm(range(10)):
    # custom_cv and pr_auc_scorer are passed explicitly so that this search is
    # tuned with the same CV splitter and scorer as the L1 and ElasticNet
    # searches in this notebook; otherwise the models are not comparable.
    grid, test_score = ML_pipeline_GridSearchCV(train_X, train_y, test_X, test_y,
                                                rfc, param_grid, i, custom_cv, pr_auc_scorer)

    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_rfc.append(grid)

In [None]:
# Persist the ten random-forest search objects collected in models_rfc.
# Written under all_cells_before/ like every other model saved by this
# notebook; the previous all_cells/ target belongs to a different results
# directory and risked overwriting results stored there.
# The context manager guarantees the handle is closed even if pickling fails.
with open('../results/all_cells_before/rfc_models_10.save', 'wb') as file:
    pickle.dump(models_rfc, file)

### 5) XGBoost Classifier<a name="7.-xgbc"></a>

In [None]:
import xgboost
from xgboost import XGBClassifier

# Gradient-boosted trees.
# NOTE(review): `use_label_encoder` is only meaningful on older xgboost
# releases -- confirm against the version pinned for this project.
xgbc = XGBClassifier(use_label_encoder=False)

# Only max_depth is actually searched; the remaining entries are single-value
# lists that pin learning rate, column/row subsampling and the eval metric.
param_grid = {'xgbclassifier__max_depth': [1, 3, 5, 10, 20, 30, 100],
              "xgbclassifier__learning_rate": [0.03],
              # Disabled candidates, kept for reference:
              #'xgbclassifier__min_child_weight': [1, 3, 5, 7],
              #'xgbclassifier__gamma': [0, 0.1, 0.2 , 0.3, 0.4],
              'xgbclassifier__colsample_bytree': [0.9],
              'xgbclassifier__subsample': [0.66],
              'xgbclassifier__eval_metric': ['logloss']}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_xgbc = []
for i in tqdm(range(10)):
    # custom_cv and pr_auc_scorer are passed explicitly so that this search is
    # tuned with the same CV splitter and scorer as the L1 and ElasticNet
    # searches in this notebook; xgbc=True is kept as in the original call.
    grid, test_score = ML_pipeline_GridSearchCV(train_X, train_y, test_X, test_y,
                                                xgbc, param_grid, i, custom_cv, pr_auc_scorer,
                                                xgbc=True)
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_xgbc.append(grid)

In [None]:
# Persist the ten XGBoost search objects collected in models_xgbc.
# The context manager guarantees the handle is closed even if pickling fails.
with open('../results/all_cells_before/xgbc_models_10.save', 'wb') as file:
    pickle.dump(models_xgbc, file)

### 6) SVC<a name="8.-svc"></a>

In [None]:
from sklearn.svm import SVC
# Support vector classifier; probability=True enables predict_proba, which a
# probability-based scorer needs.
svc = SVC(probability=True)

# gamma and C are each searched over 6 log-spaced values, 1e-3 .. 1e2.
# Presumably earlier, smaller gamma candidates, kept for reference:
#   1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1
param_grid = {'svc__gamma': np.logspace(-3, 2, 6),
              'svc__C': np.logspace(-3, 2, 6)}

# Ten grid searches, one per value of i (0..9). i is forwarded to
# ML_pipeline_GridSearchCV -- presumably as the random seed of the run;
# confirm in src.grid_search.
models_svc = []
for i in tqdm(range(10)):
    # custom_cv and pr_auc_scorer are passed explicitly so that this search is
    # tuned with the same CV splitter and scorer as the L1 and ElasticNet
    # searches in this notebook; otherwise the models are not comparable.
    grid, test_score = ML_pipeline_GridSearchCV(train_X, train_y, test_X, test_y,
                                                svc, param_grid, i, custom_cv, pr_auc_scorer)
    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_svc.append(grid)

In [None]:
# Persist the ten SVC search objects collected in models_svc.
# The context manager guarantees the handle is closed even if pickling fails.
with open('../results/all_cells_before/svc_models_10.save', 'wb') as file:
    pickle.dump(models_svc, file)