# ML Models
by __Pawel Rosikiewicz__ 

---

## Setup
---

__global imports__
* I purposely placed other imports, such as my custom-made functions for this project, in each section
* to allow fast inspection of my code, and to make it easy to copy these imports into new notebooks for pipeline development

In [1]:
import os
import sys
import re # module to use regular expressions, 
import glob # lists names in folders that match Unix shell patterns
import warnings
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn import set_config

In [2]:
# Project root = parent of the directory the notebook kernel was started in.
# NOTE: the working directory is moved to that parent below, so executing
# this cell a second time climbs one more level up the tree.
basedir = os.path.split(os.getcwd())[0]
sys.path.append(basedir)  # lets `src.*` style imports resolve from the root
os.chdir(basedir)

In [3]:
# Project folders, all resolved relative to `basedir`:
# raw data, interim data, results, and saved models (in that order).
PATH_data_raw, PATH_data_interim, PATH_results, PATH_models = (
    os.path.join(basedir, subfolder)
    for subfolder in ("data/raw", "data/interim", "data/results", "models")
)

__load functions, and classes created for that project__

In [25]:
# classifiers used
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# pipeline and model selection
from sklearn import set_config
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline, Pipeline # like pipeline function, but give step names automatically, 
from sklearn.decomposition import PCA

# feature transformations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer # creates custom transfomers
from sklearn.compose import ColumnTransformer # allows using different transformers to different columns
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer # skleanr transformers
from sklearn.preprocessing import RobustScaler # creates custom transfomers

# my custom functions
from src.utils.helper_data_loaders import load_tsv
from sklearn.metrics import classification_report
from src.utils.helper_merge_dfs import merge_data

__configurations__

## PART 1. RUN SELECTED MODELS

__I selected the following__
* __most frequent baseline__, because we have two classes and 3/4 of the examples come from one class; this baseline will be difficult to beat with most models when using inappropriately or badly prepared data
* __out-of-the-box, fast ML models__, because I had time and resource limitations, and my main focus was to create a working data-centric pipeline that may be used with hundreds of models and hundreds of dataset variants. These were:
    * knn
    * logistic regression
    * random forest
    
__all models were trained with__
* all the datasets that I created in notebook 3
* a large number of different hyperparameters

__all models provided predictions__
* for exactly the same data points in the train/valid data, which allows me to perform error analysis such as:
    * consistently misclassified samples
* __IMPORTANT__: my pipeline in notebooks 02 and 03 allows creating any number of train/validation dataset combinations. Ideally, I would create several of these datasets, evaluate each model on all of them, and perform cross-validation. This was not done only because of lack of time, but the pipeline provides all the possibilities for that.
* moreover, based on error analysis, I would improve my CV by stratifying each class and, in the future, by adding synthetic data, e.g. with:
    * noise, such as missing data
    * feature exchange between samples that clustered differently in error analysis
    * completely synthetic data, created from average profiles in each stratum and random noise
    

In [27]:
# function, ...........................................................
def load_dataset_dct(dataset_name, path, rand_nr=0, none_at=None, verbose=False):
    """Load one pickled dataset variant and return the train/valid/test splits.

    The file ``{dataset_name}_v{rand_nr}__transf_data_dct.p`` is looked up
    inside ``path`` and unpickled. For each split, the entries under
    ``"tpm_data"`` and ``"covariants_data"`` are combined with ``merge_data``;
    targets are read from ``"target_data"``. The ``"test0"`` split is returned
    as the validation set and ``"test1"`` as the test set (no targets are
    returned for ``"test1"``).

    The working directory of the process is left untouched, so ``path`` may be
    relative and the function can be called repeatedly. The companion
    ``__qc_reports_dct.p`` file is not read, because nothing from it is
    returned.

    Parameters
    ----------
    dataset_name : str
        Prefix of the dataset file name (glob patterns are honoured).
    path : str
        Directory that holds the pickled dataset files.
    rand_nr : int, default 0
        Dataset version number used in the ``_v{rand_nr}`` part of the name.
    none_at : optional
        Forwarded unchanged to ``merge_data``.
    verbose : bool, default False
        If truthy, print the matched file name(s); also forwarded to
        ``merge_data``.

    Returns
    -------
    tuple
        ``(x_train, x_valid, x_test, y_train, y_valid)``.

    Raises
    ------
    FileNotFoundError
        If no dataset file matches the expected name inside ``path``.
    """
    # Search inside `path` rather than os.chdir(path): changing the working
    # directory leaks out of the function and breaks a second call made with
    # a relative path.
    file_pattern = os.path.join(
        path, f"{dataset_name}_v{rand_nr}__transf_data_dct.p")
    dataset_to_load = sorted(glob.glob(file_pattern))
    if not dataset_to_load:
        raise FileNotFoundError(
            f"no dataset file found matching {file_pattern!r}")

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on files produced by the project's own pipeline.
    with open(dataset_to_load[0], 'rb') as file:
        dataset_dct = pickle.load(file)

    # extract & combine the data ..................................
    if verbose:
        # bare file names, as printed before the directory was part of the path
        print([os.path.basename(found) for found in dataset_to_load])

    tpm = dataset_dct["tpm_data"]
    covariants = dataset_dct["covariants_data"]
    targets = dataset_dct['target_data']

    x_train = merge_data(tpm["train"], covariants["train"], none_at=none_at, verbose=verbose)
    y_train = targets["train"]
    x_valid = merge_data(tpm["test0"], covariants["test0"], none_at=none_at, verbose=verbose)
    y_valid = targets["test0"]
    x_test = merge_data(tpm["test1"], covariants["test1"], none_at=none_at, verbose=verbose)

    return x_train, x_valid, x_test, y_train, y_valid
    

    

## __(A) Knn model__

In [35]:
# Settings for the k-nearest-neighbours experiment.
model_name = "knn"
model = KNeighborsClassifier()
path = PATH_data_interim   # directory holding the pickled dataset variants
none_at = None
verbose = True

# 6 dataset variants, each prepared in a different way
dataset_name_list = [
    "P17_G100", "P17_G2000", "P17_G2000_PCA",
    "P17_G100_LOG", "P17_G2000_LOG", "P17_G2000_LOG_PCA",
]

# 3 dataset versions (random splits) per variant, used for cv
rand_nr_list = [0, 1, 2]
rand_nr = 2  # rebound by any loop that iterates over rand_nr_list

param_grid = ParameterGrid(dict(
    n_neighbors=list(range(2, 20, 2)),  # k = 2, 4, ..., 18; k == 1 is not used, higher values were not useful
    weights=['uniform', 'distance'],    # weighting function
    p=[2],                              # only p = 2 is searched here
))

In [33]:
#clf = LogisticRegression(solver='liblinear') # the solver must be specified, otherwise it wil raise an error, 
#clf = RandomForestClassifier(max_depth=10, n_estimators=500)  

In [40]:
# Grid search: fit `model` with every parameter set in `param_grid` on every
# (dataset variant, dataset version) pair and report train/valid accuracy.
for dataset_name in dataset_name_list:
    for rand_nr in rand_nr_list:
        # Each dataset and rand_nr is handled separately. model_ID is the
        # position of a parameter set in param_grid, so models sharing an ID
        # used the same parameters and can be averaged across datasets.

        # result containers (nothing is appended to them yet in this cell)
        test_true_values = []
        test_predictions = []
        valid_predicitons = []
        model_stats = []  # roc auc, acc, spec, and sensitivity
        model_params = []
        model_ID = -1  # stays -1 if param_grid is empty

        # The data depend only on (dataset_name, rand_nr), so they are read
        # from disk once here instead of once per parameter set.
        x_train, x_valid, x_test, y_train, y_valid = load_dataset_dct(
            dataset_name, path, rand_nr, none_at=none_at)
        file_name = f'{model_name}__{dataset_name}_v{rand_nr}'

        # loop over params
        for model_ID, params in enumerate(param_grid):
            # the same estimator object is re-configured and re-fitted
            model.set_params(**params)
            model.fit(x_train, y_train)

            # STATS
            acc_train = model.score(x_train, y_train)
            acc_valid = model.score(x_valid, y_valid)

            # print accuracy ("Test" in the message is the validation split)
            if verbose:
                print(f'{file_name} - {params} - Acc: Train: {np.round(acc_train,2)}, Test {np.round(acc_valid,2)}')

            # 
        
        
        
        
        #report = classification_report(y_true=y_valid, y_pred=model.predict(x_valid))
        #print(report)

        #model

knn__P17_G100_v0 - {'n_neighbors': 2, 'p': 2, 'weights': 'uniform'} - Acc: Train: 0.84, Test 0.72
knn__P17_G100_v0 - {'n_neighbors': 2, 'p': 2, 'weights': 'distance'} - Acc: Train: 1.0, Test 0.64
knn__P17_G100_v0 - {'n_neighbors': 4, 'p': 2, 'weights': 'uniform'} - Acc: Train: 0.83, Test 0.7
knn__P17_G100_v0 - {'n_neighbors': 4, 'p': 2, 'weights': 'distance'} - Acc: Train: 1.0, Test 0.66
knn__P17_G100_v0 - {'n_neighbors': 6, 'p': 2, 'weights': 'uniform'} - Acc: Train: 0.83, Test 0.7
knn__P17_G100_v0 - {'n_neighbors': 6, 'p': 2, 'weights': 'distance'} - Acc: Train: 1.0, Test 0.66
knn__P17_G100_v0 - {'n_neighbors': 8, 'p': 2, 'weights': 'uniform'} - Acc: Train: 0.83, Test 0.74
knn__P17_G100_v0 - {'n_neighbors': 8, 'p': 2, 'weights': 'distance'} - Acc: Train: 1.0, Test 0.68
knn__P17_G100_v0 - {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'} - Acc: Train: 0.79, Test 0.74
knn__P17_G100_v0 - {'n_neighbors': 10, 'p': 2, 'weights': 'distance'} - Acc: Train: 1.0, Test 0.7
knn__P17_G100_v0 - {'

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# train model ..................................
# Only the random forest is built: the LogisticRegression(solver='liblinear')
# that used to be assigned to `model` first was overwritten before being
# fitted. Swap the line below to try logistic regression instead.
# NOTE(review): x_train / y_train / x_valid / y_valid are taken from the
# notebook namespace - presumably the last dataset loaded; confirm.
model = RandomForestClassifier(max_depth=10, n_estimators=500)

# Fit it to train data
model.fit(x_train, y_train)

# print accuracy ("Test" here is measured on the validation split)
print(f'Train Accuracy: {model.score(x_train, y_train)}')
print(f'Test Accuracy: {model.score(x_valid, y_valid)}')

# per-class precision / recall / f1 on the validation split
report = classification_report(y_true=y_valid, y_pred=model.predict(x_valid))
print(report)

# last expression of the cell: displays the fitted estimator in the notebook
model

Train Accuracy: 1.0
Test Accuracy: 0.8
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        32
           1       0.00      0.00      0.00         8

    accuracy                           0.80        40
   macro avg       0.40      0.50      0.44        40
weighted avg       0.64      0.80      0.71        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier(max_depth=10, n_estimators=500)

In [None]:

# parameters used to train scikit-learn and CNN transfer-learning models
MODEL_PARAMETERS_GRID = {
    
    "knn" : ParameterGrid([
        {     
            'method_group':   ["sklearn_models"],   # for all classical models, 
            'method_variant': ["no_pca"],       # eg: SVM has linear or rbf, or typically pca or not, (nothing)
            'n_neighbors':    list(range(2,10,2)),    # I do not use k==1, ! # I tried higher values and they were not very usefull
            'weights':        ['uniform','distance'], # Weighting function
            'p':              [2],                  # L1 and L2 distance metrics
            'pca':            [0],                  # is 0, no PCA applied, 
            'random_state_nr':[0],                  # in the list
            'pc': [['n_neighbors', 'weights', 'p']] # parameter names in the grid for classifier, (list in list)
        },
        {
            'method_group':   ["sklearn_models"],   # for all classical models, 
            'method_variant': ["pca"],          # eg: SVM has linear or rbf, or typically pca or not, (nothing)
            'n_neighbors':    list(range(2,10,2)),                  # I do not use k==1, ! # I tried higher values and they were not very usefull
            'weights':        ['uniform','distance'],          # Weighting function
            'p':              [2],                  # L1 and L2 distance metrics
            'pca':            [250],                # is 0, no PCA applied, 
            'random_state_nr':[0],                  # in the list
            'pc': [['n_neighbors', 'weights', 'p']] # parameter names in the grid for classifier, (list in list)  
        }
        ]), 
    
    
    'random_forest': ParameterGrid([
        {
            'method_group': ["sklearn_models"],     # for all classical models, 
            'method_variant': ["no_pca"],           # eg: SVM has linear or rbf, or typically pca or not, (nothing)
            'random_state_nr':[0],
            'max_depth':[4,5,6],
            'n_estimators': [10,25,50,100,150, 200],
            'class_weight': ['balanced'],
            'pca':[0],
            'pc':[['n_estimators', 'max_depth', 'class_weight']]
        },
        {
            'method_group': ["sklearn"],           # for all classical models, 
            'method_variant': ["pca"],           # eg: SVM has linear or rbf, or typically pca or not, (nothing)
            'random_state_nr':[0],
            'max_depth':[4,5,6],
            'n_estimators': [10,25,50,100,150, 200],
            'class_weight': ['balanced'],
            'pca':[30, 200],
            'pc':[['n_estimators', 'max_depth', 'class_weight']]
        }
       ]),