<a href="https://colab.research.google.com/github/SohilaOsama/ChatBot/blob/main/code_4a_logreg_regz_search_oversample_decision_boundary_alpha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Section 1: Import

In [54]:
import pandas as pd
import numpy as np
import copy

from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter

import statsmodels.api as sm

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

Section 2: Read, drop columns, form X_train and y_train

In [55]:
# ---------- read ----------
# Load the training split produced by the previous notebook.

df_1 = pd.read_csv('/content/code_3_train.csv')

# ---------- drop ----------
# Remove the stored row index and the eight feature columns that are not
# used for modelling; the shape is printed before and after as a check.

print('Before drop :', df_1.shape)
df_1 = df_1.drop(
    columns=[
        'index',
        'pco2',
        'ph',
        'basophils',
        'lactic_acid',
        'bmi',
        'creatine_kinase',
        'lymphocyte',
        'neutrophils',
    ]
)
print('After drop:', df_1.shape)
print('')

# ---------- form X ----------
# Every remaining column except the target is a feature.

X_train = df_1.drop(columns='outcome')
print('X_train :', X_train.shape)

# ---------- form y ----------
# The target column, followed by three views of its class balance.

y_train = df_1.loc[:, 'outcome']
print('y_train :', y_train.shape)
print('')
print('y_train :', np.unique(y_train, return_counts=True))
print('y_train :', Counter(y_train))
print(y_train.value_counts(normalize=True))

Before drop : (882, 51)
After drop: (882, 42)

X_train : (882, 41)
y_train : (882,)

y_train : (array([0., 1.]), array([763, 119]))
y_train : Counter({0.0: 763, 1.0: 119})
0.0    0.865079
1.0    0.134921
Name: outcome, dtype: float64


In [56]:
# ---------- this function is to retain the column names of dataframe after missing values imputation ----------

def impute_fit_transform(to_impute, to_impute_fit_transform):
    """Fit an imputer on a DataFrame and return the imputed data as a DataFrame.

    Parameters
    ----------
    to_impute : pandas.DataFrame
        Data to fit on and impute; only its ``columns`` are reused for the result.
    to_impute_fit_transform : object
        Imputer exposing ``fit_transform`` (it is fitted by this call).

    Returns
    -------
    tuple
        ``(imputed, imputer)`` where ``imputed`` is a new DataFrame carrying the
        column names of ``to_impute`` and ``imputer`` is the same object that
        was passed in, now fitted.
    """
    imputed_values = to_impute_fit_transform.fit_transform(to_impute)
    # Re-attach the column labels that the transformer output does not carry.
    imputed_frame = pd.DataFrame(imputed_values, columns=to_impute.columns)
    return imputed_frame.copy(deep=True), to_impute_fit_transform

def impute_transform(to_impute, to_impute_fit_transform):
    """Impute a DataFrame with an already-fitted imputer, keeping column names.

    Parameters
    ----------
    to_impute : pandas.DataFrame
        Data to impute; only its ``columns`` are reused for the result.
    to_impute_fit_transform : object
        Fitted imputer exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame holding the transformed values under the column names
        of ``to_impute``.
    """
    imputed_values = to_impute_fit_transform.transform(to_impute)
    # Re-attach the column labels that the transformer output does not carry.
    imputed_frame = pd.DataFrame(imputed_values, columns=to_impute.columns)
    return imputed_frame.copy(deep=True)

Section 4: Function - Scale

In [57]:
# ---------- this function is to retain the column names of dataframe after scaling ----------

def scale_fit_transform(to_scale, to_scale_fit_transform):
    """Fit a scaler on a DataFrame and return the scaled data as a DataFrame.

    Parameters
    ----------
    to_scale : pandas.DataFrame
        Data to fit on and scale; only its ``columns`` are reused for the result.
    to_scale_fit_transform : object
        Scaler exposing ``fit_transform`` (it is fitted by this call).

    Returns
    -------
    tuple
        ``(scaled, scaler)`` where ``scaled`` is a new DataFrame carrying the
        column names of ``to_scale`` and ``scaler`` is the same object that was
        passed in, now fitted.
    """
    scaled_values = to_scale_fit_transform.fit_transform(to_scale)
    # Re-attach the column labels that the transformer output does not carry.
    scaled_frame = pd.DataFrame(scaled_values, columns=to_scale.columns)
    return scaled_frame.copy(deep=True), to_scale_fit_transform

def scale_transform(to_scale, to_scale_fit_transform):
    """Scale a DataFrame with an already-fitted scaler, keeping column names.

    Parameters
    ----------
    to_scale : pandas.DataFrame
        Data to scale; only its ``columns`` are reused for the result.
    to_scale_fit_transform : object
        Fitted scaler exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame holding the transformed values under the column names
        of ``to_scale``.
    """
    scaled_values = to_scale_fit_transform.transform(to_scale)
    # Re-attach the column labels that the transformer output does not carry.
    scaled_frame = pd.DataFrame(scaled_values, columns=to_scale.columns)
    return scaled_frame.copy(deep=True)

Section 5: Manual search and cross validate

In [58]:
# ---------- set 15-fold cross validation ----------
# Stratified splitter: shuffled, with a fixed seed so that the fold
# assignment is the same on every run.

cross_validate = StratifiedKFold(
    n_splits=15,
    shuffle=True,
    random_state=42,
)

In [59]:
# ---------- set parameters and hyperparameters ----------
# Candidate values for the two discrete search dimensions.

search_parameters = dict(
    oversample=[1.00, 0.85, 0.70],
    decision_boundary=[0.50, 0.35, 0.20],
)

# Range of regularization strengths: start, stop and number of points
# (consumed by np.linspace in the search loop).
regular_alpha_from, regular_alpha_to, regular_alpha_count = 0, 100, 11

In [60]:
# ---------- initialise result storages ----------
# One summary list per metric (filled with one row per searched combination) ...

(combine_train_validate_accuracy,
 combine_train_validate_recall,
 combine_train_validate_precision) = [], [], []

# ... and per-fold score lists for the training part ...
train_accuracy, train_recall, train_precision = [], [], []

# ... and for the validation part.
validate_accuracy, validate_recall, validate_precision = [], [], []

In [61]:
# ---------- manual search: oversample x decision boundary x alpha, 15-fold CV ----------
#
# Every preprocessing, fitting and scoring step runs INSIDE the cross
# validation loop, so each (oversample, decision boundary, alpha) combination
# is scored on all folds and contributes exactly one row to each summary list:
#   [oversample, decision_boundary, alpha,
#    train mean, train std, validate mean, validate std]


def _score_predictions(y_true, y_predicted):
    """Return ``(accuracy, recall, precision)`` for one set of predictions."""
    return (accuracy_score(y_true, y_predicted),
            recall_score(y_true, y_predicted),
            # A low-capacity model at a high decision boundary may predict no
            # positives at all; score that as 0 instead of emitting a warning.
            precision_score(y_true, y_predicted, zero_division=0))


def _fit_and_score_fold(X_fold_train, y_fold_train, X_fold_validate, y_fold_validate,
                        oversample, decision_boundary, alpha):
    """Fit the pipeline on one training fold and score it on train and validate.

    Steps: KNN-impute -> SMOTE (training fold only) -> standard-scale ->
    L1-regularized logistic regression. The imputer and scaler are fitted on
    the training fold only and merely applied to the validation fold.

    Parameters
    ----------
    X_fold_train, X_fold_validate : pandas.DataFrame
        Feature rows of the training / validation part of the fold.
    y_fold_train, y_fold_validate : pandas.Series
        Matching target values.
    oversample : float
        ``sampling_strategy`` passed to SMOTE.
    decision_boundary : float
        Predicted probabilities >= this value are labelled 1.
    alpha : float
        L1 penalty weight passed to ``fit_regularized``.

    Returns
    -------
    tuple
        ``(train_scores, validate_scores)``, each an
        ``(accuracy, recall, precision)`` tuple.
    """
    # ---------- impute on train ----------
    X_fold_train_impute, knn_impute_fitted = impute_fit_transform(
        X_fold_train, KNNImputer(n_neighbors=5))

    # ---------- oversample on train ----------
    smote = SMOTE(random_state=42, sampling_strategy=oversample)
    X_over, y_over = smote.fit_resample(X_fold_train_impute, y_fold_train.to_numpy())

    # ---------- scale on train ----------
    X_over_scale, ss_fitted = scale_fit_transform(X_over, StandardScaler())

    # ---------- instantiate and fit regularized on train ----------
    # has_constant='add' forces the intercept column; the default ('skip')
    # omits it whenever the data already holds a non-zero constant column.
    # NOTE(review): a scalar alpha is applied to every coefficient, which per
    # the statsmodels docs includes the intercept — confirm this is intended.
    lr_result = sm.Logit(
        y_over, sm.add_constant(X_over_scale, has_constant='add')
    ).fit_regularized(maxiter=500, method='l1', alpha=alpha, disp=False)

    def _predict_labels(X_imputed):
        # Scale with the training-fold scaler, add the intercept, threshold.
        X_design = sm.add_constant(scale_transform(X_imputed, ss_fitted),
                                   has_constant='add')
        return (lr_result.predict(X_design) >= decision_boundary).astype(int)

    # ---------- predict and evaluate on train (original, non-oversampled rows) ----------
    # The imputed training fold from the fit step is reused instead of running
    # the KNN imputer over the same rows a second time.
    train_scores = _score_predictions(y_fold_train, _predict_labels(X_fold_train_impute))

    # ---------- impute, scale, predict and evaluate on validate ----------
    X_fold_validate_impute = impute_transform(X_fold_validate, knn_impute_fitted)
    validate_scores = _score_predictions(y_fold_validate,
                                         _predict_labels(X_fold_validate_impute))

    return train_scores, validate_scores


# Start from empty summaries so that re-running this cell never appends to
# results left over from an earlier run.
combine_train_validate_accuracy = []
combine_train_validate_recall = []
combine_train_validate_precision = []

# ---------- loop for oversample SMOTE ----------
for oversample in search_parameters['oversample']:
    # ---------- loop for decision boundary ----------
    for decision_boundary in search_parameters['decision_boundary']:
        # ---------- loop for regularization strength alpha ----------
        for alpha_log_reg in np.linspace(regular_alpha_from, regular_alpha_to, regular_alpha_count):
            # Per-fold scores of the current combination only.
            train_accuracy, train_recall, train_precision = [], [], []
            validate_accuracy, validate_recall, validate_precision = [], [], []

            # ---------- loop for cross validation ----------
            for train_index, validate_index in cross_validate.split(X_train, y_train):
                # split() yields positions, so index positionally on X and y alike.
                train_scores, validate_scores = _fit_and_score_fold(
                    X_train.iloc[train_index, :], y_train.iloc[train_index],
                    X_train.iloc[validate_index, :], y_train.iloc[validate_index],
                    oversample, decision_boundary, alpha_log_reg)

                train_accuracy.append(train_scores[0])
                train_recall.append(train_scores[1])
                train_precision.append(train_scores[2])

                validate_accuracy.append(validate_scores[0])
                validate_recall.append(validate_scores[1])
                validate_precision.append(validate_scores[2])

            # ---------- aggregate the folds of this combination ----------
            setting = [oversample, decision_boundary, alpha_log_reg]

            combine_train_validate_accuracy.append(
                setting + [np.mean(train_accuracy), np.std(train_accuracy),
                           np.mean(validate_accuracy), np.std(validate_accuracy)])

            combine_train_validate_recall.append(
                setting + [np.mean(train_recall), np.std(train_recall),
                           np.mean(validate_recall), np.std(validate_recall)])

            combine_train_validate_precision.append(
                setting + [np.mean(train_precision), np.std(train_precision),
                           np.mean(validate_precision), np.std(validate_precision)])

            # One progress line per combination (not per fold) keeps the cell
            # output short enough to stay readable.
            print('Oversample :', oversample,
                  '| Decision boundary :', decision_boundary,
                  '| alpha =', alpha_log_reg,
                  '| validate recall mean =', np.mean(validate_recall))

Section 6: Save results

In [74]:
# Turn the accumulated accuracy rows (one per searched combination) into a
# labelled table.
combine_train_validate_accuracy = pd.DataFrame(
    combine_train_validate_accuracy,
    columns=['oversample',
             'decision_boundary',
             'alpha',
             'train_accuracy_mean',
             'train_accuracy_std_dev',
             'validate_accuracy_mean',
             'validate_accuracy_std_dev'])

# Relative train -> validate gap in percent of the training score, computed
# the same way as in the recall and precision tables. The whole expression
# must stay inside one statement: when the division and scaling sat on a line
# of their own they were evaluated and discarded, leaving the raw difference.
combine_train_validate_accuracy['overfit'] = (
    (combine_train_validate_accuracy['validate_accuracy_mean']
     - combine_train_validate_accuracy['train_accuracy_mean'])
    / combine_train_validate_accuracy['train_accuracy_mean'] * 100)

# Persist the table; missing values are written as the literal 'NaN' and the
# row index is stored under the column name 'index'.
combine_train_validate_accuracy.to_csv('/content/code_4_logreg_regz_train_validate_accuracy.csv', na_rep='NaN', index_label='index')

In [76]:
# Turn the accumulated recall rows (one per searched combination) into a
# labelled table.
combine_train_validate_recall = pd.DataFrame(
    data=combine_train_validate_recall,
    columns=[
        'oversample',
        'decision_boundary',
        'alpha',
        'train_recall_mean',
        'train_recall_std_dev',
        'validate_recall_mean',
        'validate_recall_std_dev',
    ],
)

# Relative train -> validate gap, in percent of the training score.
combine_train_validate_recall['overfit'] = (
    combine_train_validate_recall['validate_recall_mean']
    .sub(combine_train_validate_recall['train_recall_mean'])
    .div(combine_train_validate_recall['train_recall_mean'])
    .mul(100)
)

# Persist the table; missing values are written as the literal 'NaN' and the
# row index is stored under the column name 'index'.
combine_train_validate_recall.to_csv(
    '/content/code_4_logreg_regz_train_validate_recall.csv',
    na_rep='NaN',
    index_label='index',
)

In [77]:
# Turn the accumulated precision rows (one per searched combination) into a
# labelled table.
combine_train_validate_precision = pd.DataFrame(
    data=combine_train_validate_precision,
    columns=[
        'oversample',
        'decision_boundary',
        'alpha',
        'train_precision_mean',
        'train_precision_std_dev',
        'validate_precision_mean',
        'validate_precision_std_dev',
    ],
)

# Relative train -> validate gap, in percent of the training score.
combine_train_validate_precision['overfit'] = (
    combine_train_validate_precision['validate_precision_mean']
    .sub(combine_train_validate_precision['train_precision_mean'])
    .div(combine_train_validate_precision['train_precision_mean'])
    .mul(100)
)

# Persist the table; missing values are written as the literal 'NaN' and the
# row index is stored under the column name 'index'.
combine_train_validate_precision.to_csv(
    '/content/code_4_logreg_regz_train_validate_precision.csv',
    na_rep='NaN',
    index_label='index',
)