In [1]:
!pip install lightgbm
!pip install shap

Collecting lightgbm
  Using cached https://files.pythonhosted.org/packages/78/7e/bc87e7951cfaa998cffaf39e6c721f5bd04efb2e139486206356edb289a5/lightgbm-2.2.1-py2.py3-none-manylinux1_x86_64.whl
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.1
Collecting shap
Collecting tqdm (from shap)
  Using cached https://files.pythonhosted.org/packages/91/55/8cb23a97301b177e9c8e3226dba45bb454411de2cbd25746763267f226c2/tqdm-4.28.1-py2.py3-none-any.whl
Installing collected packages: tqdm, shap
Successfully installed shap-0.24.0 tqdm-4.28.1


In [2]:
# Imports
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') 

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier


In [3]:
files_to_open = ['logistic_random_results', 'logistic_hyperopt_results', 'Random_Forest_random_results', 
                 'rf_hyperopt_results', 'lgb_random_results','lgb_hyperopt_results']

In [4]:
def df_open(f):
  
  data_frame = pd.read_csv(f)
  if 'hyperopt' in f and 'auc' not in data_frame.columns.values:
      data_frame['auc'] = 1- data_frame['loss']
      data_frame = data_frame.sort_values('auc', ascending = False)
  elif 'random' in f:
      data_frame = data_frame.sort_values('mean_test_score', ascending = False)
      
  return data_frame
  

In [5]:
#Dictionary of model results

def foo(files_to_load = []):
  results_dict = {}
  best_models_dict = {}
  for f in files_to_load:
    m = f.replace('results', 'clf')
    results_dict[f] = df_open(f)
    best_params = ast.literal_eval(results_dict[f]['params'][0])
    if 'logistic' in f:
      #Fitting with params which give highest score
      best_models_dict[m] = SGDClassifier(**best_params)
    elif 'Random_Forest' in f or 'rf_' in f:
      best_models_dict[m] = RandomForestClassifier(random_state=0, **best_params)
    elif 'lgb' in f:
      best_models_dict[m] = lgb.LGBMClassifier(**best_params)
      
  return results_dict, best_models_dict

In [6]:
results_dict, best_models_dict = foo(files_to_load=files_to_open)

## Table building

In [7]:
resampled_df = pd.read_csv('resampled_df_40')

In [8]:
resampled_df = resampled_df.drop(resampled_df.columns[0], axis = 1)
response='TARGET'
predictors= [col for col in resampled_df.columns if col != response]
X_resampled = resampled_df[predictors]
y_resampled = resampled_df[response]

In [11]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import StratifiedKFold

In [12]:
def true_neg_rate(y_true, y_pred):
  c_mat = confusion_matrix(y_true, y_pred)
  return c_mat[0,0]/(c_mat[0,0] + c_mat[0,1])

c_mat = {'TNR': make_scorer(true_neg_rate), 'TPR':'recall', 'Acc': 'accuracy', 'AUC':'roc_auc'}

In [12]:
best_models_dict.keys()

dict_keys(['lgb_random_clf', 'rf_hyperopt_clf', 'lgb_hyperopt_clf', 'logistic_hyperopt_clf', 'logistic_random_clf', 'Random_Forest_random_clf'])

In [13]:
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [19]:
cross_validate(best_models_dict['lgb_hyperopt_clf'], X_resampled, y_resampled, cv = 2, scoring=c_mat)

{'fit_time': array([4.83265996, 4.68605494]),
 'score_time': array([4.46168804, 4.2042129 ]),
 'test_AUC': array([0.77488398, 0.75473587]),
 'test_Acc': array([0.70648532, 0.69420117]),
 'test_TNR': array([0.70629371, 0.71159486]),
 'test_TPR': array([0.70675904, 0.66935224]),
 'train_AUC': array([0.82524044, 0.83907832]),
 'train_Acc': array([0.7461518 , 0.76015923]),
 'train_TNR': array([0.74515001, 0.75738777]),
 'train_TPR': array([0.74758298, 0.76411826])}

In [15]:
rows = best_models_dict
columns = ['TNR', 'TPR', 'Accuracy', 'AUC', 'Loss']
results = pd.DataFrame(0.0, columns=columns, index=rows.keys()) 
kf = StratifiedKFold(n_splits = 3)

for m in rows.keys():
    m_scores = cross_validate(rows[m], X_resampled, y_resampled, cv = kf, scoring=c_mat, return_train_score= False, verbose=True)
    results.loc[m,'TNR'] = np.mean(m_scores['test_TNR'])
    results.loc[m, 'TPR'] = np.mean(m_scores['test_TPR'])
    results.loc[m, 'Accuracy'] = np.mean(m_scores['test_Acc'])
    results.loc[m, 'AUC'] = np.mean(m_scores['test_AUC'])
    results.loc[m, 'Loss'] = -(1-results.loc[m,'TNR'])*2417 -(1-results.loc[m, 'TPR'])*1124
    
results = results.round(3)
results.to_csv('results_all_models_ex_NN.csv')

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.5s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.7s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.2s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.8s finished


In [16]:
results

Unnamed: 0,TNR,TPR,Accuracy,AUC,Loss
lgb_hyperopt_clf,0.709,0.694,0.703,0.767,-1046.996
lgb_random_clf,0.81,0.562,0.708,0.766,-949.994
rf_hyperopt_clf,0.947,0.141,0.615,0.69,-1093.273
logistic_random_clf,0.745,0.506,0.646,0.683,-1173.131
Random_Forest_random_clf,0.707,0.555,0.644,0.685,-1208.153
logistic_hyperopt_clf,0.752,0.506,0.651,0.685,-1154.393


In [24]:
NN_result_random = pd.read_csv('NN_results.csv', index_col=0)
combined_results = results.append(NN_result_random)

In [25]:
combined_results.to_csv('combined_results.csv')

Unnamed: 0,TNR,TPR,Accuracy,AUC,Loss
lgb_hyperopt_clf,0.709,0.694,0.703,0.767,-1046.996
lgb_random_clf,0.81,0.562,0.708,0.766,-949.994
rf_hyperopt_clf,0.947,0.141,0.615,0.69,-1093.273
logistic_random_clf,0.745,0.506,0.646,0.683,-1173.131
Random_Forest_random_clf,0.707,0.555,0.644,0.685,-1208.153
logistic_hyperopt_clf,0.752,0.506,0.651,0.685,-1154.393
NN_Random_clf,0.802,0.53,0.69,0.741,-1007.934
