# Module and Data Importation

In [1]:
import pandas as pd
import pickle
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split


## Utilities

In [2]:
import time
import matplotlib.pyplot as plt

## Modeling Tools

In [3]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.utils import resample
from bayes_opt import BayesianOptimization

# Load our Xs and Ys

In [4]:
X_train = pickle.load(open("X_train", 'rb'))
X_test = pickle.load(open("X_test", 'rb'))
y_train = pickle.load(open("y_train", 'rb'))
y_test = pickle.load(open("y_test","rb"))

### Helper Funcitons

In [5]:
def timing_buddy(model_to_fit, dictionary_string, dicto):
    funky_time_start = time.time()
    model_to_fit.fit(X_train, np.ravel(y_train))
    funky_time_stop = time.time()
    funky_train_time = funky_time_stop - funky_time_start
    dicto[dictionary_string]['training_time'] = funky_train_time
    

In [6]:
def confused_buddy(model_to_confuse, dictionary_string,dicto):
    confuse = confusion_matrix(y_test, model_to_confuse.predict(X_test))
    dicto[dictionary_string]['confuse'] = confuse

In [7]:
def recall_calculator(confuse):
    recall = 0
    tp = confuse[1][1]
    fn = confuse[1][0]
    if tp > 0 or tp ==1:
        recall = tp / (tp+fn)
    return recall

In [8]:
def precision_calculator(confuse):
    precision = 0
    print(confuse)
    tp = confuse[1][1]
    fp = confuse[0][1]
    if tp > 0 or tp ==1:
        precision = tp / (tp+fp)
    return precision

## Dummy

In [9]:
def classy_dummy(dicto, X_train, y_train, X_test, y_test):
    print('dumbo')
    dummy_clf = DummyClassifier(strategy="stratified")
    timing_buddy(dummy_clf, 'dummy', dicto)
    confused_buddy(dummy_clf, 'dummy', dicto)
    confused_buddy(dummy_clf, 'dummy', dicto)
    dicto['dummy']['ROC_AUC_Score'] = roc_auc_score(y_test, dummy_clf.predict_proba(X_test)[:, 1])
    

## SVC

In [10]:
def SVCmodeler(dicto, X_train, y_train, X_test, y_test):
    print('Support Vector Classification')
    SupportVectorClassifier = SVC()
    timing_buddy(SupportVectorClassifier, 'SVC', dicto)
    confused_buddy(SupportVectorClassifier, 'SVC', dicto)
    dicto['SVC']['ROC_AUC_Score'] = roc_auc_score(y_test, SupportVectorClassifier.decision_function(X_test))


## Stochastic Gradiant Descent 

In [11]:
def grid_sgd_model(dicto, X_train, y_train, X_test, y_test):
    
    def SGDHyperParameterer(loss_penalty_list):
        SGD_clf = SGDClassifier(loss = loss_penalty_list[0], penalty=loss_penalty_list[1])
        SGD_clf.fit(X_train, np.ravel(y_train))
        confuse = confusion_matrix(y_test, SGD_clf.predict(X_test))
        recall = recall_calculator(confuse)
        return recall
    
    
    losses= ['modified_huber', 'log']
    penalties = ['l2', 'l1', 'elasticnet']
    param_grid = {}
    count = 0
    for each_loss in losses:
        for each_penalty in penalties:
            param_grid[str(each_loss) + '-' + str(each_penalty)] = [each_loss, each_penalty]
            count+=1

    
    ###
    SGD_Hyper_param_recall_scores = {}
    for key, value in param_grid.items():
        SGD_Hyper_param_recall_scores[key] = SGDHyperParameterer(value)

    ###
    
    print('Stochastic Gradiant Descent')
    best_recall = 0
    penalty = ''
    loss = ''
    for key, value in SGD_Hyper_param_recall_scores.items():
        if value > best_recall:
            best_recall = value
            loss, penalty = key.split('-')
            
        
                                                                 
    
    ###
    SGD_clf = SGDClassifier(loss = loss, penalty= penalty)
    timing_buddy(SGD_clf, 'SGDClassfier', dicto)
    confused_buddy(SGD_clf, 'SGDClassfier', dicto)
    dicto['SGDClassfier']['ROC_AUC_Score'] = roc_auc_score(y_test, SGD_clf.decision_function(X_test))
    dicto['SGDClassfier']['Hyper_Params'] = {'Loss': loss, 'penalty':penalty}

## Random Forest Model

In [12]:
def bayes_rf_model(dicto, X_train, y_train, X_test, y_test):
    
    def bayes_RFModeler_sqrt(n_estimators, 
                             max_depth,
                             min_samples_split,
                             min_samples_leaf):
        RandoForest = RandomForestClassifier(n_estimators=int(n_estimators), max_features='sqrt', max_depth=int(max_depth),
                                             min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf))
        RandoForest.fit(X_train, np.ravel(y_train))
        confuse = confusion_matrix(y_test, RandoForest.predict(X_test))
        recall = recall_calculator(confuse)
        return recall

    
    ###
    param_dicts = { 'n_estimators' : (100, 2000),
              'max_depth' : (10,60),
              'min_samples_split': (2,10),
              'min_samples_leaf' : (1,5),
              }
    ###
    
    optimizer = BayesianOptimization(
    bayes_RFModeler_sqrt,
    pbounds=param_dicts,
    verbose=1)
    
    ######
    
    print('Random Forest')
    optimizer.maximize(init_points=15, n_iter=10)
    rf_params= optimizer.max
    rf_params = rf_params['params']
    
    ###
    RandoForest = RandomForestClassifier(n_estimators=int(rf_params['n_estimators']), 
                                         max_features='sqrt',
                                         max_depth=int(rf_params['max_depth']),
                                         min_samples_split=int(rf_params['min_samples_split']),
                                         min_samples_leaf=int(rf_params['min_samples_leaf']))
    
    timing_buddy(RandoForest, 'RandomForestClassifier', dicto)
    confused_buddy(RandoForest, 'RandomForestClassifier', dicto)
    dicto['RandomForestClassifier']['ROC_AUC_Score'] = roc_auc_score(y_test, RandoForest.predict_proba(X_test)[:, 1])
    dicto['RandomForestClassifier']['Hyper_Params'] = rf_params

## AdaBooast Classifier

In [13]:
def bayes_hyper_ada_boost_model(dicto, X_train, y_train, X_test, y_test):
    
    def AdaBoost_hyper(n_estimators, learning_rate):
        adaira_the_classifier = AdaBoostClassifier(learning_rate=learning_rate,  n_estimators=int(n_estimators))
        adaira_the_classifier.fit(X_train, np.ravel(y_train))
        confuse = confusion_matrix(y_test, adaira_the_classifier.predict(X_test))
        recall = recall_calculator(confuse)
        return recall

    
    ###
    ada_params = {'learning_rate' : (.1,2),
              'n_estimators':(10,500),}
    ###
    
    optimizer = BayesianOptimization(
    AdaBoost_hyper,
    pbounds=ada_params,
    verbose=1)
    
    ######
    
    print('AdaBoost')
    optimizer.maximize(init_points=15, n_iter=10)
    ada_boost_params= optimizer.max
    ada_boost_params = ada_boost_params['params']
    
    ###
    adaira_the_classifier = AdaBoostClassifier(learning_rate=ada_boost_params['learning_rate'],  n_estimators=int(ada_boost_params['n_estimators']))
    timing_buddy(adaira_the_classifier, 'AdaBoostClassifier', dicto)
    confused_buddy(adaira_the_classifier, 'AdaBoostClassifier', dicto)
    dicto['AdaBoostClassifier']['ROC_AUC_Score'] = roc_auc_score(y_test, adaira_the_classifier.predict_proba(X_test)[:, 1])
    dicto['AdaBoostClassifier']['Hyper_Params'] = ada_boost_params

## Gradiant Boosting Model

In [14]:
def bayes_hyper_grad_boosting_model(dicto, X_train, y_train, X_test, y_test):
    
    def GradBoost_hyper(n_estimators, learning_rate, max_leaf_nodes):
        grady_the_boosted = GradientBoostingClassifier(learning_rate=learning_rate,  n_estimators=int(n_estimators),
                                                      max_leaf_nodes=int(max_leaf_nodes))
        grady_the_boosted.fit(X_train, np.ravel(y_train))
        confuse = confusion_matrix(y_test, grady_the_boosted.predict(X_test))
        recall = recall_calculator(confuse)
        return recall
    
    ###
    GradBoost_params = {'learning_rate' : (.1,2),
                    'n_estimators':(10,500),
                    'max_leaf_nodes':(3,50)}
    ###
    
    optimizer = BayesianOptimization(
    GradBoost_hyper,
    pbounds=GradBoost_params,
    verbose=1)
    
    ######
    
    print('Gradient Boosting')
    optimizer.maximize(init_points=15, n_iter=10)
    grad_boost_params= optimizer.max
    grad_boost_params = grad_boost_params['params']
    
    ###
    grady_the_boosted = GradientBoostingClassifier(learning_rate=grad_boost_params['learning_rate'],
                                                   n_estimators=int(grad_boost_params['n_estimators']),
                                                   max_leaf_nodes=int(grad_boost_params['max_leaf_nodes']))
    timing_buddy(grady_the_boosted, 'GradientBoostingClassifier', dicto)
    confused_buddy(grady_the_boosted, 'GradientBoostingClassifier', dicto)
    dicto['GradientBoostingClassifier']['ROC_AUC_Score'] = roc_auc_score(y_test, grady_the_boosted.predict_proba(X_test)[:, 1])
    dicto['GradientBoostingClassifier']['Hyper_Params'] = grad_boost_params
    

## Model Evaluation

#precision = tp/(tp+fp)
maximize this for spam, don't wanna be hiding real emails

#recall= tp/(tp+fn) 
Maximize this for medical or security scenarios, don't wanna miss actual sicknesses

In [15]:
def medical_evaluator(dicto):
    bestmodel = ""
    training_time= 0
    recall = 0
    temp_recall = 0
    for key, value in dicto.items():
        
        confuse = dicto[key]['confuse']
        if type(confuse) != list:
            temp_recall = recall_calculator(confuse)

            if temp_recall > recall:

                recall = temp_recall
                bestmodel = key
                training_time = dicto[key]['training_time']
    return bestmodel, recall, training_time

In [16]:
def spam_evaluator(dicto):
    bestmodel = ""
    training_time= 0
    precision = 0
    temp_precision = 0.00000001
    for key, value in dicto.items():
        confuse = dicto[key]['confuse']
        if type(confuse) != list:
            temp_precision = precision_calculator(confuse)

            if temp_precision > precision:

                precision = temp_precision
                bestmodel = key
                training_time = dicto[key]['training_time']
    return bestmodel, precision, training_time

# Model Execution

### Aggregate Function

In [17]:
def chonky_model_aggregator(X_train, y_train, X_test, y_test):
    
    #Models to explore
    model_set = ['dummy','', 'SGDClassfier', 'RandomForestClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier']
    model_stats = {}
    for each in model_set:
        model_stats[each] ={'confuse' : [], 'training_time' : 0, 'ROC_AUC_Score': 0, 'Hyper_Params': {}}
    #model running
    classy_dummy(model_stats, X_train, y_train, X_test, y_test)
    #SVCmodeler(model_stats, X_train, y_train, X_test, y_test)
    grid_sgd_model(model_stats, X_train, y_train, X_test, y_test)
    
    #bayes_rf_model(model_stats, X_train, y_train, X_test, y_test)
    
    bayes_hyper_ada_boost_model(model_stats, X_train, y_train, X_test, y_test)
    bayes_hyper_grad_boosting_model(model_stats, X_train, y_train, X_test, y_test)
    

    return model_stats



### Stats Creation

#### Wide Stats

In [18]:
wide_data_model_stats = chonky_model_aggregator(X_train, y_train, X_test, y_test)

dumbo
Stochastic Gradiant Descent
AdaBoost
|   iter    |  target   | learni... | n_esti... |
-------------------------------------------------
Gradient Boosting
|   iter    |  target   | learni... | max_le... | n_esti... |
-------------------------------------------------------------
| [95m 2       [0m | [95m 0.9862  [0m | [95m 0.9449  [0m | [95m 24.46   [0m | [95m 86.76   [0m |
| [95m 4       [0m | [95m 0.9942  [0m | [95m 1.878   [0m | [95m 43.54   [0m | [95m 48.37   [0m |
| [95m 7       [0m | [95m 0.9943  [0m | [95m 0.3165  [0m | [95m 29.84   [0m | [95m 98.72   [0m |
| [95m 11      [0m | [95m 0.9999  [0m | [95m 0.1562  [0m | [95m 32.13   [0m | [95m 33.3    [0m |
| [95m 17      [0m | [95m 1.0     [0m | [95m 0.4163  [0m | [95m 3.978   [0m | [95m 10.08   [0m |


In [19]:
wide_data_model_stats

{'dummy': {'confuse': array([[ 292, 1650],
         [1640, 8918]], dtype=int64),
  'training_time': 0.003509521484375,
  'ROC_AUC_Score': 0.5036965150961518,
  'Hyper_Params': {}},
 'SVC': {'confuse': [],
  'training_time': 0,
  'ROC_AUC_Score': 0,
  'Hyper_Params': {}},
 'SGDClassfier': {'confuse': array([[    0,  1942],
         [    0, 10558]], dtype=int64),
  'training_time': 0.050966739654541016,
  'ROC_AUC_Score': 0.669619744517509,
  'Hyper_Params': {'Loss': 'modified_huber', 'penalty': 'l2'}},
 'RandomForestClassifier': {'confuse': [],
  'training_time': 0,
  'ROC_AUC_Score': 0,
  'Hyper_Params': {}},
 'AdaBoostClassifier': {'confuse': array([[    0,  1942],
         [    0, 10558]], dtype=int64),
  'training_time': 4.501019239425659,
  'ROC_AUC_Score': 0.6877135109109428,
  'Hyper_Params': {'learning_rate': 0.2786209262205007,
   'n_estimators': 274.02810984445074}},
 'GradientBoostingClassifier': {'confuse': array([[    0,  1942],
         [    0, 10558]], dtype=int64),
  'tr

In [28]:
from pprint import pprint
pprint(wide_data_model_stats)

{'AdaBoostClassifier': {'Hyper_Params': {'learning_rate': 0.2786209262205007,
                                         'n_estimators': 274.02810984445074},
                        'ROC_AUC_Score': 0.6877135109109428,
                        'confuse': array([[    0,  1942],
       [    0, 10558]], dtype=int64),
                        'training_time': 4.501019239425659},
 'GradientBoostingClassifier': {'Hyper_Params': {'learning_rate': 0.4163419878067033,
                                                 'max_leaf_nodes': 3.977698440727402,
                                                 'n_estimators': 10.075609959280012},
                                'ROC_AUC_Score': 0.6833459928765806,
                                'confuse': array([[    0,  1942],
       [    0, 10558]], dtype=int64),
                                'training_time': 0.22296905517578125},
 'SGDClassfier': {'Hyper_Params': {'Loss': 'modified_huber', 'penalty': 'l2'},
                  'ROC_AUC_Score': 0.66961974

In [24]:
del wide_data_model_stats['RandomForestClassifier']

In [29]:
del wide_data_model_stats['SVC']

# Stats Examination

In [30]:
print(medical_evaluator(wide_data_model_stats))

('SGDClassfier', 1.0, 0.050966739654541016)


In [31]:
time_dict= {}
for key, value in wide_data_model_stats.items():
    time_dict[key] = value['training_time']
    time_list = value['training_time']

time_frame = pd.Series(time_dict)
time_frame.sort_values()

dummy                         0.003510
SGDClassfier                  0.050967
GradientBoostingClassifier    0.222969
AdaBoostClassifier            4.501019
dtype: float64

In [32]:
confuse_dict= {}
for key, value in wide_data_model_stats.items():
    confuse_dict[key] = value['confuse']

confuse_frame = pd.Series(confuse_dict)

precision_dict= {}
recall_dict = {}
for i in confuse_frame.index:
    precision_dict[i] = precision_calculator(confuse_frame[i])        
    recall_dict[i] = recall_calculator(confuse_frame[i])        

precision_frame = pd.Series(precision_dict)


recall_frame = pd.Series(recall_dict)

recall_frame = dict(recall_frame)

[[ 292 1650]
 [1640 8918]]
[[    0  1942]
 [    0 10558]]
[[    0  1942]
 [    0 10558]]
[[    0  1942]
 [    0 10558]]


In [37]:
working_list = (recall_frame, precision_frame, time_frame)
list_of_three = pd.DataFrame(data=working_list, index=['Recall','Precision','Time_To_Train'])
list_of_three= list_of_three.transpose()
list_of_three.loc['GradientBoostingClassifier'] #recall
#list_of_three.loc['SVC'] #precision
list_of_three.loc['SGDClassfier'] #time
#list_of_three.sort_values('Precision', ascending=False)
list_of_three.sort_values('Time_To_Train')



Unnamed: 0,Recall,Precision,Time_To_Train
dummy,0.844668,0.843868,0.00351
SGDClassfier,1.0,0.84464,0.050967
GradientBoostingClassifier,1.0,0.84464,0.222969
AdaBoostClassifier,1.0,0.84464,4.501019


In [34]:
best_med_model, med_recall, med_time = medical_evaluator(wide_data_model_stats)
best_spam_model, spam_precision, spam_time = spam_evaluator(wide_data_model_stats)
print("The best medical model (Optimizing for recall) is " + best_med_model + " It's recall score was" + str(round(med_recall,3)) + " and it took " + str(round(med_time,2)) +" seconds to run.")
print("")
print("The best spam model (Optimizing for precision) is " + best_spam_model +
      " It's precision score was" +str(round(spam_precision, 3)) +" and it took " + str(round(spam_time, 2) ) +" seconds to run.")

[[ 292 1650]
 [1640 8918]]
[[    0  1942]
 [    0 10558]]
[[    0  1942]
 [    0 10558]]
[[    0  1942]
 [    0 10558]]
The best medical model (Optimizing for recall) is SGDClassfier It's recall score was1.0 and it took 0.05 seconds to run.

The best spam model (Optimizing for precision) is SGDClassfier It's precision score was0.845 and it took 0.05 seconds to run.


In [35]:
for key, value in wide_data_model_stats.items():
    if value['training_time'] !=0:
        print(key)
        print('')
        for key2, value2 in value.items():
            print(key2)
            if key2 == 'confuse':
                if type(value2) != list:
                    print('recall: ' + str(round(recall_calculator(value2),2)))
                    print('precision: ' +str(round(precision_calculator(value2),2)))
            print(value2)
            print('')
        print('')
        print('')


dummy

confuse
recall: 0.84
[[ 292 1650]
 [1640 8918]]
precision: 0.84
[[ 292 1650]
 [1640 8918]]

training_time
0.003509521484375

ROC_AUC_Score
0.5036965150961518

Hyper_Params
{}



SGDClassfier

confuse
recall: 1.0
[[    0  1942]
 [    0 10558]]
precision: 0.84
[[    0  1942]
 [    0 10558]]

training_time
0.050966739654541016

ROC_AUC_Score
0.669619744517509

Hyper_Params
{'Loss': 'modified_huber', 'penalty': 'l2'}



AdaBoostClassifier

confuse
recall: 1.0
[[    0  1942]
 [    0 10558]]
precision: 0.84
[[    0  1942]
 [    0 10558]]

training_time
4.501019239425659

ROC_AUC_Score
0.6877135109109428

Hyper_Params
{'learning_rate': 0.2786209262205007, 'n_estimators': 274.02810984445074}



GradientBoostingClassifier

confuse
recall: 1.0
[[    0  1942]
 [    0 10558]]
precision: 0.84
[[    0  1942]
 [    0 10558]]

training_time
0.22296905517578125

ROC_AUC_Score
0.6833459928765806

Hyper_Params
{'learning_rate': 0.4163419878067033, 'max_leaf_nodes': 3.977698440727402, 'n_estimators

In [36]:
#Save this iteration.

pickle.dump( wide_data_model_stats, open( "model_stats_02122022.p", "wb" ) )

In [None]:
pickles = ['model_stats_01012022.p','model_stats_01022022.p', 'model_stats_01032022.p', 'model_stats_01222022.p', "model_stats_01242022.p", "model_stats_01252022.p", "model_stats_01262022.p", "model_stats_02122022.p"]
unpickles = []
for each in pickles:
    file = open(each, 'rb')
    unpickles.append(pickle.load(file))
    file.close()


In [None]:
for each in unpickles:
    best_spam_model, spam_precision, spam_time = spam_evaluator(each)
    print(best_spam_model, round(spam_precision,2), round(spam_time,2))