### <center> Model Delivering </center>

This notebook it's about optimizing, training and deliver the final models. We will get some artifacts to preprocess features.

In [3]:
import time
import gc
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score

#SELECTED MODELS

from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#LOAD DATA.
data_path=(
    '../input/',
    )

#ARTIFACTS PATH
artifacts_path = (
    '../artifacts/models/',
    '../artifacts/'
    )

train = pd.read_csv(data_path[0]+'train.csv')
test = pd.read_csv(data_path[0]+'test.csv')

print('TRAIN SHAPE: {}\nTEST SHAPE:{}'.format(train.shape, test.shape))

#DEFINE Features and target
train.drop('Time',axis=1, inplace=True)

features = train.drop('Class', axis=1).columns.to_list()
target = 'Class'

TRAIN SHAPE: (256326, 33)
TEST SHAPE:(28481, 33)


**Reducing data Memory:**

*Source: https://gist.github.com/fujiyuu75/748bc168c9ca8a49f86e144a08849893*

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
        #    df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 62.58 MB
Memory usage after optimization is: 15.40 MB
Decreased by 75.4%
Memory usage of dataframe is 7.17 MB
Memory usage after optimization is: 1.82 MB
Decreased by 74.6%


### Preprocessing

The preprocessing steps will be:

- Normalize data: artifact needed is mean and std for each data column.

- Create extra features: based on the previous automate feat eng.

In [5]:
from sklearn.preprocessing import Normalizer

artifacts_path = (
    '../artifacts/models/',
    '../artifacts/'
    )

x_train = train[features]
x_test = test[features]

all_data = pd.concat([x_train, x_test],axis=0) #Train the Normalizer with all the data

normalizer = Normalizer().fit(all_data) # Fit inot the model

norm_train = normalizer.transform(x_train.values)
x_train = pd.DataFrame(norm_train, index=x_train.index, columns=x_train.columns)

pickle.dump(normalizer, open(artifacts_path[1]+'normalizer.sav', 'wb')) #Save Normalizer as artifact

### Build and Tune Model

In [6]:
from sklearn.metrics import precision_recall_curve, auc, f1_score, recall_score, precision_score

def auc_precision_recall_curve(y_true, y_preds):
    """Kaggle official doc from the data recommends this metric
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_preds)
    #AUC function to calculate AUC of precision recall curve
    auc_precision_recall = auc(recall, precision)
    return auc_precision_recall

def compute_scores(y_true,y_preds):
    """Return a dictionary of results.
    
    It computes 4 metrics for Fraud Detection interests.
    Arguments:
    
    y_true: real labeled data
    y_preds: prediction from the model
    """
    
    scores = {
        'AUC-PRC': auc_precision_recall_curve(y_true, y_preds),
        'F1-score': f1_score(y_true, y_preds),
        'Recall': recall_score(y_true, y_preds),
        'Precision': precision_score(y_true, y_preds)
    }
    
    return scores

def score_report(score_dict, train=False):
    """Printed scores report
    score_dict: dict from compute_scores function output.
    """
    if train:
        print('---'*10)
        print('Train Scores:\n')
        for score_name, score_value in score_dict.items():
            print(f"{score_name}:  {score_value}")
    else:
        print('Test Scores:\n')
        for score_name, score_value in score_dict.items():
            print(f"{score_name}:  {score_value}")

In [7]:

def build_model(model, train_data, test_data, features, target, name, 
                artifacts_path='../artifacts/models/', tune=False, param_grid=None, final=True):
    """ Train a Sklearn format model and make the final test
    --------------
    Parameters:
    model: Model Instance.
    train_dataset: training pd.DataFrame dataset
    test_dataset: HoldOut pd.DataFrame dataset
    features: List of features to be included.
    target: target name
    name: name to store the model
    tune: If tuning job is calling.
    returns a model instance
    """
    
    #  for testing against test_data(not seeing until this point)
    
    x_train, y_train = train_data[features], train_data[target]
    x_test, y_test = test_data[features], test_data[target]
    
    if tune == True:
        
        scorer = make_scorer(auc_precision_recall_curve, 
                             greater_is_better=True) #To optimize over AUC-RC
        
        model, results = tuning_job(
                            model=model,
                            data=train_data,
                            features=features,
                            target=target,
                            param_grid=param_grid,
                            cv=cv,
                            scoring=scorer
        )
        
    elif tune == False:
        model.fit(x_train, y_train)
    
    if isinstance(model, XGBClassifier):
        y_test_pred = model.predict(np.float32(x_test))
    else:
        y_test_pred = model.predict(x_test)
    
    #SCORES
    test_results = compute_scores(y_test, y_test_pred)
    score_report(test_results)
    
    #STORE
    filename = artifacts_path + name +'.sav'
    pickle.dump(model, open(filename, 'wb'))
    
    # INFERENCE TIME
    random_sample = x_test.sample(n=1)
    
    start = time.time()
    one_inf = model.predict(np.float32(random_sample))
    end = time.time()
    
    print('One inference time:', end - start)
    
    #Re-train with all data if final==True
    if final==True: #if model is final
        all_data = pd.concat([train_data, test_data], axis=1)
        model.fit(all_data[features], all_data[target])
        return model
    
    return model

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer

def tuning_job(model, data, features, target, param_grid,scorer, cv=5):
    """Tunes the model and outputs best fit.
    ----------
    Parameters:
    model: Sklearn instance or Sklearn wrapper
    data: x_train data.
    param_grid: search space of hyperparameters
    """
    
    grid_search = GridSearchCV(
                    estimator = model,
                    param_grid=param_grid,
                    cv=cv,
                    scoring=scorer
    )
    
    grid_search.fit(np.float32(data[features]), np.float32(data[target]))
    
    results = grid_search.cv_results_
    
    best_estimator = grid_search.best_estimator_
    
    return best_estimator, results

### Model Delivering:

1.- LDA

LDA does not need to be tuned. 

*Source: https://datascience.stackexchange.com/questions/21942/linear-discriminant-analysis-which-parameters-can-be-tunned-in-cross-validation*

In [10]:
#FOR LDA. 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()
features = train.drop('Class', axis=1).columns.to_list()
target = 'Class'
name='first_lda'

lda_model = build_model(model, train, test, features, target, name)

Test Scores:

AUC-PRC:  0.7077814377743654
F1-score:  0.7032967032967032
Recall:  0.6530612244897959
Precision:  0.7619047619047619
One inference time: 0.000997781753540039


2.- Extra Trees

In [26]:
model = ExtraTreesClassifier(n_jobs=-1)
name = 'first_et'
et = build_model(model, train, test, features, target, name)

Test Scores:

AUC-PRC:  0.8179149508193432
F1-score:  0.8045977011494253
Recall:  0.7142857142857143
Precision:  0.9210526315789473
One inference time: 0.022939443588256836


In [30]:
#Tune model: test
param_grid = {'max_depth':[None, 5,10,12], 'min_impurity_decrease':[0,0.2,0.5]}

scorer = make_scorer(auc_precision_recall_curve, 
                             greater_is_better=True)

best_et, results = tuning_job(model, train, features, target, param_grid,scorer, cv=5)

GrindSearch Results:
 {'mean_fit_time': array([26.84591489,  4.64906516,  3.36830096,  7.20164361,  3.9055057 ,
        3.96377382, 11.15993519,  3.57548676,  3.15751328, 13.71353145,
        3.14232178,  2.44076271]), 'std_fit_time': array([5.0747395 , 0.87329246, 0.73621437, 1.00676932, 1.47386738,
       1.02449758, 0.23057305, 0.42737733, 0.40138549, 1.95225189,
       1.10233683, 0.20201187]), 'mean_score_time': array([1.06514654, 0.30422077, 0.22383952, 0.27955313, 0.22403827,
       0.23425202, 0.41798396, 0.27053308, 0.20831175, 0.4546319 ,
       0.26361718, 0.18921418]), 'std_score_time': array([0.41180104, 0.07648922, 0.05909967, 0.08515743, 0.04513269,
       0.03960724, 0.02948248, 0.03128417, 0.06054472, 0.1911039 ,
       0.08305488, 0.02321461]), 'param_max_depth': masked_array(data=[None, None, None, 5, 5, 5, 10, 10, 10, 12, 12, 12],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_va

In [31]:
name='tuned_et'
build_model(best_et, train, test, features, target, name)

Test Scores:

AUC-PRC:  0.8291136226376261
F1-score:  0.8181818181818182
Recall:  0.7346938775510204
Precision:  0.9230769230769231
One inference time: 0.023936748504638672


ExtraTreesClassifier(min_impurity_decrease=0, n_jobs=-1)

3.- XGBOOST

In [33]:
model = XGBClassifier()
name = 'first_xgb'
xgb_ = build_model(model, train, test, features, target, name)

Test Scores:

AUC-PRC:  0.8289811993635815
F1-score:  0.8222222222222222
Recall:  0.7551020408163265
Precision:  0.9024390243902439
One inference time: 0.0059850215911865234


In [41]:
model = XGBClassifier(learning_rate=0.03)

param_grid = {'eta':[0.1, 0.01, 0.4],
              'max_depth':[5, 20, None]}

xgb_tuned, results = tuning_job(model, train, features, target, param_grid,scorer, cv=5)





In [42]:
xgb_tuned.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.400000006,
 'max_delta_step': 0,
 'max_depth': 20,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 4,
 'num_parallel_tree': 1,
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None,
 'eta': 0.4}

In [43]:
name = 'tuned_xgb'
tuned_xgb = build_model(xgb_tuned, train, test, features, target, name)

Test Scores:

AUC-PRC:  0.8175751610991646
F1-score:  0.8089887640449438
Recall:  0.7346938775510204
Precision:  0.9
One inference time: 0.015623331069946289


### Making a Stacked Model:

In [44]:
estimators = [
    ('et', best_et),
    ('lda', lda_model),
    ('xgb', tuned_xgb
    )
]

clf = StackingClassifer(estimators=estimators, final_estimator)

SyntaxError: positional argument follows keyword argument (<ipython-input-44-4b498f0a1e2f>, line 9)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,sin_time,cos_time,Class
0,2.080078,-0.005001,-2.033203,-0.024338,0.354004,-1.508789,0.608398,-0.379883,0.139404,0.223022,...,-0.018539,0.040619,0.338135,0.225586,-0.094727,-0.092041,0.770020,-0.994629,-0.105774,0
1,2.037109,0.048676,-1.794922,0.245361,0.567871,-0.350342,0.025803,-0.016663,0.334473,-0.201294,...,0.311035,0.181396,-0.291504,0.175659,-0.067505,-0.044617,2.679688,-0.739258,-0.673340,0
2,-0.523926,-0.564453,1.616211,-1.689453,-0.859863,1.282227,-0.690918,0.151733,0.566895,0.703125,...,0.030853,0.221191,-1.251953,1.052734,-0.499023,-0.010262,45.812500,0.289307,-0.957031,0
3,1.966797,-0.220825,-0.377930,0.177124,-0.306885,-0.077576,-0.530762,0.073242,0.749023,0.008385,...,0.391846,0.744629,-0.424072,-0.674316,0.032074,-0.031464,1.179688,-0.319580,-0.947754,0
4,1.350586,-0.152100,-0.045654,-0.059753,-0.416748,-0.672363,-0.224487,-0.044922,0.641602,-0.049011,...,-0.146240,-0.432373,0.436523,1.174805,-0.097656,-0.012512,3.119141,0.114746,-0.993164,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256321,-0.582520,0.749023,1.481445,1.020508,0.868164,1.958008,0.080078,0.765625,-0.862305,-0.042053,...,-0.260498,-1.380859,0.008530,0.095459,0.153320,0.051178,2.380859,-0.333008,-0.942871,0
256322,-1.354492,3.476562,-1.436523,4.214844,1.027344,0.183228,0.783691,-0.039154,-0.199585,3.876953,...,0.149292,-0.827148,-0.340820,-0.000825,0.383545,-0.164429,2.689453,-0.815918,-0.578125,0
256323,1.758789,-1.265625,-1.185547,-0.483398,-1.098633,-0.801758,-0.773438,-0.062561,-0.176270,0.181641,...,-0.043243,-0.020050,-0.265381,-0.130371,-0.008781,0.008514,189.000000,-0.982422,-0.186646,0
256324,1.187500,-0.371094,0.911133,0.794434,-1.244141,-0.788574,-0.322754,-0.192139,-0.731445,0.598633,...,0.128662,0.879395,0.350586,-0.564453,0.062500,0.046387,49.968750,-0.516113,0.856445,0


STACKING:

In [None]:
from sklearn.ensemble import StackingClassifier

