<a href="https://colab.research.google.com/github/R-Madhuram/UC-BerkeleyCapstone_CreditCardFraudDetection/blob/main/notebooks/AutoML_ensembleModels/14_Auto_ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Auto ML Model Using Pycaret

In [None]:
!pip install --pre pycaret

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
import xgboost

In [None]:
import pandas as pd
import numpy as np
import pickle

## Importing helper functions



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import the helper function files
# Importing the helper_functions file
!cp /content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Notebooks/helper_functions.py helper_functions.py
!cp /content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Notebooks/helper_functions_ml.py helper_functions_ml.py

In [None]:
# Import the helper functions
# Import the required functions from the helper function file
from helper_functions_ml import createMetricsDF,computeModelMetrics,computeAndPlotMetrics

## Import the Data (training and testing)

In [None]:
# Import training data
!cp '/content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Classification_products/df.csv' 'df.csv'

In [None]:
# Read the training data
df_train = pd.read_csv('df.csv')
print(df_train.head())

df_train = df_train.drop('Unnamed: 0', axis=1)
df_train.head(3)
print(df_train.shape)

   Unnamed: 0  TransactionID  isFraud  TransactionDT  TransactionAmt  \
0           0        2987000        0          86400        1.835691   
1           1        2987001        0          86401        1.462398   
2           2        2987002        0          86469        1.770852   
3           3        2987003        0          86499        1.698970   
4           4        2987004        0          86506        1.698970   

   ProductCD  card1  card2  card3  card4  ...  id_36  id_37  id_38  \
0          4  13926   -1.0  150.0      1  ...      2      2      2   
1          4   2755  404.0  150.0      2  ...      2      2      2   
2          4   4663  490.0  150.0      4  ...      2      2      2   
3          4  18132  567.0  150.0      2  ...      2      2      2   
4          1   4497  514.0  150.0      2  ...      0      1      1   

   DeviceType  DeviceInfo  P_emaildomain_addr1_card1  card1_card2  \
0           1        1735                      63363         3655   
1       

In [None]:
# CP cannot be used to copy file here because numFolds is a variable and hence the filenames will change based on the number of folds.
# instead of using linux CP command we use SHUTIL copy 

# read the K fold train and dev indices

numFolds = 3

import shutil # file copy package. 

srcFileName = '/content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Classification_products/train_dev_indices_' + str(numFolds) + '.pickle' 
train_dev_indices_file = 'train_dev_indices_' + str(numFolds) + '.pickle'
# !cp  srcFileName destFileName
shutil.copyfile(srcFileName, train_dev_indices_file)


'train_dev_indices_3.pickle'

In [None]:
# read the train dev indices file
# train_dev_indices_df = pd.read_csv(train_dev_indices_file)
# train_dev_indices_df.head()

with open(train_dev_indices_file, 'rb') as handle:
    train_dev_indices = pickle.load(handle)
train_dev_indices

{'train': [array([195756, 195757, 195758, ..., 590537, 590538, 590539]),
  array([     0,      1,      2, ..., 590537, 590538, 590539]),
  array([     0,      1,      2, ..., 397410, 397417, 397418])],
 'dev': [array([     0,      1,      2, ..., 221151, 221172, 221182]),
  array([195756, 195757, 195758, ..., 397410, 397417, 397418]),
  array([393481, 393482, 393483, ..., 590537, 590538, 590539])]}

In [None]:
# Getting dataready
X = df_train.drop(columns = ['isFraud','TransactionID'])
y = df_train['isFraud']

In [None]:
# List of categorical and numerical columns in merged dataset

## Numerical columns

num_column_lst_final = []
for icol in X.columns:
  if X[icol].dtypes in ['float64', 'int64']:
    num_column_lst_final.append(icol)
print(f"The columns that have numerical features are: '{num_column_lst_final}'")
print(f"The number of columns that have numerical features are: '{len(num_column_lst_final)}'")

print('\n')

## Categorical Columns
cat_column_lst_final = []
for icol in X.columns:
  if X[icol].dtypes in ['object']:
    cat_column_lst_final.append(icol)
print(f"The columns that have categorical features are: '{cat_column_lst_final}'")
print(f"The number of columns that have categorical features are: '{len(cat_column_lst_final)}'")


The columns that have numerical features are: '['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C3', 'C5', 'D1', 'D3', 'D4', 'D8', 'D9', 'D10', 'D13', 'D14', 'M1', 'M2', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V4', 'V6', 'V10', 'V12', 'V14', 'V15', 'V19', 'V23', 'V25', 'V27', 'V35', 'V37', 'V39', 'V44', 'V46', 'V53', 'V55', 'V61', 'V66', 'V75', 'V77', 'V82', 'V86', 'V95', 'V98', 'V99', 'V104', 'V107', 'V108', 'V109', 'V117', 'V118', 'V120', 'V121', 'V123', 'V124', 'V129', 'V135', 'V138', 'V139', 'V141', 'V144', 'V148', 'V161', 'V169', 'V170', 'V172', 'V173', 'V174', 'V184', 'V194', 'V208', 'V209', 'V214', 'V220', 'V221', 'V223', 'V224', 'V226', 'V227', 'V228', 'V238', 'V240', 'V241', 'V250', 'V260', 'V270', 'V281', 'V282', 'V284', 'V286', 'V288', 'V290', 'V300', 'V305', 'V313', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08',

## Models

In [None]:
from pycaret.classification import setup,create_model,tune_model

In [None]:
exp = setup(data= df_train, target='isFraud',session_id=42,use_gpu=True,fix_imbalance=True)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,isFraud
2,Target type,Binary
3,Original data shape,"(590540, 157)"
4,Transformed data shape,"(974990, 157)"
5,Transformed train set shape,"(797828, 157)"
6,Transformed test set shape,"(177162, 157)"
7,Numeric features,156
8,Preprocess,True
9,Imputation type,simple


In [None]:
# xgboost model
xgb = create_model('xgboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9774,0.9181,0.4177,0.8691,0.5642,0.5541,0.5937
1,0.9783,0.9204,0.4385,0.883,0.586,0.5761,0.6137
2,0.9772,0.9129,0.417,0.8578,0.5612,0.5509,0.589
3,0.9783,0.9215,0.4329,0.8879,0.5821,0.5722,0.6115
4,0.9783,0.9226,0.452,0.8605,0.5927,0.5826,0.6147
5,0.977,0.9152,0.4174,0.8495,0.5598,0.5494,0.5863
6,0.9778,0.9213,0.4375,0.8601,0.5799,0.5698,0.6044
7,0.978,0.9159,0.4319,0.8778,0.579,0.569,0.6071
8,0.9768,0.9158,0.4066,0.8522,0.5506,0.5402,0.5795
9,0.9777,0.9143,0.4343,0.8568,0.5764,0.5662,0.6009


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
exp = setup(data= df_train, target='isFraud',session_id=42,use_gpu=True,fix_imbalance=True)
lightgbm = create_model('lightgbm')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,isFraud
2,Target type,Binary
3,Original data shape,"(590540, 157)"
4,Transformed data shape,"(974990, 157)"
5,Transformed train set shape,"(797828, 157)"
6,Transformed test set shape,"(177162, 157)"
7,Numeric features,156
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9743,0.8979,0.3582,0.7957,0.494,0.4828,0.5237
1,0.9748,0.9026,0.3707,0.8024,0.5071,0.496,0.5353
2,0.9735,0.8875,0.3437,0.7741,0.4761,0.4645,0.5053
3,0.9753,0.8999,0.3817,0.813,0.5195,0.5085,0.5472
4,0.9743,0.9018,0.3725,0.7767,0.5035,0.492,0.5273
5,0.9745,0.8896,0.3566,0.805,0.4943,0.4831,0.5258
6,0.974,0.9022,0.3683,0.7702,0.4984,0.4867,0.522
7,0.9742,0.89,0.3683,0.7792,0.5002,0.4887,0.5252
8,0.9728,0.8877,0.3278,0.756,0.4573,0.4456,0.487
9,0.9742,0.8944,0.3686,0.7747,0.4995,0.488,0.5238


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
!pip install pycaret[full]

In [None]:
from pycaret.classification import setup,create_model,tune_model,stack_models 

In [None]:
exp = setup(data= df_train, target='isFraud',session_id=42,use_gpu=True,fix_imbalance=True)
catboost = create_model('catboost')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,isFraud
2,Target type,Binary
3,Original data shape,"(590540, 157)"
4,Transformed data shape,"(974990, 157)"
5,Transformed train set shape,"(797828, 157)"
6,Transformed test set shape,"(177162, 157)"
7,Numeric features,156
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9748,0.8954,0.3582,0.8196,0.4986,0.4877,0.5321
1,0.9756,0.9036,0.3686,0.8487,0.514,0.5035,0.5501
2,0.9749,0.8857,0.3617,0.8185,0.5017,0.4908,0.5343
3,0.9756,0.9,0.3728,0.8422,0.5168,0.5062,0.5509
4,0.9757,0.8996,0.3856,0.8279,0.5262,0.5154,0.5554
5,0.9749,0.8915,0.3545,0.8314,0.4971,0.4863,0.5334
6,0.9744,0.8968,0.3663,0.791,0.5007,0.4894,0.528
7,0.975,0.8926,0.3642,0.8247,0.5053,0.4944,0.5384
8,0.9743,0.8881,0.3451,0.814,0.4847,0.4737,0.5202
9,0.9753,0.8943,0.3721,0.829,0.5136,0.5028,0.5458


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
ada = create_model('ada')

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9421,0.8328,0.3921,0.2726,0.3216,0.2924,0.2977
1,0.9365,0.8359,0.417,0.2528,0.3148,0.2836,0.2934
2,0.9458,0.8179,0.3748,0.2886,0.3261,0.2984,0.3012
3,0.9418,0.8323,0.4018,0.2739,0.3258,0.2965,0.3025
4,0.9415,0.8368,0.4375,0.2827,0.3435,0.3143,0.3225
5,0.9415,0.8185,0.3801,0.2653,0.3125,0.2829,0.2879
6,0.9448,0.828,0.4091,0.2932,0.3416,0.3136,0.3184
7,0.9416,0.8282,0.4202,0.2786,0.3351,0.3059,0.3129
8,0.9408,0.8222,0.3755,0.2599,0.3072,0.2773,0.2825
9,0.9408,0.8277,0.4274,0.2761,0.3355,0.306,0.314


In [None]:
stacker_ada = stack_models(estimator_list = [lightgbm,xgb,catboost],meta_model=ada)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9774,0.8802,0.444,0.8316,0.5789,0.5684,0.5981
1,0.9779,0.8681,0.453,0.8441,0.5896,0.5793,0.6091
2,0.9766,0.8558,0.4281,0.8155,0.5615,0.5506,0.581
3,0.978,0.8732,0.4481,0.8526,0.5875,0.5773,0.609
4,0.9778,0.8741,0.4762,0.8135,0.6007,0.5901,0.6126
5,0.9765,0.8656,0.4319,0.8065,0.5626,0.5516,0.5802
6,0.9777,0.8775,0.4547,0.8329,0.5883,0.5779,0.606
7,0.9779,0.8713,0.4513,0.8448,0.5883,0.578,0.6082
8,0.9765,0.8691,0.4288,0.8083,0.5603,0.5494,0.5788
9,0.977,0.8722,0.4537,0.8049,0.5803,0.5694,0.5943


In [None]:
stacker_lightgbm = stack_models(estimator_list = [lightgbm,xgb,catboost],meta_model=lightgbm)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9772,0.8899,0.4246,0.8469,0.5656,0.5552,0.5904
1,0.9783,0.8927,0.4426,0.8779,0.5885,0.5786,0.6147
2,0.9772,0.8817,0.4232,0.85,0.5651,0.5547,0.5906
3,0.9779,0.8861,0.4295,0.8734,0.5758,0.5658,0.6037
4,0.9779,0.8975,0.4499,0.8477,0.5878,0.5776,0.6084
5,0.9775,0.8869,0.4395,0.8402,0.5771,0.5667,0.5983
6,0.9773,0.8871,0.4388,0.8344,0.5752,0.5647,0.5957
7,0.9781,0.8833,0.4471,0.8581,0.5879,0.5778,0.6104
8,0.9768,0.8838,0.4156,0.8429,0.5567,0.5463,0.5826
9,0.9774,0.8827,0.4412,0.8329,0.5769,0.5663,0.5967
