In [None]:
!pip install ydata-profiling
!pip install flaml[notebook]

In [2]:
import gc
import warnings
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import multiprocessing as mp
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
#from ydata_profiling import ProfileReport
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder
from category_encoders.binary import BinaryEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbpipeline 
import matplotlib.pyplot as plt

#### Read the data into a dataframe

In [3]:
df = pd.read_csv("../input/credit-card-transactions/credit_card_transactions-ibm_v2.csv",
                 dtype = {'Year' : 'int16', 'Month' : 'int8', 'Day' : 'int8',
                           'Use Chip' : 'category', 'MCC' : 'int16', 'Is Fraud?' : 'category' , 
                            'Merchant City' : 'category', 'Amount' : 'string'})
df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Is Fraud?'],axis=1), df['Is Fraud?'], test_size = 0.3, random_state = 1613,shuffle=True )

In [6]:
profile = ProfileReport(df, title="Data Report", explorative = True, minimal = True, dark_mode=True )
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [None]:
profile.to_notebook_iframe()

In [5]:
del df
gc.collect()

93

In [6]:
def clean(df):
    df['Zip'].fillna(0, inplace=True)
    df['Amount'] = df['Amount'].apply(lambda value: float(value.split("$")[1]))
    df['Hour'] = df['Time'].apply(lambda value: int(value.split(":")[0]))
    df['Minutes'] = df['Time'].apply(lambda value: int(value.split(":")[1]))
    df.drop(['Time'], axis=1, inplace=True)
    df['Merchant State'].fillna('NA', inplace=True)
    df['Merchant State'] = df['Merchant State'].astype('category')
    df['Errors?'].fillna('None', inplace=True)
    df['Errors?'] = df['Errors?'].astype('category')

    
    cat_col = ['Merchant State','Use Chip', 'Merchant City','Errors?']
    be = BinaryEncoder()
    enc_df= pd.DataFrame(be.fit_transform(df[cat_col]), dtype= 'int8' )  

    df.drop(cat_col, axis=1, inplace = True)
    df = pd.concat([df,enc_df], axis=1)
    
    for col in df.columns:
        df[col] =  df[col].astype(float)
    return pd.DataFrame(df)    
     
    
preprocessing_pipeline = Pipeline([
    ('cleaning', FunctionTransformer(clean))
], verbose=True) 


In [7]:
X_train, y_train = RandomUnderSampler(random_state=1613 , sampling_strategy= 0.01).fit_resample(X_train,y_train)

In [8]:
X_train = pd.DataFrame(preprocessing_pipeline.fit_transform(X_train), dtype= 'float')

[Pipeline] .......... (step 1 of 1) Processing cleaning, total=  17.3s


In [9]:
X_test = pd.DataFrame(preprocessing_pipeline.fit_transform(X_test), dtype= 'float')

[Pipeline] .......... (step 1 of 1) Processing cleaning, total= 1.1min


In [10]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

### Tuning XGBoost

In [None]:
from flaml import AutoML

automl = AutoML()
settings = {
    "time_budget": 1800, 
    "metric": 'macro_f1', 
    "estimator_list": ['xgboost'],  
    "task": 'classification',  
    "log_file_name": 'xgb.log',  
    "seed": 1613,  
}
automl.fit(X_train=X_train, y_train=y_train, **settings, n_jobs = 8 )

In [23]:
print('Best hyperparmeter config:', automl.best_config)
print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best hyperparmeter config: {'n_estimators': 295, 'max_leaves': 810, 'min_child_weight': 6.150041099125579, 'learning_rate': 0.06894448972317331, 'subsample': 0.8618758064542394, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.020070123048846998, 'reg_lambda': 0.49934036422839806}
Best r2 on validation data: 0.9435
Training duration of best run: 602.2 s


In [33]:
import xgboost as xgb

In [None]:
auto_model = xgb.XGBClassifier(n_estimators= 295,
                               max_leaves = 810, min_child_weight = 6.150041099125579,
                               learning_rate = 0.06894448972317331,
                               subsample = 0.8618758064542394,
                               colsample_bylevel =1.0, 
                               colsample_bytree = 1.0, 
                               reg_alpha = 0.020070123048846998,  
                               reg_lambda = 0.49934036422839806, 
                               n_jobs = 8 , verbosity = 2)


auto_model.fit(X_train, y_train)

In [35]:
import pickle

Pkl_Filename = "XGB_Tuned.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(auto_model, file)

In [37]:
y_pred = auto_model.predict(X_test)

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   7307107
           1       0.72      0.61      0.66      8963

    accuracy                           1.00   7316070
   macro avg       0.86      0.80      0.83   7316070
weighted avg       1.00      1.00      1.00   7316070



# Tuning catboost

In [None]:
from flaml import AutoML

automl = AutoML()
settings = {
    "time_budget": 1800, 
    "metric": 'macro_f1', 
    "estimator_list": ['catboost'],  
    "task": 'classification',  
    "log_file_name": 'cat.log',  
    "seed": 1613,  
}
automl.fit(X_train=X_train, y_train=y_train, **settings, n_jobs = 8 )

In [14]:
print('Best hyperparmeter config:', automl.best_config)
print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best hyperparmeter config: {'early_stopping_rounds': 10, 'learning_rate': 0.09999999999999996, 'n_estimators': 303}
Best r2 on validation data: 0.9247
Training duration of best run: 53.19 s


In [15]:
from catboost import CatBoostClassifier

In [None]:
catboost = CatBoostClassifier(early_stopping_rounds= 20, learning_rate=0.1, n_estimators=350)
catboost.fit(X_train, y_train )

In [35]:
y_pred = catboost.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   7307107
           1       0.73      0.53      0.61      8963

    accuracy                           1.00   7316070
   macro avg       0.86      0.77      0.81   7316070
weighted avg       1.00      1.00      1.00   7316070



In [38]:
import pickle

Pkl_Filename = "CatBoostModelCC.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(catboost, file)

In [None]:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   7307107
           1       0.72      0.52      0.61      8963

    accuracy                           1.00   7316070
   macro avg       0.86      0.76      0.80   7316070
weighted avg       1.00      1.00      1.00   7316070

In [1]:
# CatBoost is faster but with 2% less macro f1 score