In [1]:
#Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

import warnings
warnings.filterwarnings("ignore")

# Set Options
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
%matplotlib inline


In [2]:
# Uploading saved dataframe csv file from google drive
! gdown --id 1cfNXnd6oMsqb0gH-yn54Rtaf7VgCsDSX

Downloading...
From (original): https://drive.google.com/uc?id=1cfNXnd6oMsqb0gH-yn54Rtaf7VgCsDSX
From (redirected): https://drive.google.com/uc?id=1cfNXnd6oMsqb0gH-yn54Rtaf7VgCsDSX&confirm=t&uuid=943ce019-fb7f-4569-9f88-fd63f186093d
To: /content/df_1.csv
100% 788M/788M [00:08<00:00, 90.1MB/s]


In [3]:
%%time
# Load Data
df   = pd.read_csv('/content/df_1.csv')

CPU times: user 32.6 s, sys: 17.2 s, total: 49.9 s
Wall time: 51.6 s


In [4]:
# Reduce the number of rows
df = df.sample(frac=0.3, random_state=42)

# Check the shape of the reduced DataFrame
df.shape


(177162, 534)

In [5]:
df.loc[:, 'isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,170821
1,6341


In [23]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score


## Train-Test Split

Split the dataset into train set and test set. Train set will be used to train the model. Test set will be used to check the performance of model

In [25]:
# Split the dataset into features (X) and target (y)
X = df.drop(['isFraud'], axis=1)
y = df['isFraud'].astype(bool)

# Replace infinite values with NaN
X = X.replace([np.inf, -np.inf], np.nan)

# Impute missing values (use mean for imputation)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.30, random_state=42)

# Check the shapes of the splits
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(124013, 533) (124013,)
(53149, 533) (53149,)


## Handling Missing Values - Imputation

In [10]:
from sklearn.impute import SimpleImputer

# replace inf
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Impute
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_train_imputed.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_19,id_20,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,card2_missing_flag,card3_missing_flag,card4_missing_flag,card5_missing_flag,card6_missing_flag,addr1_missing_flag,addr2_missing_flag,dist1_missing_flag,dist2_missing_flag,P_emaildomain_missing_flag,R_emaildomain_missing_flag,D1_missing_flag,D2_missing_flag,D3_missing_flag,D4_missing_flag,D5_missing_flag,D6_missing_flag,D7_missing_flag,D8_missing_flag,D9_missing_flag,D10_missing_flag,D11_missing_flag,D12_missing_flag,D13_missing_flag,D14_missing_flag,D15_missing_flag,M1_missing_flag,M2_missing_flag,M3_missing_flag,M4_missing_flag,M5_missing_flag,M6_missing_flag,M7_missing_flag,M8_missing_flag,M9_missing_flag,V1_missing_flag,V2_missing_flag,V3_missing_flag,V4_missing_flag,V5_missing_flag,V6_missing_flag,V7_missing_flag,V8_missing_flag,V9_missing_flag,V10_missing_flag,V11_missing_flag,V12_missing_flag,V13_missing_flag,V14_missing_flag,V15_missing_flag,V16_missing_flag,V17_missing_flag,V18_missing_flag,V19_missing_flag,V20_missing_flag,V21_missing_flag,V22_missing_flag,V23_missing_flag,V24_missing_flag,V25_missing_flag,V26_missing_flag,V27_missing_flag,V28_missing_flag,V29_missing_flag,V30_missing_flag,V31_missing_flag,V32_missing_flag,V33_missing_flag,V34_missing_flag,V35_missing_flag,V36_missing_flag,V37_missing_flag,V38_missing_flag,V39_missing_flag,V40_missing_flag,V41_missing_flag,V42_missing_flag,V43_missing_flag,V44_missing_flag,V45_missing_flag,V46_missing_flag,V47_missing_flag,V48_missing_flag,V49_missing_flag,V50_missing_flag,V51_missing_flag,V52_missing_flag,V53_missing_flag,V54_missing_flag,V55_missing_flag,V56_missing_flag,V57_missing_flag,V58_missing_flag,V59_missing_flag,V60_missing_flag,V61_missing_flag,V62_missing_flag,V63_missing_flag,V64_missing_flag,V65_missing_flag,V66_missing_flag,V67_missing_flag,V68_missing_flag,V69_missing_flag,V70_missing_flag,V71_missing_flag,V72_missing_flag,V73_missing_flag,V74_missing_flag,V75_missing_flag,V76_missing_flag,V77_missing_flag,V78_missing_flag,V79_missing_flag,V80_missing_flag,V81_missing_flag,V82_missing_flag,V83_missing_flag,V84_missing_flag,V85_missing_flag,V86_missing_flag,V87_missing_flag,V88_missing_flag,V89_missing_flag,V90_missing_flag,V91_missing_flag,V92_missing_flag,V93_missing_flag,V94_missing_flag,V95_missing_flag,V96_missing_flag,V97_missing_flag,V98_missing_flag,V99_missing_flag,V100_missing_flag,V101_missing_flag,V102_missing_flag,V103_missing_flag,V104_missing_flag,V105_missing_flag,V106_missing_flag,V107_missing_flag,V108_missing_flag,V109_missing_flag,V110_missing_flag,V111_missing_flag,V112_missing_flag,V113_missing_flag,V114_missing_flag,V115_missing_flag,V116_missing_flag,V117_missing_flag,V118_missing_flag,V119_missing_flag,V120_missing_flag,V121_missing_flag,V122_missing_flag,V123_missing_flag,V124_missing_flag,V125_missing_flag,V126_missing_flag,V127_missing_flag,V128_missing_flag,V129_missing_flag,V130_missing_flag,V131_missing_flag,V132_missing_flag,V133_missing_flag,V134_missing_flag,V135_missing_flag,...,V169_missing_flag,V170_missing_flag,V171_missing_flag,V172_missing_flag,V173_missing_flag,V174_missing_flag,V175_missing_flag,V176_missing_flag,V177_missing_flag,V178_missing_flag,V179_missing_flag,V180_missing_flag,V181_missing_flag,V182_missing_flag,V183_missing_flag,V184_missing_flag,V185_missing_flag,V186_missing_flag,V187_missing_flag,V188_missing_flag,V189_missing_flag,V190_missing_flag,V191_missing_flag,V192_missing_flag,V193_missing_flag,V194_missing_flag,V195_missing_flag,V196_missing_flag,V197_missing_flag,V198_missing_flag,V199_missing_flag,V200_missing_flag,V201_missing_flag,V202_missing_flag,V203_missing_flag,V204_missing_flag,V205_missing_flag,V206_missing_flag,V207_missing_flag,V208_missing_flag,V209_missing_flag,V210_missing_flag,V211_missing_flag,V212_missing_flag,V213_missing_flag,V214_missing_flag,V215_missing_flag,V216_missing_flag,V217_missing_flag,V218_missing_flag,V219_missing_flag,V220_missing_flag,V221_missing_flag,V222_missing_flag,V223_missing_flag,V224_missing_flag,V225_missing_flag,V226_missing_flag,V227_missing_flag,V228_missing_flag,V229_missing_flag,V230_missing_flag,V231_missing_flag,V232_missing_flag,V233_missing_flag,V234_missing_flag,V235_missing_flag,V236_missing_flag,V237_missing_flag,V238_missing_flag,V239_missing_flag,V240_missing_flag,V241_missing_flag,V242_missing_flag,V243_missing_flag,V244_missing_flag,V245_missing_flag,V246_missing_flag,V247_missing_flag,V248_missing_flag,V249_missing_flag,V250_missing_flag,V251_missing_flag,V252_missing_flag,V253_missing_flag,V254_missing_flag,V255_missing_flag,V256_missing_flag,V257_missing_flag,V258_missing_flag,V259_missing_flag,V260_missing_flag,V261_missing_flag,V262_missing_flag,V263_missing_flag,V264_missing_flag,V265_missing_flag,V266_missing_flag,V267_missing_flag,V268_missing_flag,V269_missing_flag,V270_missing_flag,V271_missing_flag,V272_missing_flag,V273_missing_flag,V274_missing_flag,V275_missing_flag,V276_missing_flag,V277_missing_flag,V278_missing_flag,V279_missing_flag,V280_missing_flag,V281_missing_flag,V282_missing_flag,V283_missing_flag,V284_missing_flag,V285_missing_flag,V286_missing_flag,V287_missing_flag,V288_missing_flag,V289_missing_flag,V290_missing_flag,V291_missing_flag,V292_missing_flag,V293_missing_flag,V294_missing_flag,V295_missing_flag,V296_missing_flag,V297_missing_flag,V298_missing_flag,V299_missing_flag,V300_missing_flag,V301_missing_flag,V302_missing_flag,V303_missing_flag,V304_missing_flag,V305_missing_flag,V306_missing_flag,V307_missing_flag,V308_missing_flag,V309_missing_flag,V310_missing_flag,V311_missing_flag,V312_missing_flag,V313_missing_flag,V314_missing_flag,V315_missing_flag,V316_missing_flag,V317_missing_flag,V318_missing_flag,V319_missing_flag,V320_missing_flag,V321_missing_flag,V322_missing_flag,V323_missing_flag,V324_missing_flag,V325_missing_flag,V326_missing_flag,V327_missing_flag,V328_missing_flag,V329_missing_flag,V330_missing_flag,V331_missing_flag,V332_missing_flag,V333_missing_flag,V334_missing_flag,V335_missing_flag,V336_missing_flag,V337_missing_flag,V338_missing_flag,V339_missing_flag,id_01_missing_flag,id_02_missing_flag,id_03_missing_flag,id_04_missing_flag,id_05_missing_flag,id_06_missing_flag,id_07_missing_flag,id_08_missing_flag,id_09_missing_flag,id_10_missing_flag,id_11_missing_flag,id_12_missing_flag,id_13_missing_flag,id_14_missing_flag,id_15_missing_flag,id_16_missing_flag,id_17_missing_flag,id_18_missing_flag,id_19_missing_flag,id_20_missing_flag,id_21_missing_flag,id_22_missing_flag,id_23_missing_flag,id_24_missing_flag,id_25_missing_flag,id_26_missing_flag,id_27_missing_flag,id_28_missing_flag,id_29_missing_flag,id_30_missing_flag,id_31_missing_flag,id_32_missing_flag,id_33_missing_flag,id_34_missing_flag,id_35_missing_flag,id_36_missing_flag,id_37_missing_flag,id_38_missing_flag,DeviceType_missing_flag,DeviceInfo_missing_flag,_Weekdays,_Hours,_Days,Trans_min_mean,Trans_min_std,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,PCA_V_0,PCA_V_1,PCA_V_2,PCA_V_3,PCA_V_4,PCA_V_5,PCA_V_6,PCA_V_7,PCA_V_8,PCA_V_9,PCA_V_10,PCA_V_11,PCA_V_12,PCA_V_13,PCA_V_14,PCA_V_15,PCA_V_16,PCA_V_17,PCA_V_18,PCA_V_19,PCA_V_20,PCA_V_21,PCA_V_22,PCA_V_23,PCA_V_24,PCA_V_25,PCA_V_26,PCA_V_27,PCA_V_28,PCA_V_29
0,4.76,4.0,7919.0,194.0,150.0,2.0,202.0,2.0,387.0,87.0,6.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,1.0,63.0,63.0,28.0,0.0,41.961279,69.479862,145.286203,0.562265,63.0,63.0,53.154347,17.830537,58.533532,63.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,2.0,2.0,-10.153572,173922.142046,0.063445,-0.056703,1.657035,-6.714579,0.094609,-0.28043,99.747073,1.0,48.006036,-342.530747,1.0,1.0,189.280564,352.1287,404.492715,1.0,1.0,3.0,4.0,26.536931,0.011545,4.0,2.0,2.0,2.0,2.0,2.0,0.041325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,19.0,4.0,-18.03,-0.0754,1.156,0.884,1.001445,0.4607,-0.801,0.3171,0.273,-0.02667,0.0447,-0.007526,-0.0366,-0.2163,0.0177,0.032,-0.001159,-0.0214,0.02342,-0.05185,-0.006638,0.00409,-0.000468,0.002995,-0.005817,0.000197,-0.006874,0.009026,-0.0082,0.003115,0.002178,-0.001935,0.009346,0.004696,-0.01857,0.002054
1,4.605,3.0,12866.0,303.0,150.0,4.0,226.0,2.0,330.0,87.0,116.886968,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,169.952345,28.283617,140.308033,41.961279,0.0,145.286203,0.562265,385.0,148.566535,53.154347,0.0,0.0,0.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,-5.0,3359.0,0.063445,-0.056703,0.0,0.0,0.094609,-0.28043,100.0,2.0,52.0,-300.0,2.0,2.0,166.0,327.0,533.0,2.0,2.0,4.0,0.0,24.0,0.001788,3.0,1.0,0.0,1.0,1.0,0.0,0.0808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,14.0,31.0,-35.03,-0.1465,0.913,0.751,0.522975,0.4382,1.479,-0.2037,-0.3691,0.7285,-0.532,-0.03802,0.5034,-0.2473,-1.045,-0.10266,-0.0527,0.04633,0.0872,-0.04373,-0.0905,-0.0959,0.014915,-0.02176,-0.05396,-0.03485,0.0225,0.05435,-0.01646,0.01248,0.002043,-0.05014,-0.02235,-0.0415,0.03348,-0.01677
2,3.256,4.0,12783.0,501.0,150.0,4.0,226.0,2.0,143.0,87.0,116.886968,2.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,169.952345,28.283617,140.308033,41.961279,69.479862,145.286203,0.562265,0.0,148.566535,53.154347,17.830537,58.533532,0.0,2.0,2.0,2.0,3.0,2.0,0.0,2.0,2.0,2.0,-10.153572,173922.142046,0.063445,-0.056703,1.657035,-6.714579,0.094609,-0.28043,99.747073,1.0,48.006036,-342.530747,1.0,1.0,189.280564,352.1287,404.492715,1.0,1.0,3.0,4.0,26.536931,0.011545,4.0,2.0,2.0,2.0,2.0,2.0,0.041325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,26.0,-109.06,-0.456,0.2769,0.195,0.219224,0.1137,0.471,-0.215,-1.222,-0.08856,-0.2793,0.0216,0.12213,0.1418,0.007343,-0.0463,-0.01707,-0.0304,0.05597,-0.007717,-0.10455,0.02138,-0.008644,-0.004673,0.0234,-0.04147,-0.03577,0.0757,-0.0422,0.02635,0.01199,0.02832,0.01308,0.0697,-0.012794,-0.012985
3,4.195,0.0,2933.0,555.0,162.0,4.0,166.0,2.0,290.646858,86.808377,116.886968,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,169.952345,28.283617,0.0,41.961279,0.0,95.9,0.875,0.0,148.566535,0.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,-10.0,761923.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,2.0,27.0,-342.530747,0.0,0.0,225.0,266.0,305.0,0.0,0.0,3.0,7.0,26.536931,0.011545,4.0,0.0,0.0,1.0,0.0,0.0,0.041325,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.0,21.0,-68.6,-0.287,2.143,0.4985,3.188963,0.2908,-0.014786,-1.217,0.05408,0.652,0.1171,-0.01779,-0.145,0.03687,-0.013145,0.02553,-0.00976,-0.05774,-0.01115,-0.0254,-0.004864,-0.001329,-0.002531,0.00693,0.00916,0.016,0.02042,-0.00958,-0.04892,-0.02876,-0.02602,0.010155,0.009224,0.001772,0.01733,0.002565
4,4.06,4.0,2898.0,127.0,150.0,2.0,117.0,2.0,330.0,87.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,9.0,2.0,90.0,90.0,54.0,506.0,54.0,69.479862,145.286203,0.562265,506.0,454.0,53.154347,17.830537,58.533532,506.0,1.0,1.0,1.0,3.0,2.0,1.0,0.0,0.0,1.0,-10.153572,173922.142046,0.063445,-0.056703,1.657035,-6.714579,0.094609,-0.28043,99.747073,1.0,48.006036,-342.530747,1.0,1.0,189.280564,352.1287,404.492715,1.0,1.0,3.0,4.0,26.536931,0.011545,4.0,2.0,2.0,2.0,2.0,2.0,0.041325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,23.0,-77.06,-0.3223,0.4128,0.4377,0.207005,0.2281,-0.809,0.3738,0.2461,-0.001898,0.02486,-0.004303,-0.008934,0.3132,0.0169,-0.1499,-0.03778,0.03165,-0.01442,-0.01405,0.03262,-0.04425,-0.03403,0.0841,-0.05252,0.0704,0.028,0.02367,0.009186,-0.00827,-0.002535,-0.02626,-0.004776,-0.00305,0.001935,-0.00421


# CatBoost Algo

In [26]:
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=1000,
                           depth=6,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           cat_features=[],  # If you have categorical features, specify their indices here
                           random_state=42,
                           verbose=200)

# Train the model
model.fit(X_train, y_train)


0:	learn: 0.5426656	total: 284ms	remaining: 4m 43s
200:	learn: 0.0824913	total: 29.6s	remaining: 1m 57s
400:	learn: 0.0712766	total: 59s	remaining: 1m 28s
600:	learn: 0.0633995	total: 1m 26s	remaining: 57.3s
800:	learn: 0.0568479	total: 2m 2s	remaining: 30.4s
999:	learn: 0.0515801	total: 2m 33s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78d1ac1a95a0>

# Step 5: Evaluate the Model

In [27]:
# Predict the probabilities for the test set
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"AUC on test set: {auc_score}")


AUC on test set: 0.9218543433448634


# Step 6: Hyperparameter Tuning

In [28]:
import optuna

def objective(trial):
    # Hyperparameters to tune
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'loss_function': 'Logloss',
        'cat_features': [],  # Specify categorical feature indices if any
        'random_state': 42,
        'verbose': 0
    }

    # Initialize the model with trial parameters
    model = CatBoostClassifier(**params)

    # Train the model
    model.fit(X_train, y_train)

    # Get predictions and calculate AUC score
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_prob)

    return auc_score

# Create Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Print best hyperparameters
print("Best hyperparameters: ", study.best_params)


[I 2024-12-05 15:58:28,414] A new study created in memory with name: no-name-d50f075c-102d-42cd-bbf3-ab54202b3d0b
[I 2024-12-05 15:59:39,511] Trial 0 finished with value: 0.8942746147821231 and parameters: {'iterations': 707, 'depth': 4, 'learning_rate': 0.028520573946700063}. Best is trial 0 with value: 0.8942746147821231.
[I 2024-12-05 16:08:54,870] Trial 1 finished with value: 0.9151597348914069 and parameters: {'iterations': 1875, 'depth': 8, 'learning_rate': 0.011019379362855639}. Best is trial 1 with value: 0.9151597348914069.
[I 2024-12-05 16:17:18,842] Trial 2 finished with value: 0.9282874121977409 and parameters: {'iterations': 1746, 'depth': 8, 'learning_rate': 0.09284557233180082}. Best is trial 2 with value: 0.9282874121977409.
[I 2024-12-05 16:18:25,514] Trial 3 finished with value: 0.8913904552633495 and parameters: {'iterations': 538, 'depth': 5, 'learning_rate': 0.02017691551429351}. Best is trial 2 with value: 0.9282874121977409.
[I 2024-12-05 16:24:39,173] Trial 4 fi

Best hyperparameters:  {'iterations': 1361, 'depth': 8, 'learning_rate': 0.06586730464888}


# Step 7: Re-train with Best Hyperparameters

In [29]:
# Get the best hyperparameters from the Optuna study
best_params = study.best_params

# Train the final CatBoost model using the best hyperparameters
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_prob = final_model.predict_proba(X_test)[:, 1]
final_auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"Final AUC on test set: {final_auc_score}")


0:	learn: 0.5832040	total: 338ms	remaining: 7m 39s
1:	learn: 0.4959109	total: 579ms	remaining: 6m 33s
2:	learn: 0.4258784	total: 796ms	remaining: 6m
3:	learn: 0.3700627	total: 1s	remaining: 5m 39s
4:	learn: 0.3246457	total: 1.24s	remaining: 5m 35s
5:	learn: 0.2880101	total: 1.46s	remaining: 5m 30s
6:	learn: 0.2603426	total: 1.68s	remaining: 5m 25s
7:	learn: 0.2351943	total: 1.92s	remaining: 5m 24s
8:	learn: 0.2130271	total: 2.15s	remaining: 5m 23s
9:	learn: 0.1955394	total: 2.41s	remaining: 5m 25s
10:	learn: 0.1817719	total: 2.65s	remaining: 5m 25s
11:	learn: 0.1706343	total: 2.89s	remaining: 5m 24s
12:	learn: 0.1608869	total: 3.11s	remaining: 5m 22s
13:	learn: 0.1525036	total: 3.35s	remaining: 5m 22s
14:	learn: 0.1445034	total: 3.59s	remaining: 5m 22s
15:	learn: 0.1384170	total: 3.83s	remaining: 5m 22s
16:	learn: 0.1335784	total: 4.08s	remaining: 5m 22s
17:	learn: 0.1298820	total: 4.32s	remaining: 5m 22s
18:	learn: 0.1266462	total: 4.53s	remaining: 5m 19s
19:	learn: 0.1231663	total: 4

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Make predictions
y_pred = final_model.predict(X_test)
y_pred_prob = final_model.predict_proba(X_test)[:, 1]

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# AUC-ROC Score
auc_roc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC-ROC Score: {auc_roc:.4f}")


Accuracy: 0.9778
Precision: 0.9093
Recall: 0.4409
F1 Score: 0.5939
AUC-ROC Score: 0.9288
