## Train Dataset  

In [1]:
import pandas as pd
import polars as pl

import gc
import sys
import os
sys.path.append(os.path.join(os.getcwd(),'..'))
from utils import reduce_mem_usage


train_identity = pd.read_parquet("../ieee-fraud-detection/train_identity.parquet").pipe(reduce_mem_usage)
train_transaction = pd.read_parquet("../ieee-fraud-detection/train_transaction.parquet").pipe(reduce_mem_usage)
gc.collect()

Memory usage after optimization is: 25.86 MB
Decreased by 42.7%
Memory usage after optimization is: 542.35 MB
Decreased by 69.4%


0

## Merging two of the dataset together

In [2]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
del train_identity, train_transaction
gc.collect()
train.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [3]:
train_id = train['TransactionID']
y = train['isFraud']
X = train.drop(['isFraud', 'TransactionID'], axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float16(354), float32(45), int16(1), int32(2), int8(1), object(31)
memory usage: 646.0+ MB


# Numerical Variables

# Train 

In [5]:
numerical_features = [feature for feature in X.columns if X[feature].dtype != pl.String]
categorical_features = [feature for feature in X.columns if train[feature].dtype == pl.String]

print('Number of numerical variables: ', len(numerical_features))
print('Number of categorical variables: ', len(categorical_features))

# visualise the numerical variables
train[numerical_features].head()
gc.collect()

Number of numerical variables:  401
Number of categorical variables:  31


304

In [6]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA


# numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Update the categorical pipeline to use OrdinalEncoder
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# full pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

pca_preprocessor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA())
])

pca_preprocessor

##  Test


In [7]:
test_identity = pd.read_parquet("../ieee-fraud-detection/test_identity.parquet").pipe(reduce_mem_usage)
test_transaction = pd.read_parquet("../ieee-fraud-detection/test_transaction.parquet").pipe(reduce_mem_usage)
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
del test_transaction, test_identity
gc.collect()

Memory usage after optimization is: 25.44 MB
Decreased by 42.7%
Memory usage after optimization is: 472.59 MB
Decreased by 68.9%


0

In [8]:
test = test.rename(columns={"id-01": "id_01", "id-02": "id_02", "id-03": "id_03", 
                            "id-06": "id_06", "id-05": "id_05", "id-04": "id_04", 
                            "id-07": "id_07", "id-08": "id_08", "id-09": "id_09", 
                            "id-10": "id_10", "id-11": "id_11", "id-12": "id_12", 
                            "id-15": "id_15", "id-14": "id_14", "id-13": "id_13", 
                            "id-16": "id_16", "id-17": "id_17", "id-18": "id_18", 
                            "id-21": "id_21", "id-20": "id_20", "id-19": "id_19", 
                            "id-22": "id_22", "id-23": "id_23", "id-24": "id_24", 
                            "id-27": "id_27", "id-26": "id_26", "id-25": "id_25", 
                            "id-28": "id_28", "id-29": "id_29", "id-30": "id_30", 
                            "id-31": "id_31", "id-32": "id_32", "id-33": "id_33", 
                            "id-34": "id_34", "id-35": "id_35", "id-36": "id_36", 
                            "id-37": "id_37", "id-38": "id_38"})
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [9]:
test_id = test['TransactionID']
X_test = test.drop(['TransactionID'], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20, stratify=y)

# Model Creation

In [11]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from catboost import CatBoostClassifier

# Define the pipeline
clf = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', CatBoostClassifier(iterations=500,
                            learning_rate=0.1,
                            loss_function='Logloss',
                            eval_metric='F1',
                            random_seed=42,
                            verbose=100))
])
clf.fit(x_train, y_train)

0:	learn: 0.8232916	total: 634ms	remaining: 5m 16s
100:	learn: 0.9682510	total: 41.6s	remaining: 2m 44s
200:	learn: 0.9779339	total: 1m 20s	remaining: 1m 59s
300:	learn: 0.9815361	total: 2m 2s	remaining: 1m 20s
400:	learn: 0.9834943	total: 2m 40s	remaining: 39.6s
499:	learn: 0.9848410	total: 3m 19s	remaining: 0us


In [12]:
# Evaluate Classification
from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[113656    319]
 [  2256   1877]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.85      0.45      0.59      4133

    accuracy                           0.98    118108
   macro avg       0.92      0.73      0.79    118108
weighted avg       0.98      0.98      0.97    118108

