In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# 1. Load data

In [2]:
df = pd.read_csv('data/fraud_data.csv')
df.head()

Unnamed: 0,Profession,Income,Credit_card_number,Expiry,Security_code,Fraud
0,DOCTOR,42509,3515418493460774,07/25,251,1
1,DOCTOR,80334,213134223583196,05/32,858,1
2,LAWYER,91552,4869615013764888,03/30,755,1
3,LAWYER,43623,341063356109385,01/29,160,1
4,DOCTOR,22962,4707418777543978402,11/30,102,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Profession          10000 non-null  object
 1   Income              10000 non-null  int64 
 2   Credit_card_number  10000 non-null  int64 
 3   Expiry              10000 non-null  object
 4   Security_code       10000 non-null  int64 
 5   Fraud               10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 468.9+ KB


In [4]:
df.isnull().sum()

Profession            0
Income                0
Credit_card_number    0
Expiry                0
Security_code         0
Fraud                 0
dtype: int64

# 2. Exploratory Data Analysis

In [5]:
# Get a copy of the dataframe
df_2 = df.copy()

In [6]:
# Get year expiry
def expire_year(date, current_year=24):
    year = date.split('/')[-1]
    return int(year) - current_year

df_2['Expire_year'] = df_2['Expiry'].apply(expire_year)

In [7]:
# Get one hot encoding for the profession column
df_2 = pd.get_dummies(df_2, columns=['Profession'])

In [8]:
# Drop the Credit_card_number and Expiry columns
df_2.drop(['Credit_card_number', 'Expiry'], axis=1, inplace=True)

In [9]:
df_2.head()

Unnamed: 0,Income,Security_code,Fraud,Expire_year,Profession_DOCTOR,Profession_ENGINEER,Profession_LAWYER
0,42509,251,1,1,True,False,False
1,80334,858,1,8,True,False,False
2,91552,755,1,6,False,False,True
3,43623,160,1,5,False,False,True
4,22962,102,0,6,True,False,False


# 3. Training models

In [10]:
# Split the data into features and target
X = df_2.drop('Fraud', axis=1)
y = df_2['Fraud']

# Split the data into training and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [11]:
X_train.shape, X_val.shape, X_test.shape

((7200, 6), (1800, 6), (1000, 6))

## 3.1. Logistic Regression

In [12]:
log_reg_model = LogisticRegression(random_state=42)
log_reg_params = {
    'penalty': [None, 'l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

log_reg_grid = GridSearchCV(log_reg_model, log_reg_params, cv=5, verbose=1)
log_reg_grid.fit(X_train, y_train)

log_reg_best_model = log_reg_grid.best_estimator_
print(f'Logistic Regression best model: {log_reg_best_model}')

log_reg_pred = log_reg_best_model.predict(X_val)
log_reg_accuracy = accuracy_score(y_val, log_reg_pred)
log_reg_report = classification_report(y_val, log_reg_pred, target_names=['Not Fraud', 'Fraud'])
print(f'Logistic Regression accuracy: {log_reg_accuracy}')
print(f'Logistic Regression report: {log_reg_report}')

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Logistic Regression best model: LogisticRegression(C=0.1, random_state=42)
Logistic Regression accuracy: 0.4872222222222222
Logistic Regression report:               precision    recall  f1-score   support

   Not Fraud       0.50      0.38      0.43       919
       Fraud       0.48      0.60      0.53       881

    accuracy                           0.49      1800
   macro avg       0.49      0.49      0.48      1800
weighted avg       0.49      0.49      0.48      1800



## 3.2. SVM

In [14]:
svm_model = SVC()
svm_params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

svm_grid = GridSearchCV(svm_model, svm_params, cv=5, verbose=1)
svm_grid.fit(X_train, y_train)

svm_best_model = svm_grid.best_estimator_
print(f'SVM best model: {svm_best_model}')

svm_pred = svm_best_model.predict(X_val)
svm_accuracy = accuracy_score(y_val, svm_pred)
svm_report = classification_report(y_val, svm_pred, target_names=['Not Fraud', 'Fraud'])
print(f'SVM accuracy: {svm_accuracy}')
print(f'SVM report: {svm_report}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
SVM best model: SVC(C=10, kernel='linear')
SVM accuracy: 0.4988888888888889
SVM report:               precision    recall  f1-score   support

   Not Fraud       0.51      0.77      0.61       919
       Fraud       0.47      0.21      0.29       881

    accuracy                           0.50      1800
   macro avg       0.49      0.49      0.45      1800
weighted avg       0.49      0.50      0.46      1800



## 3.3. Random Forest

In [12]:
rf_model = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 5]
}

rf_grid = GridSearchCV(rf_model, rf_params, cv=5, verbose=1)
rf_grid.fit(X_train, y_train)

rf_best_model = rf_grid.best_estimator_
print(f'Random Forest best model: {rf_best_model}')

rf_pred = rf_best_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_pred)
rf_report = classification_report(y_val, rf_pred, target_names=['Not Fraud', 'Fraud'])
print(f'Random Forest accuracy: {rf_accuracy}')
print(f'Random Forest classification report: {rf_report}')

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Random Forest best model: RandomForestClassifier(max_depth=10, min_samples_leaf=2, random_state=42)
Random Forest accuracy: 0.49944444444444447
Random Forest classification report: {'Not Fraud': {'precision': 0.5107913669064749, 'recall': 0.46354733405875953, 'f1-score': 0.4860239589275528, 'support': 919.0}, 'Fraud': {'precision': 0.489648033126294, 'recall': 0.5368898978433598, 'f1-score': 0.5121819166215484, 'support': 881.0}, 'accuracy': 0.49944444444444447, 'macro avg': {'precision': 0.5002197000163844, 'recall': 0.5002186159510597, 'f1-score': 0.4991029377745506, 'support': 1800.0}, 'weighted avg': {'precision': 0.5004428796507308, 'recall': 0.49944444444444447, 'f1-score': 0.4988268259988918, 'support': 1800.0}}


## 3.4. XGBoost

In [13]:
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.1, 0.01, 0.001],
}

xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, verbose=1)
xgb_grid.fit(X_train, y_train)

xgb_best_model = xgb_grid.best_estimator_
print(f'XGBoost best model: {xgb_best_model}')

xgb_pred = xgb_best_model.predict(X_val)
xgb_accuracy = accuracy_score(y_val, xgb_pred)
xgb_report = classification_report(y_val, xgb_pred, target_names=['Not Fraud', 'Fraud'])
print(f'XGBoost accuracy: {xgb_accuracy}')
print(f'XGBoost classification report: {xgb_report}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

XGBoost best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=8,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)
XGBoost accuracy: 0.5077777777777778
XGBoost classification report: {'Not Fraud': {'precision': 0.5179542981501633, 'recall': 0.5179542981501633, 'f1-score': 0.5179542981501633, 'support': 919.0}, 'Fraud': {'precision': 0

## 3.5. LightGBM

In [14]:
lgb_model = LGBMClassifier()
lgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.1, 0.01, 0.001],
}

lgb_grid = GridSearchCV(lgb_model, lgb_params, cv=5, verbose=1)
lgb_grid.fit(X_train, y_train)

lgb_best_model = lgb_grid.best_estimator_
print(f'LightGBM best model: {lgb_best_model}')

lgb_pred = lgb_best_model.predict(X_val)
lgb_accuracy = accuracy_score(y_val, lgb_pred)
lgb_report = classification_report(y_val, lgb_pred, target_names=['Not Fraud', 'Fraud'])
print(f'LightGBM accuracy: {lgb_accuracy}')
print(f'LightGBM classification report: {lgb_report}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Number of positive: 2898, number of negative: 2862
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 5760, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503125 -> initscore=0.012500
[LightGBM] [Info] Start training from score 0.012500
[LightGBM] [Info] Number of positive: 2898, number of negative: 2862
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 5

## 3.6. CatBoost

In [15]:
cat_model = CatBoostClassifier(silent=True)    # Set silent=True to suppress output during training
cat_params = {
    'iterations': [100, 200, 300],
    'depth': [3, 5, 8],
    'learning_rate': [0.1, 0.01, 0.001],
}

cat_grid = GridSearchCV(cat_model, cat_params, cv=5, verbose=1)
cat_grid.fit(X_train, y_train)

cat_best_model = cat_grid.best_estimator_
print(f'CatBoost best model: {cat_best_model}')

cat_pred = cat_best_model.predict(X_val)
cat_accuracy = accuracy_score(y_val, cat_pred)
cat_report = classification_report(y_val, cat_pred, target_names=['Not Fraud', 'Fraud'])
print(f'CatBoost accuracy: {cat_accuracy}')
print(f'CatBoost classification report: {cat_report}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
CatBoost best model: <catboost.core.CatBoostClassifier object at 0x000002083A2FB430>
CatBoost accuracy: 0.48277777777777775
CatBoost classification report: {'Not Fraud': {'precision': 0.49019607843137253, 'recall': 0.3264417845484222, 'f1-score': 0.3919007184846506, 'support': 919.0}, 'Fraud': {'precision': 0.47895622895622897, 'recall': 0.6458569807037458, 'f1-score': 0.5500241662638956, 'support': 881.0}, 'accuracy': 0.48277777777777775, 'macro avg': {'precision': 0.48457615369380075, 'recall': 0.486149382626084, 'f1-score': 0.4709624423742731, 'support': 1800.0}, 'weighted avg': {'precision': 0.4846947965493717, 'recall': 0.48277777777777775, 'f1-score': 0.4692933615366033, 'support': 1800.0}}


## 3.5. Best model

In [21]:
models = {
    'Logistic Regression': [log_reg_best_model, log_reg_accuracy],
    'SVM': [svm_best_model, svm_accuracy],
    'Random Forest': [rf_best_model, rf_accuracy],
    'XGBoost': [xgb_best_model, xgb_accuracy],
    'LightGBM': [lgb_best_model, lgb_accuracy],
    'CatBoost': [cat_best_model, cat_accuracy]
}

best_model_name = max(models, key=lambda model: models[model][1])
best_model_accuracy = models[best_model_name][1]
print(f'Best model: {best_model_name}')
print(f'Best model accuracy: {best_model_accuracy}')

Best model: XGBoost
Best model accuracy: 0.5077777777777778


# 4. Evaluation

In [22]:
# Reatin model with the whole training data
best_model = models[best_model_name][0]
best_model.fit(X_train_val, y_train_val)

# Predict on the test data
test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
test_report = classification_report(y_test, test_pred, target_names=['Not Fraud', 'Fraud'])

print(f'Test accuracy: {test_accuracy}')
print(f'Test classification report: {test_report}')

Parameters: { "use_label_encoder" } are not used.



Test accuracy: 0.496
Test classification report:               precision    recall  f1-score   support

   Not Fraud       0.48      0.47      0.47       487
       Fraud       0.51      0.52      0.52       513

    accuracy                           0.50      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.50      0.50      0.50      1000

