In [1]:
# Importing all the dependencies
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


In [2]:
# Load data
file_paths = [
    "2018-04-01.pkl", "2018-04-02.pkl", "2018-04-03.pkl", 
    "2018-04-04.pkl",  "2018-04-06.pkl", "2018-04-07.pkl", 
    "2018-04-08.pkl", "2018-04-09.pkl", "2018-04-10.pkl", 
]
dfs = [pd.read_pickle(file) for file in file_paths]
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Preprocessing
df['TX_TIME_SECONDS'] = pd.to_numeric(df['TX_TIME_SECONDS'], errors='coerce')
df['TX_TIME_DAYS'] = pd.to_numeric(df['TX_TIME_DAYS'], errors='coerce')

In [4]:
# Encode CUSTOMER_ID and TERMINAL_ID
le_customer = LabelEncoder()
le_terminal = LabelEncoder()
df['CUSTOMER_ID'] = le_customer.fit_transform(df['CUSTOMER_ID'])
df['TERMINAL_ID'] = le_terminal.fit_transform(df['TERMINAL_ID'])

In [5]:
# Define features and target
features = ['CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS']
X = df[features]
y = df['TX_FRAUD']

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [7]:
# Model training and Hyperparameter tuning with RandomForestClassifier
rf_model  = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

In [8]:
# Normalize the data (optional, especially for certain models like SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Evaluation
y_pred = rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17183
           1       0.00      0.00      0.00        50

    accuracy                           1.00     17233
   macro avg       0.50      0.50      0.50     17233
weighted avg       0.99      1.00      1.00     17233

Confusion Matrix:
 [[17183     0]
 [   50     0]]
ROC AUC Score: 0.5


In [10]:
from sklearn.metrics import accuracy_score

# Predict on test set
rf_predictions = rf_model.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")




Random Forest Accuracy: 0.9971


In [11]:
# 2. XGBoost Hyperparameter Tuning with RandomizedSearchCV
xgb_model = XGBClassifier(random_state=42)

In [12]:
# Define hyperparameters distribution
xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

In [13]:
# RandomizedSearchCV for XGBoost
xgb_random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)
xgb_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [14]:
# Print best parameters and results
print(f"Best parameters for XGBoost: {xgb_random_search.best_params_}")
print(f"Best cross-validation score: {xgb_random_search.best_score_}")

Best parameters for XGBoost: {'subsample': 0.7, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.7}
Best cross-validation score: 0.9971130552920908


In [15]:
# Evaluate on the test set
xgb_best_model = xgb_random_search.best_estimator_
xgb_predictions = xgb_best_model.predict(X_test)
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17183
           1       0.00      0.00      0.00        50

    accuracy                           1.00     17233
   macro avg       0.50      0.50      0.50     17233
weighted avg       0.99      1.00      1.00     17233



In [16]:
# Predict on test set
xgb_predictions = xgb_best_model.predict(X_test)

# Calculate accuracy
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")


XGBoost Accuracy: 0.9970
