## Performing Fraud Detection analysis

In [1]:
# Standard imports for data manipulation

import numpy as np
import pandas as pd
import json
import gc
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score

import xgboost as xgb
import lightgbm as lgb


print("Files available in this notebook ===> ")

DATASET_PATH = '../data/raw/transaction_kaggle_dataset/'

for dirname, _, filenames in os.walk(DATASET_PATH):
  for filename in filenames:
    print("dirname ==> \n", dirname)
    print("filename ==> \n", filename)
    print(os.path.join(dirname, filename))


print("\nLoading data files...")
try:
  # Load fraud labels from JSON
  file_path_fraud_labels = DATASET_PATH + 'train_fraud_labels.json'

  with open(file_path_fraud_labels, 'r') as file:

    if os.path.getsize(file_path_fraud_labels) > 0:
      raw_json_data = json.load(file)
    else:
      # Handle the empty file case, e.g., by providing a default value
      raw_json_data = {'target': {}}
      print(f"Warning: The file '{file_path_fraud_labels}' is empty. Using default data.")

  transaction_labels_dict = raw_json_data['target']
  train_fraud_labels = pd.Series(transaction_labels_dict).reset_index()
  train_fraud_labels.columns = ['transaction_id', 'is_fraud']
  train_fraud_labels['transaction_id'] = pd.to_numeric(train_fraud_labels['transaction_id'])

  # Load other data files
  transaction_df = pd.read_csv(DATASET_PATH + 'transactions_data.csv')
  card_df = pd.read_csv(DATASET_PATH + 'cards_data.csv')
  users_df = pd.read_csv(DATASET_PATH + "users_data.csv")
  mcc_series = pd.read_json(DATASET_PATH + 'mcc_codes.json', typ='series')
  mcc_df = mcc_series.reset_index()
  mcc_df.columns = ['mcc_code', 'description']

  print("\nAll data files loaded successfully.")

except FileNotFoundError:
  print("\nERROR: FileNotFoundError. Please ensure the dataset is attached to the notebook (using the '+ Add data' button on the right) and the DATASET_PATH is correct.")

except json.JSONDecodeError as e:
  print(f"JSON decoding failed: {e}")

Files available in this notebook ===> 
dirname ==> 
 ../data/raw/transaction_kaggle_dataset/
filename ==> 
 cards_data.csv
../data/raw/transaction_kaggle_dataset/cards_data.csv
dirname ==> 
 ../data/raw/transaction_kaggle_dataset/
filename ==> 
 mcc_codes.json
../data/raw/transaction_kaggle_dataset/mcc_codes.json
dirname ==> 
 ../data/raw/transaction_kaggle_dataset/
filename ==> 
 train_fraud_labels.json
../data/raw/transaction_kaggle_dataset/train_fraud_labels.json
dirname ==> 
 ../data/raw/transaction_kaggle_dataset/
filename ==> 
 transactions_data.csv
../data/raw/transaction_kaggle_dataset/transactions_data.csv
dirname ==> 
 ../data/raw/transaction_kaggle_dataset/
filename ==> 
 users_data.csv
../data/raw/transaction_kaggle_dataset/users_data.csv

Loading data files...

All data files loaded successfully.


In [2]:
# --- Merge all DataFrames into one comprehensive DataFrame ---

# Start with the main transactions data
df = pd.merge(transaction_df, train_fraud_labels, left_on='id', right_on='transaction_id', how='left')

# Merge with card data (card_df.id is the card_id)
# Using suffixes to handle potential duplicate column names (e.g., 'id' in both)
df = pd.merge(df, card_df, left_on='card_id', right_on='id', how='left', suffixes=('', '_card'))

# Merge with user data (users_df.id is the client_id)
# Using suffixes again for robustness
df = pd.merge(df, users_df, left_on='client_id', right_on='id', how='left', suffixes=('', '_user'))

# Merge with MCC descriptions (mcc_df.mcc_code is the mcc)
# Note: 'mcc' is the code in transaction_df, 'mcc_code' is the code in mcc_df
df = pd.merge(df, mcc_df, left_on='mcc', right_on='mcc_code', how='left')

# --- Clean up merged columns ---
# Drop redundant ID columns from the merges
# 'transaction_id' is redundant with 'id' (from transaction_df)
# 'id_card' is redundant with 'card_id'
# 'id_user' is redundant with 'client_id'
# 'mcc_code' is redundant with 'mcc'
df = df.drop(columns=['transaction_id', 'id_card', 'id_user', 'mcc_code'])

# Delete original dataframes to free up memory
del transaction_df, train_fraud_labels, card_df, users_df, mcc_df
gc.collect() # Manually trigger garbage collection

print("All data merged into a single DataFrame and original tables deleted.")
print(f"Shape of the final merged DataFrame: {df.shape}")

All data merged into a single DataFrame and original tables deleted.
Shape of the final merged DataFrame: (13305915, 39)


In [3]:
df.head(2)

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,...,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,description
0,7475327,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,59935,Beulah,ND,58523.0,...,Female,594 Mountain View Street,46.8,-100.76,$23679,$48277,$110153,740,4,Miscellaneous Food Stores
1,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,67570,Bettendorf,IA,52722.0,...,Male,604 Pine Street,40.8,-91.12,$18076,$36853,$112139,834,5,Department Stores


In [4]:
# Drop rows where the fraud label is missing (these are unlabeled transactions)
# This is a critical step as supervised learning requires labeled data.
df.dropna(subset=['is_fraud'], inplace=True)

# Convert the target variable 'is_fraud' to a numerical format (0 for 'No', 1 for 'Yes')
# This is required by most machine learning algorithms.
df['is_fraud'] = df['is_fraud'].map({'No': 0, 'Yes': 1})

# Define features (X) and target (y)
features = [col for col in df.columns if col != 'is_fraud']
X = df[features]
y = df['is_fraud']

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into training (60%), and a temporary set (40%)
# The first split takes 40% for temp, leaving 60% for X_train.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.40, random_state=42, stratify=y)

# Split the temporary set (which is 40% of original) into validation (20% of original) and test (20% of original)
# 0.50 of X_temp (40%) = 20% of original.
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

# Delete the large intermediate dataframes to save memory (important for Kaggle kernels)
del df, X, y, X_temp, y_temp
gc.collect() # Manually trigger garbage collection

print("Full dataset split into training, validation, and test sets.")
print(f"X_train shape: {X_train.shape}")
print(f"X_cv shape: {X_cv.shape}")
print(f"X_test shape: {X_test.shape}")

Full dataset split into training, validation, and test sets.
X_train shape: (5348977, 38)
X_cv shape: (1782993, 38)
X_test shape: (1782993, 38)


In [6]:
import numpy as np
import pandas as pd

def apply_preprocessing(df, is_training_set=False, median_imputations=None):
    """
    Takes a raw data split and applies all feature engineering steps.
    """
    df_processed = df.copy()

    # --- Step 1: Clean Numerical Columns ---
    amount_cols = ['amount', 'per_capita_income', 'yearly_income', 'credit_limit', 'total_debt']
    for col in amount_cols:
        if col in df_processed.columns:
            df_processed[col] = pd.to_numeric(df_processed[col].astype(str).str.replace(r'[$,]', '', regex=True), errors='coerce')

    # --- Step 2: Date Engineering ---
    date_cols = ['date', 'expires', 'acct_open_date']
    for col in date_cols:
        if col in df_processed.columns:
            df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce', format='mixed')

    if 'date' in df_processed.columns:
        df_processed['hour_of_day'] = df_processed['date'].dt.hour
        df_processed['day_of_week'] = df_processed['date'].dt.dayofweek
        df_processed['month'] = df_processed['date'].dt.month
    if 'expires' in df_processed.columns and 'date' in df_processed.columns:
        df_processed['days_to_expiry'] = (df_processed['expires'] - df_processed['date']).dt.days

    # Drop original date columns right after use, as you suggested
    df_processed.drop(columns=date_cols, inplace=True)

    # --- Step 3: Cyclical Feature Creation ---
    cyclical_cols_original = ['hour_of_day', 'day_of_week', 'month']
    if all(col in df_processed.columns for col in cyclical_cols_original):
        df_processed['hour_sin'] = np.sin(2 * np.pi * df_processed['hour_of_day'] / 24.0)
        df_processed['hour_cos'] = np.cos(2 * np.pi * df_processed['hour_of_day'] / 24.0)
        df_processed['day_of_week_sin'] = np.sin(2 * np.pi * df_processed['day_of_week'] / 7.0)
        df_processed['day_of_week_cos'] = np.cos(2 * np.pi * df_processed['day_of_week'] / 7.0)
        df_processed['month_sin'] = np.sin(2 * np.pi * df_processed['month'] / 12.0)
        df_processed['month_cos'] = np.cos(2 * np.pi * df_processed['month'] / 12.0)

        # Drop original cyclical columns right after use, as you suggested
        df_processed.drop(columns=cyclical_cols_original, inplace=True)

    # --- Step 4: Process Binary and Other Features ---
    if 'errors' in df_processed.columns:
        df_processed['has_error'] = df_processed['errors'].notna().astype(int)
    if 'gender' in df_processed.columns:
        df_processed['gender'] = df_processed['gender'].map({'Female': 0, 'Male': 1})
    if 'has_chip' in df_processed.columns:
        df_processed['has_chip'] = df_processed['has_chip'].map({'NO': 0, 'YES': 1})

    # --- Step 5: Final NaN Imputation for numerical features (within this function) ---
    # This is done after all numerical features are created.
    # Medians will be calculated from X_train when this function is called on X_train.
    # Then passed to X_cv and X_test calls.
    numerical_cols_for_imputation = df_processed.select_dtypes(include=np.number).columns.tolist()
    if is_training_set:
        median_imputations = df_processed[numerical_cols_for_imputation].median()

    if median_imputations is not None:
        df_processed.fillna(median_imputations, inplace=True)

    return df_processed, median_imputations # Return processed DF and medians

print("Feature engineering function 'create_all_features' defined.")

Feature engineering function 'create_all_features' defined.


In [7]:
# --- Main preprocessing execution block ---
print("Step 1: Applying feature engineering to all data splits...")

# Initialize median_imputations_dict outside the loop; it will be populated by X_train's processing
median_imputations_dict = None

# Apply preprocessing to X_train (where medians for imputation are learned)
X_train, median_imputations_dict = apply_preprocessing(X_train, is_training_set=True)
# Apply preprocessing to X_cv and X_test (using medians learned from X_train)
X_cv, _ = apply_preprocessing(X_cv, median_imputations=median_imputations_dict)
X_test, _ = apply_preprocessing(X_test, median_imputations=median_imputations_dict)
gc.collect() # Clean up memory

print('Stage 1: Feature Engineering applied to all splits.')
print(f"X_train shape after initial FE: {X_train.shape}")
print(f"X_train columns after initial FE (first 10): {X_train.columns.tolist()[:10]}...")




Step 1: Applying feature engineering to all data splits...
Stage 1: Feature Engineering applied to all splits.
X_train shape after initial FE: (5348977, 43)
X_train columns after initial FE (first 10): ['id', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc']...


In [8]:
print("Step 2: Dropping all unnecessary columns...")

# This is the complete master list of columns to drop, consolidated from our discussions.
final_cols_to_drop = [
    # Identifiers and Sensitive Data
    'id', 'client_id', 'card_id', 'merchant_id',
    'card_number', 'cvv','mcc'

    # Problematic Date Columns (and their direct derivatives if still present)
    'acct_open_date', 'year_pin_last_changed',
    # 'account_age_days', 'years_since_pin_change' should be gone if derived from above

    # Redundant/Low-Value Categoricals/Text
    'card_on_dark_web', # This was a single-value column, so its OHE version 'card_on_dark_web_No' would also be single-value.
    'has_chip', # Replaced by 'has_chip_binary'
    'address', # High cardinality text, not used for FE
    'merchant_city', # High cardinality categorical, often redundant with zip/state

    # Redundant Age Features
    'birth_year',
    'birth_month',

    # Geospatial (dropped as I decided against complex geospatial FE)
    'latitude',
    'longitude',

    # Original Date Columns (replaced by extracted features)
    # These should be dropped by apply_preprocessing, but included here for robustness if they somehow remain.
    'date', 'expires',
    # 'hour_of_day', 'day_of_week', 'month' should also be dropped by apply_preprocessing
]

for df_set in [X_train, X_cv, X_test]:
    # Filter list to only drop columns that actually exist in the DataFrame
    cols_that_exist = [col for col in final_cols_to_drop if col in df_set.columns]
    df_set.drop(columns=cols_that_exist, inplace=True, errors='ignore') # Use errors='ignore' for robustness

print('Stage 2: Unnecessary columns dropped.')
print(f"X_train shape after dropping: {X_train.shape}")
print(f"X_train columns after dropping (first 10): {X_train.columns.tolist()[:10]}...")


Step 2: Dropping all unnecessary columns...
Stage 2: Unnecessary columns dropped.
X_train shape after dropping: (5348977, 28)
X_train columns after dropping (first 10): ['amount', 'use_chip', 'merchant_state', 'zip', 'mcc', 'errors', 'client_id_card', 'card_brand', 'card_type', 'num_cards_issued']...


In [9]:
from sklearn.preprocessing import OneHotEncoder
print("Step 3: Grouping and one-hot encoding...")

# --- Handle merchant_state: Grouping ---
# Group merchant_state based on training set fraud counts (CRUCIAL for no data leakage)
# This part needs to be outside the loop and use y_train.
# Ensure 'merchant_state' is clean (NaNs filled) before this step.
temp_train_df = pd.DataFrame({'merchant_state': X_train['merchant_state'], 'is_fraud': y_train})
fraud_counts = temp_train_df[temp_train_df['is_fraud'] == 1]['merchant_state'].value_counts()
top_15_fraud_states = fraud_counts.nlargest(15).index.tolist()
del temp_train_df, fraud_counts # Clean up temporary data

# Apply grouping to all splits using the list derived from X_train
for df_set in [X_train, X_cv, X_test]:
    if 'merchant_state' in df_set.columns:
        # Use .loc for safe assignment
        df_set.loc[:, 'merchant_state'] = df_set['merchant_state'].apply(lambda x: x if x in top_15_fraud_states else 'OTHER_STATE')


# --- One-Hot Encode all remaining 'object' type columns ---
# This includes 'merchant_state', 'errors', 'card_brand', 'card_type', 'description', 'use_chip'
# and 'gender' if they are still object type.
categorical_cols_to_encode = X_train.select_dtypes(include=['object']).columns.tolist()
print(f'Categorical columns to encode: {categorical_cols_to_encode}')

# Initialize OneHotEncoder (fit only on X_train to prevent data leakage)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=np.int8)
encoder.fit(X_train[categorical_cols_to_encode])

# Transform and concatenate for all splits
encoded_cols_train = pd.DataFrame(encoder.transform(X_train[categorical_cols_to_encode]), index=X_train.index, columns=encoder.get_feature_names_out(categorical_cols_to_encode))
encoded_cols_cv = pd.DataFrame(encoder.transform(X_cv[categorical_cols_to_encode]), index=X_cv.index, columns=encoder.get_feature_names_out(categorical_cols_to_encode))
encoded_cols_test = pd.DataFrame(encoder.transform(X_test[categorical_cols_to_encode]), index=X_test.index, columns=encoder.get_feature_names_out(categorical_cols_to_encode))

X_train = pd.concat([X_train.drop(columns=categorical_cols_to_encode), encoded_cols_train], axis=1)
X_cv = pd.concat([X_cv.drop(columns=categorical_cols_to_encode), encoded_cols_cv], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_cols_to_encode), encoded_cols_test], axis=1)

gc.collect() # Clean up memory

print('Stage 3: Grouping and one-hot encoding complete.')
print(f"X_train shape after OHE: {X_train.shape}")
print(f"X_train columns after OHE (first 10): {X_train.columns.tolist()[:10]}...")



Step 3: Grouping and one-hot encoding...
Categorical columns to encode: ['use_chip', 'merchant_state', 'errors', 'card_brand', 'card_type', 'description']
Stage 3: Grouping and one-hot encoding complete.
X_train shape after OHE: (5348977, 178)
X_train columns after OHE (first 10): ['amount', 'zip', 'mcc', 'client_id_card', 'num_cards_issued', 'credit_limit', 'current_age', 'retirement_age', 'gender', 'per_capita_income']...


In [10]:
print("Step 4: Downcasting data types for memory efficiency...")
for df_set in [X_train, X_cv, X_test]:
    for col in df_set.select_dtypes(include=['float64', 'int64']).columns:
        if 'float' in str(df_set[col].dtype):
            df_set.loc[:, col] = df_set[col].astype('float32')
        else:
            df_set.loc[:, col] = pd.to_numeric(df_set[col], downcast='integer')
gc.collect() # Clean up memory

print("\n--- Preprocessing Fully Complete ---")
print(f"X_train final shape: {X_train.shape}")
print(f"X_cv final shape: {X_cv.shape}")
print(f"X_test final shape: {X_test.shape}")

Step 4: Downcasting data types for memory efficiency...

--- Preprocessing Fully Complete ---
X_train final shape: (5348977, 178)
X_cv final shape: (1782993, 178)
X_test final shape: (1782993, 178)


In [11]:
print("\n--- Final NaN Check Across ALL Remaining Columns ---")
nan_counts_xtrain = X_train.isna().sum()
print(f"\nX_train NaNs:")
print(nan_counts_xtrain[nan_counts_xtrain > 0]) # Should be empty!

nan_counts_xcv = X_cv.isna().sum()
print(f"\nX_cv NaNs:")
print(nan_counts_xcv[nan_counts_xcv > 0])

nan_counts_test = X_test.isna().sum()
print(f"\nX_test NaNs:")
print(nan_counts_test[nan_counts_test > 0])

if nan_counts_xtrain[nan_counts_xtrain > 0].empty and \
   nan_counts_xcv[nan_counts_xcv > 0].empty and \
   nan_counts_test[nan_counts_test > 0].empty:
    print("\nAll DataFrames are clean (no NaNs found). Ready for modeling!")
else:
    print("\nWARNING: NaNs still present in DataFrames. Please review preprocessing steps.")


--- Final NaN Check Across ALL Remaining Columns ---

X_train NaNs:
Series([], dtype: int64)

X_cv NaNs:
Series([], dtype: int64)

X_test NaNs:
Series([], dtype: int64)

All DataFrames are clean (no NaNs found). Ready for modeling!


In [12]:
#Model training using XGBoost Classifier

# Calculate scale_pos_weight from the training target variable
neg_count = y_train.value_counts()[0]
pos_count = y_train.value_counts()[1]
scale_pos_weight_value = neg_count / pos_count
print(f"Count of Legitimate Transactions (0) in y_train: {neg_count}")
print(f"Count of Fraudulent Transactions (1) in y_train: {pos_count}")
print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.2f}\n")

Count of Legitimate Transactions (0) in y_train: 5340978
Count of Fraudulent Transactions (1) in y_train: 7999
Calculated scale_pos_weight: 667.71



In [13]:
import xgboost as xgb

# Initialize and Train XGBoost Classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',        # Metric for evaluation during training
    use_label_encoder=False,      # Suppress a future deprecation warning
    scale_pos_weight=scale_pos_weight_value, # Crucial for imbalance
    random_state=42,              # For reproducibility
    n_estimators=500,             # Number of boosting rounds (trees)
    learning_rate=0.05,           # Step size shrinkage to prevent overfitting
    max_depth=5,                  # Maximum depth of a tree
    subsample=0.7,                # Subsample ratio of the training instance
    colsample_bytree=0.7,         # Subsample ratio of columns when constructing each tree
    gamma=0.1                     # Minimum loss reduction required to make a further partition
)

print("Training XGBoost model...")
xgb_model.fit(X_train, y_train)
print("Model training complete.\n")

# Evaluate on Validation Set
y_pred_cv = xgb_model.predict(X_cv)
y_proba_cv = xgb_model.predict_proba(X_cv)[:, 1] # Probabilities for the positive class

print("--- Evaluation on Validation Set (X_cv) ---")
print("\nConfusion Matrix:")
print(confusion_matrix(y_cv, y_pred_cv))
print("\nClassification Report:")
print(classification_report(y_cv, y_pred_cv))
roc_auc = roc_auc_score(y_cv, y_proba_cv)
print(f"\nROC AUC Score: {roc_auc:.4f}")

# Display Feature Importances
print("\nFeature Importances (Top 15):")
feature_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
print(feature_importances.nlargest(15))

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model training complete.

--- Evaluation on Validation Set (X_cv) ---

Confusion Matrix:
[[1744201   36125]
 [    178    2489]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1780326
           1       0.06      0.93      0.12      2667

    accuracy                           0.98   1782993
   macro avg       0.53      0.96      0.56   1782993
weighted avg       1.00      0.98      0.99   1782993


ROC AUC Score: 0.9928

Feature Importances (Top 15):
merchant_state_Italy                                               0.123290
description_Tolls and Bridge Fees                                  0.108466
use_chip_Online Transaction                                        0.104134
description_Taxicabs and Limousines                                0.031447
zip                                                                0.030257
merchant_state_Haiti                                               0.030215
merchant_st

In [14]:
# Get feature importances from the model I just trained
feature_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)

# Identify the least important features to drop for tuning.
# I'll drop the bottom 10 features for this example. This number can be adjusted.
low_importance_features = feature_importances.nsmallest(10).index.tolist()

print(f"Identified {len(low_importance_features)} features with the lowest importance to drop for tuning.")

# Create reduced DataFrames for the search.
# RandomizedSearchCV will run on these smaller datasets to save time and memory.
X_train_model2 = X_train.drop(columns=low_importance_features, errors='ignore')
X_cv_model2 = X_cv.drop(columns=low_importance_features, errors='ignore')
X_test_model2=X_test.drop(columns=low_importance_features, errors='ignore')

del X_train,X_cv,X_test
gc.collect()

print(f"New shape of X_train for tuning: {X_train_model2.shape}")
print(f"New shape of X_cv for tuning: {X_cv_model2.shape}")
print(f"New shape of X_test for tuning: {X_test_model2.shape}")

Identified 10 features with the lowest importance to drop for tuning.
New shape of X_train for tuning: (5348977, 168)
New shape of X_cv for tuning: (1782993, 168)
New shape of X_test for tuning: (1782993, 168)


In [15]:
print("\n--- Creating Interaction Features for Model 2's Reduced Datasets ---")

# List of all dataframes to apply interactions to (these are the already reduced ones)
dfs_for_interactions_model2 = [X_train_model2, X_cv_model2, X_test_model2]

for df_set in dfs_for_interactions_model2:
    # --- Amount x Top Categorical/Binary Features ---
    # Check if base columns exist and are numeric before creating interactions.

    if all(col in df_set.columns for col in ['amount', 'merchant_state_Italy']):
        df_set['amount_x_state_italy'] = df_set['amount'] * df_set['merchant_state_Italy']
    if all(col in df_set.columns for col in ['amount', 'description_Tolls and Bridge Fees']):
        df_set['amount_x_tolls'] = df_set['amount'] * df_set['description_Tolls and Bridge Fees']
    if all(col in df_set.columns for col in ['amount', 'use_chip_Online Transaction']):
        df_set['amount_x_online_trans'] = df_set['amount'] * df_set['use_chip_Online Transaction']
    if all(col in df_set.columns for col in ['amount', 'use_chip_Swipe Transaction']):
        df_set['amount_x_swipe_trans'] = df_set['amount'] * df_set['use_chip_Swipe Transaction']
    if all(col in df_set.columns for col in ['amount', 'merchant_state_Haiti']):
        df_set['amount_x_state_haiti'] = df_set['amount'] * df_set['merchant_state_Haiti']
    if all(col in df_set.columns for col in ['amount', 'description_Taxicabs and Limousines']):
        df_set['amount_x_taxis_limos'] = df_set['amount'] * df_set['description_Taxicabs and Limousines']
    if all(col in df_set.columns for col in ['amount', 'use_chip_Chip Transaction']):
        df_set['amount_x_chip_trans'] = df_set['amount'] * df_set['use_chip_Chip Transaction']
    if all(col in df_set.columns for col in ['amount', 'merchant_state_OTHER_STATE']):
        df_set['amount_x_state_other'] = df_set['amount'] * df_set['merchant_state_OTHER_STATE']

    # --- Other Interaction Ideas (Non-Amount Based) ---
    if all(col in df_set.columns for col in ['credit_score', 'use_chip_Online Transaction']):
        df_set['credit_score_x_online_trans'] = df_set['credit_score'] * df_set['use_chip_Online Transaction']

    # Debt-to-Income Ratio (handle division by zero if yearly_income can be 0)
    if all(col in df_set.columns for col in ['total_debt', 'yearly_income']):
        # Add a small epsilon to yearly_income to prevent division by zero
        df_set['debt_to_income_ratio'] = df_set['total_debt'] / (df_set['yearly_income'] + 1e-6)

print("Interaction features created for Model 2 datasets.")
print(f"X_train_model2 shape is now: {X_train_model2.shape}")
print(f"X_cv_model2 shape is now: {X_cv_model2.shape}")
print(f"X_test_model2 shape is now: {X_test_model2.shape}")


--- Creating Interaction Features for Model 2's Reduced Datasets ---
Interaction features created for Model 2 datasets.
X_train_model2 shape is now: (5348977, 178)
X_cv_model2 shape is now: (1782993, 178)
X_test_model2 shape is now: (1782993, 178)


In [16]:
# model 2training again with xgboost classifier

# Initialize and Train XGBoost Classifier
xgb_model_2 = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    eval_metric='logloss',        # Metric for evaluation during training
    use_label_encoder=False,      # Suppress a future deprecation warning
    scale_pos_weight=scale_pos_weight_value, # Crucial for imbalance
    random_state=42,              # For reproducibility
    n_estimators=500,             # Number of boosting rounds (trees)
    learning_rate=0.05,           # Step size shrinkage to prevent overfitting
    max_depth=5,                  # Maximum depth of a tree
    subsample=0.7,                # Subsample ratio of the training instance
    colsample_bytree=0.7,         # Subsample ratio of columns when constructing each tree
    gamma=0.1                     # Minimum loss reduction required to make a further partition
)

print("Training XGBoost model...")
xgb_model_2.fit(X_train_model2, y_train)
print("Model training complete.\n")

# Evaluate on Validation Set
y_pred_cv = xgb_model_2.predict(X_cv_model2)
y_proba_cv = xgb_model_2.predict_proba(X_cv_model2)[:, 1] # Probabilities for the positive class

print("--- Evaluation on Validation Set (X_cv_model2) ---")
print("\nConfusion Matrix:")
print(confusion_matrix(y_cv, y_pred_cv))
print("\nClassification Report:")
print(classification_report(y_cv, y_pred_cv))
roc_auc = roc_auc_score(y_cv, y_proba_cv)
print(f"\nROC AUC Score: {roc_auc:.4f}")

# Display Feature Importances
print("\nFeature Importances (Top 15):")
feature_importances = pd.Series(xgb_model_2.feature_importances_, index=X_train_model2.columns)
print(feature_importances.nlargest(15))

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model training complete.

--- Evaluation on Validation Set (X_cv_model2) ---

Confusion Matrix:
[[1744799   35527]
 [    178    2489]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1780326
           1       0.07      0.93      0.12      2667

    accuracy                           0.98   1782993
   macro avg       0.53      0.96      0.56   1782993
weighted avg       1.00      0.98      0.99   1782993


ROC AUC Score: 0.9927

Feature Importances (Top 15):
use_chip_Online Transaction                                        0.110121
merchant_state_Italy                                               0.106560
description_Tolls and Bridge Fees                                  0.096222
amount_x_state_italy                                               0.060888
description_Taxicabs and Limousines                                0.032930
merchant_state_Haiti                                               0.030947
amou

In [None]:
import joblib


joblib.dump(xgb_model, 'xgb_model.pkl')

['xgb_model.pkl']

In [20]:
joblib.dump(xgb_model_2, 'xgb_model_2.pkl')

['xgb_model_2.pkl']

In [18]:
# --- Section 8: Hyperparameter Tuning with Randomized Search (Memory-Optimized) ---

print("\n--- Starting Hyperparameter Tuning with Randomized Search (Memory-Optimized) ---")

# --- IMPORTANT: Memory Management for Tuning ---
# Create a smaller sample of your training data for tuning.
# This is the key to avoiding MemoryError during cross-validation,
# as RandomizedSearchCV will train on this smaller subset.
SAMPLE_SIZE_FOR_TUNING = 2500000 # Adjust this size based on your RAM. 500k-1M is often a good starting point.
                               # This will be roughly 1/10th of your training data.

# Ensure y_train is aligned with the sample of X_train_model2_base
# Use .copy() to ensure these are independent samples.
X_train_sample_for_tuning = X_train_model2.sample(n=SAMPLE_SIZE_FOR_TUNING, random_state=42).copy()
y_train_sample_for_tuning = y_train.loc[X_train_sample_for_tuning.index].copy() # Ensure y is aligned


print(f"Created a tuning sample of size: {X_train_sample_for_tuning.shape}")


# --- 1. Define the Parameter Grid ---
# These are the hyperparameters I want to tune.
param_distributions = {
    'n_estimators': [500, 750,1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 6, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2]
}

# --- 2. Initialize the XGBoost Classifier and RandomizedSearchCV ---
# I use the fixed parameters like scale_pos_weight from before.
base_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    # use_label_encoder=False, # REMOVE THIS LINE (it's obsolete and causes a UserWarning)
    scale_pos_weight=scale_pos_weight_value, # This is the value calculated before
    random_state=42
)

# Set up the search
# n_iter=10 (reduced for faster search on sample)
# cv=3 (keep 3-fold CV for robustness on sample)
random_search = RandomizedSearchCV(
    estimator=base_xgb,
    param_distributions=param_distributions,
    n_iter=10,         # Number of parameter combinations to try
    cv=3,              # 3-fold cross-validation
    scoring='roc_auc', # The best metric for this problem
    verbose=2,         # This will print progress updates
    random_state=42,
    n_jobs=1          # Use all available CPU cores
)

# --- 3. Run the Search ---
print("\nRunning RandomizedSearchCV on SAMPLE training data...")
# FIT ON THE SAMPLE DATASET HERE:
random_search.fit(X_train_sample_for_tuning, y_train_sample_for_tuning)
print("--- Search Complete ---")

# --- 4. Analyze the Results ---
print(f"\nBest ROC AUC score found on sample: {random_search.best_score_:.4f}")
print("Best parameters found on sample:")
print(random_search.best_params_)

# Get the best model from the search (this model is trained on the sample)
best_xgb_model = random_search.best_estimator_


--- Starting Hyperparameter Tuning with Randomized Search (Memory-Optimized) ---
Created a tuning sample of size: (2500000, 178)

Running RandomizedSearchCV on SAMPLE training data...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=6, n_estimators=500, subsample=0.8; total time= 4.9min
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=6, n_estimators=500, subsample=0.8; total time= 6.2min


KeyboardInterrupt: 

In [None]:
# Evaluate the best model on the validation set (X_cv)
print("\n--- Evaluation of Best Tuned Model on Validation Set (X_cv) ---")
y_pred_cv_tuned = best_xgb_model.predict(X_cv_model2) # Use X_cv_reduced for evaluation
y_proba_cv_tuned = best_xgb_model.predict_proba(X_cv_model2)[:, 1]

print("\nConfusion Matrix:")
print(confusion_matrix(y_cv, y_pred_cv_tuned))
print("\nClassification Report:")
print(classification_report(y_cv, y_pred_cv_tuned))
roc_auc_tuned = roc_auc_score(y_cv, y_proba_cv_tuned)
print(f"\nROC AUC Score: {roc_auc_tuned:.4f}")

# Display Feature Importances for the best model
print("\nFeature Importances (Top 20 from Tuned Model):")
feature_importances_tuned = pd.Series(best_xgb_model.feature_importances_, index=X_train_model2.columns) # Use X_train_model2 columns
print(feature_importances_tuned.nlargest(20))

In [None]:
# --- AFTER the evaluation of best_xgb_model on X_cv_model2 ---

print("\n--- Evaluation of Best Tuned Model with Custom Threshold (0.7) ---")

# Define the new custom threshold
custom_threshold = 0.7

# Apply the custom threshold to the probabilities from the best tuned model
# If the probability of fraud (y_proba_cv_tuned) is greater than the custom_threshold,
# classify as 1 (fraud), otherwise classify as 0 (not fraud).
y_pred_cv_thresholded = (y_proba_cv_tuned > custom_threshold).astype(int)

print(f"\nConfusion Matrix (Threshold = {custom_threshold}):")
print(confusion_matrix(y_cv, y_pred_cv_thresholded))

print(f"\nClassification Report (Threshold = {custom_threshold}):")
print(classification_report(y_cv, y_pred_cv_thresholded))

# ROC AUC is threshold-independent, so it won't change by adjusting the prediction threshold.
# It's still useful to print it to confirm the model's underlying discriminative power.
roc_auc_thresholded = roc_auc_score(y_cv, y_proba_cv_tuned) # Note: Still uses probabilities, not thresholded predictions
print(f"\nROC AUC Score (Threshold = {custom_threshold}): {roc_auc_thresholded:.4f}")

print("\nAnalysis of Custom Threshold:")
print(f"By increasing the classification threshold to {custom_threshold}, I expect to see a significant decrease in False Positives (leading to higher Precision) and potentially a decrease in True Positives (leading to lower Recall). This trade-off is crucial for aligning the model's output with specific business requirements, such as reducing the number of false alarms for fraud investigation teams.")

In [None]:
# --- AFTER the evaluation of best_xgb_model on X_cv_model2 ---

print("\n--- Evaluation of Best Tuned Model with Custom Threshold (0.15) ---")

# Define the new custom threshold
custom_threshold = 0.15

# Apply the custom threshold to the probabilities from the best tuned model
# If the probability of fraud (y_proba_cv_tuned) is greater than the custom_threshold,
# classify as 1 (fraud), otherwise classify as 0 (not fraud).
y_pred_cv_thresholded = (y_proba_cv_tuned > custom_threshold).astype(int)

print(f"\nConfusion Matrix (Threshold = {custom_threshold}):")
print(confusion_matrix(y_cv, y_pred_cv_thresholded))

print(f"\nClassification Report (Threshold = {custom_threshold}):")
print(classification_report(y_cv, y_pred_cv_thresholded))

# ROC AUC is threshold-independent, so it won't change by adjusting the prediction threshold.
# It's still useful to print it to confirm the model's underlying discriminative power.
roc_auc_thresholded = roc_auc_score(y_cv, y_proba_cv_tuned) # Note: Still uses probabilities, not thresholded predictions
print(f"\nROC AUC Score (Threshold = {custom_threshold}): {roc_auc_thresholded:.4f}")

print("\nAnalysis of Custom Threshold:")
print(f"By decreasing the classification threshold to {custom_threshold}, I expect to see a significant decrease in False Negatives (leading to higher Recall) and potentially a increase in True Positives (leading to lower Precidsion). This trade-off is crucial for aligning the model's output with specific business requirements, such as reducing the number of false alarms for fraud investigation teams.")

In [None]:
# Get the results from RandomizedSearchCV
results = random_search.cv_results_
results_df = pd.DataFrame(results)

# Plotting function to visualize tuning results
def plot_tuning_results(results_df, param_name, title):
    plt.figure(figsize=(10, 6))

    # Get the mean test score for each combination
    scores = results_df['mean_test_score']

    # Get the parameter values
    param_values = results_df[f'param_{param_name}']

    # Check if the parameter is a numerical type before plotting
    if pd.api.types.is_numeric_dtype(param_values):
        # Sort values for a cleaner plot
        sorted_results = results_df.sort_values(by=f'param_{param_name}')

        plt.plot(sorted_results[f'param_{param_name}'], sorted_results['mean_test_score'], marker='o')

    else:
        # If not numeric (e.g., a string), plot a swarmplot or similar
        sns.swarmplot(x=f'param_{param_name}', y='mean_test_score', data=results_df)

    plt.title(title, fontsize=16)
    plt.xlabel(param_name, fontsize=12)
    plt.ylabel('Mean ROC AUC Score', fontsize=12)
    plt.grid(True)
    plt.show()

# Let's visualize the performance against one or two key parameters
# Choose from ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'colsample_bytree', 'gamma']

# Plot ROC AUC vs. learning_rate
plot_tuning_results(results_df, 'learning_rate', 'Tuning Results: ROC AUC vs. Learning Rate')

# Plot ROC AUC vs. max_depth
plot_tuning_results(results_df, 'max_depth', 'Tuning Results: ROC AUC vs. Max Depth')

print("The plots above show how performance varied with different parameter settings during the search.")

In [None]:
# Make sure to use the best_xgb_model obtained from RandomizedSearchCV
# And use X_test_reduced if you applied the low-importance feature dropping.

print("\n--- Final Evaluation on Test Set (X_test) ---")

y_pred_test = best_xgb_model.predict(X_test_model2) # Use X_test_reduced for evaluation
y_proba_test = best_xgb_model.predict_proba(X_test_model2)[:, 1]

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))
roc_auc_test = roc_auc_score(y_test, y_proba_test)
print(f"\nROC AUC Score: {roc_auc_test:.4f}")

# Display Feature Importances for the best model on the test set
print("\nFeature Importances (Top 20 from Best Model):")
feature_importances_final = pd.Series(best_xgb_model.feature_importances_, index=X_train_model2.columns)
print(feature_importances_final.nlargest(20))

Despite the strong performance, there's always room for further exploration and improvement in a real-world fraud detection system:

Threshold Optimization: Implement a detailed threshold tuning process (e.g., plotting Precision-Recall curves) to find the optimal balance between precision and recall based on specific business costs of false positives vs. false negatives. This is crucial for practical deployment.

Advanced Feature Engineering:

Explore more complex velocity features (e.g., number of transactions per card/client/merchant in the last hour/day/week).
Investigate advanced geospatial features if more granular location data is available.

Other Models: Experiment with other advanced ensemble models (e.g., CatBoost) or deep learning approaches for tabular data.

Anomaly Detection: Integrate unsupervised anomaly detection techniques to identify novel fraud patterns that supervised models might miss.

Cost-Sensitive Learning: Directly incorporate the financial costs of misclassifications into the model's loss function.

Real-time Considerations: For a production system, consider aspects like model latency, data streaming, and continuous learning.