In [61]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report


In [62]:
file_path = "fraudTrain.csv"  
df = pd.read_csv(file_path)

In [63]:
# drop cols
drop_cols = ["Unnamed: 0", "trans_num", "first", "last", "street", "cc_num", "unix_time"]
df.drop(columns=drop_cols, inplace=True, errors="ignore")

# Convert datetime features
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["dob"] = pd.to_datetime(df["dob"])
df["hour"] = df["trans_date_trans_time"].dt.hour
df["day"] = df["trans_date_trans_time"].dt.day
df["weekday"] = df["trans_date_trans_time"].dt.weekday
df["month"] = df["trans_date_trans_time"].dt.month
df["age"] = df["trans_date_trans_time"].dt.year - df["dob"].dt.year

df.drop(columns=["dob", "trans_date_trans_time"], inplace=True)

In [64]:
# distance between transaction location and merchant location
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

df["distance"] = haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"])
df.drop(columns=["lat", "long", "merch_lat", "merch_long"], inplace=True)

In [65]:
# Categorical Encoding Strategy
categorical_cols = ["merchant", "category", "gender", "state", "job", "city"]

# Define encoding methods per feature
one_hot_cols = ["category", "gender", "state"]
target_cols = ["merchant", "job"]
freq_cols = ["city"]

# One-Hot Encoding
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_array = ohe.fit_transform(df[one_hot_cols])
encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(one_hot_cols))

df = df.drop(columns=one_hot_cols).reset_index(drop=True)
df = pd.concat([df, encoded_df], axis=1)

# Target Encoding (Fraud Rate Per Category)
encoders = {}
for col in target_cols:
    encoders[col] = df.groupby(col)["is_fraud"].mean()
    df[col] = df[col].map(encoders[col]).fillna(df["is_fraud"].mean())  # Unseen categories get global fraud rate

# Frequency Encoding
for col in freq_cols:
    encoders[col] = df[col].value_counts(normalize=True)
    df[col] = df[col].map(encoders[col]).fillna(0)  # Unseen categories get 0

In [66]:
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]

# split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# normalize
scaler = StandardScaler()
num_cols = ["amt", "city_pop", "distance", "age"]
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])

In [67]:
# train
xgb_model = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    tree_method="hist",  # Default method
    device="cuda"  # Enable GPU
)


In [68]:
# hyperparameters
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

grid_search = GridSearchCV(xgb_model, param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


Parameters: { "use_label_encoder" } are not used.



In [69]:
# evaluate on validation set
y_pred = best_model.predict(X_val)
print("Classification Report:\n", classification_report(y_val, y_pred))


print("Best Hyperparameters:", best_params)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.98      0.88      0.92      1501

    accuracy                           1.00    259335
   macro avg       0.99      0.94      0.96    259335
weighted avg       1.00      1.00      1.00    259335

Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


In [70]:
# Load test dataset in chunks
file_path_test = "fraudTest.csv"
chunk_size = 10000  # Process 10,000 rows at a time

In [None]:
def preprocess_data(chunk, encoders, scaler, categorical_cols, num_cols):
    """Preprocesses test data, applying encoding & normalization."""
    chunk.drop(columns=drop_cols, errors="ignore", inplace=True)

    # Convert datetime features
    chunk["trans_date_trans_time"] = pd.to_datetime(chunk["trans_date_trans_time"])
    chunk["dob"] = pd.to_datetime(chunk["dob"])
    chunk["hour"] = chunk["trans_date_trans_time"].dt.hour
    chunk["day"] = chunk["trans_date_trans_time"].dt.day
    chunk["weekday"] = chunk["trans_date_trans_time"].dt.weekday
    chunk["month"] = chunk["trans_date_trans_time"].dt.month
    chunk["age"] = chunk["trans_date_trans_time"].dt.year - chunk["dob"].dt.year
    chunk.drop(columns=["dob", "trans_date_trans_time"], inplace=True, errors="ignore")

    # distance
    chunk["distance"] = haversine(chunk["lat"], chunk["long"], chunk["merch_lat"], chunk["merch_long"])
    chunk.drop(columns=["lat", "long", "merch_lat", "merch_long"], inplace=True, errors="ignore")

    # one hot encoding
    encoded_array = ohe.transform(chunk[one_hot_cols])
    encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(one_hot_cols))

    chunk = chunk.drop(columns=one_hot_cols).reset_index(drop=True)
    chunk = pd.concat([chunk, encoded_df], axis=1)

    # target and freq encoding
    for col in target_cols + freq_cols:
        if col in encoders:
            chunk[col] = chunk[col].map(encoders[col]).fillna(0 if col in freq_cols else df["is_fraud"].mean())

    # normalize
    chunk[num_cols] = scaler.transform(chunk[num_cols])

    return chunk

In [None]:
results = []

for chunk in pd.read_csv(file_path_test, chunksize=chunk_size):
    chunk = preprocess_data(chunk, encoders, scaler, categorical_cols, num_cols)
    X_test_chunk = chunk.drop(columns=["is_fraud"])
    y_test_chunk = chunk["is_fraud"]
    y_pred_chunk = best_model.predict(X_test_chunk)

    results.append(classification_report(y_test_chunk, y_pred_chunk, output_dict=True, zero_division=0))

# aggregate reports
final_summary = {}

for label in ["0", "1", "accuracy", "macro avg", "weighted avg"]:
    if label in results[0]:  # Ensure label exists in at least one chunk
        if isinstance(results[0].get(label, {}), dict):  
            final_summary[label] = {
                key: np.mean([r.get(label, {}).get(key, 0) for r in results])  
                for key in ["precision", "recall", "f1-score", "support"]
            }
        else:
            final_summary[label] = np.mean([r.get(label, 0) for r in results])  


final_report_df = pd.DataFrame(final_summary).T

# Print Final Test Performance
print("Final Test Set Performance:")
print(final_report_df.to_string())  # Prevent truncation

Final Test Set Performance:
              precision    recall  f1-score      support
0              0.998370  0.999876  0.999122  9885.250000
1              0.865713  0.539680  0.654513    38.303571
accuracy       0.998252  0.998252  0.998252     0.998252
macro avg      0.949899  0.787635  0.844675  9923.553571
weighted avg   0.998186  0.998252  0.998003  9923.553571
