In [19]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import ADASYN

# Load the pre-split, pre-processed transaction data (train / validation / test).
train_df = pd.read_csv('../data/train_processed_transactions.csv')
val_df = pd.read_csv('../data/val_processed_transactions.csv')
test_df = pd.read_csv('../data/test_processed_transactions.csv')

# Columns fed to the model as inputs.
expected_features = ['Amount', 'hour', 'dayofweek', 'txns_last_24h', 'amount_last_24h', 'risk_score']
missing_features = [f for f in expected_features if f not in train_df.columns]

if missing_features:
    raise ValueError(f"Missing expected features in dataset: {missing_features}")

# The selection below indexes all three frames and the 'Class' label, so check
# every split (not only train) and include the target column. A schema problem
# then surfaces as a single descriptive error instead of a bare KeyError.
required_columns = expected_features + ['Class']
missing_by_split = {
    split_name: [col for col in required_columns if col not in split_df.columns]
    for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df))
}
missing_by_split = {name: cols for name, cols in missing_by_split.items() if cols}

if missing_by_split:
    raise ValueError(f"Missing expected columns by split: {missing_by_split}")

# Feature selection: split each frame into model inputs (X) and label (y).
X_train, y_train = train_df[expected_features], train_df['Class']
X_val, y_val = val_df[expected_features], val_df['Class']
X_test, y_test = test_df[expected_features], test_df['Class']

# Oversample the training split with ADASYN (fixed seed for reproducibility).
# Only the training data is resampled; X_val / X_test are left untouched.
adasyn = ADASYN(
    n_neighbors=5,
    random_state=42,
)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# LightGBM configuration.
# Each setting is given once, under the scikit-learn wrapper's own keyword:
#   - `min_split_gain` replaces the native alias `min_gain_to_split`
#   - `min_child_samples` already covers its alias `min_data_in_leaf`, which
#     was previously passed as well (same value, so the model is unchanged).
best_model = LGBMClassifier(
    n_estimators=1000,
    max_depth=12,
    learning_rate=0.05,
    num_leaves=50,
    min_split_gain=0.0,
    min_child_samples=1,
    reg_alpha=0,
    reg_lambda=0,
    colsample_bytree=1.0,
    subsample=1.0,
    force_col_wise=True,
    verbose=-1,
    random_state=42
)

# Fit on the ADASYN-balanced training data, then predict on the untouched
# validation split.
best_model.fit(X_train_balanced, y_train_balanced)
y_val_pred = best_model.predict(X_val)

# Score the validation predictions: precision, recall, F1, accuracy (in that order).
final_prec, final_rec, final_f1, final_acc = (
    metric(y_val, y_val_pred)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print("\nFinal Validation Performance:")
print(f"Precision: {final_prec:.3f}, Recall: {final_rec:.3f}, F1: {final_f1:.3f}, Accuracy: {final_acc:.3f}")

# Persist the fitted classifier.
joblib.dump(best_model, '../src/fraud_detection_model.pkl')

print("Model training complete. Best model saved.")



Final Validation Performance:
Precision: 1.000, Recall: 0.935, F1: 0.966, Accuracy: 1.000
Model training complete. Best model saved.
