In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score
import joblib
from imblearn.over_sampling import SMOTE  # Import SMOTE

# Read the pre-processed train / validation / test splits from disk.
train_df = pd.read_csv('../data/train_processed_transactions.csv')
val_df = pd.read_csv('../data/val_processed_transactions.csv')
test_df = pd.read_csv('../data/test_processed_transactions.csv')

# Columns used as model inputs; the 'Class' column is the prediction target.
feature_names = ['Amount', 'hour', 'risk_score']

# Separate each split into its feature matrix (X_*) and label vector (y_*).
X_train = train_df[feature_names]
y_train = train_df['Class']

X_val = val_df[feature_names]
y_val = val_df['Class']

X_test = test_df[feature_names]
y_test = test_df['Class']

### Rebalance the training split with SMOTE ###
# A float sampling_strategy is the target minority/majority ratio after
# resampling, so 0.1 oversamples the minority class up to 10% of the majority
# count. Only the training split is resampled; validation and test data are
# left untouched.
smote = SMOTE(
    sampling_strategy=0.1,
    random_state=42,  # fixed seed so the synthetic samples are reproducible
)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

# Wrap each split in XGBoost's native DMatrix container.
# The test matrix carries no labels: it is only used for prediction.
dtrain = xgb.DMatrix(data=X_train_smote, label=y_train_smote)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test)

# Booster hyper-parameters: binary classifier emitting probabilities,
# monitored with log-loss on the evaluation set.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'learning_rate': 0.3,
}

# Fit up to 200 boosting rounds, evaluating on the validation split after
# every round; training halts early once the validation log-loss has not
# improved for 10 consecutive rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
)

# Predict positive-class probabilities on the test split.
# xgb.train() with early stopping hands back the booster from the *last*
# boosting round, which may include rounds trained after the best validation
# score. Restrict prediction to the best iteration explicitly so the result
# does not depend on the installed XGBoost version's default behaviour.
best_iteration = getattr(xgb_model, 'best_iteration', None)
if best_iteration is None:
    # No early-stopping information recorded on the booster: use all trees.
    xgb_probs = xgb_model.predict(dtest)
else:
    xgb_probs = xgb_model.predict(
        dtest, iteration_range=(0, best_iteration + 1)
    )

# Threshold probabilities at 0.5 to obtain hard 0/1 class predictions.
xgb_preds = (xgb_probs > 0.5).astype(int)

# Evaluate on the held-out test labels.
# NOTE(review): if the positive class is rare in the test split, accuracy
# alone is a weak indicator -- rely on the per-class precision/recall in the
# classification report below.
xgb_accuracy = accuracy_score(y_test, xgb_preds)
print("XGBoost Accuracy:", xgb_accuracy)
print("Classification Report (XGBoost):\n", classification_report(y_test, xgb_preds))

# Persist the trained booster to disk with joblib.
model_path = '../src/xgb_model.pkl'
joblib.dump(xgb_model, model_path)

print("Model training complete. Models saved.")


[0]	validation-logloss:0.15238
[1]	validation-logloss:0.13201
[2]	validation-logloss:0.11722
[3]	validation-logloss:0.10665
[4]	validation-logloss:0.09877
[5]	validation-logloss:0.09288
[6]	validation-logloss:0.08872
[7]	validation-logloss:0.08498
[8]	validation-logloss:0.08213
[9]	validation-logloss:0.07991
[10]	validation-logloss:0.07841
[11]	validation-logloss:0.07692
[12]	validation-logloss:0.07604
[13]	validation-logloss:0.07523
[14]	validation-logloss:0.07451
[15]	validation-logloss:0.07359
[16]	validation-logloss:0.07291
[17]	validation-logloss:0.07250
[18]	validation-logloss:0.07204
[19]	validation-logloss:0.07165
[20]	validation-logloss:0.07149
[21]	validation-logloss:0.07109
[22]	validation-logloss:0.07067
[23]	validation-logloss:0.07022
[24]	validation-logloss:0.06999
[25]	validation-logloss:0.06963
[26]	validation-logloss:0.06922
[27]	validation-logloss:0.06890
[28]	validation-logloss:0.06849
[29]	validation-logloss:0.06796
[30]	validation-logloss:0.06775
[31]	validation-lo