# 02 - Modeling: Fraud Classification


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
from src.data import DataPaths, ensure_dataset, load_dataframe_from_csv
from src.models import build_full_pipeline, TARGET_COLUMN
from sklearn.model_selection import train_test_split

# Resolve the dataset CSV and load it into a DataFrame.
# NOTE(review): generate_if_missing/n_rows/seed presumably control creation of
# a synthetic dataset when no CSV exists yet -- confirm against src.data.
paths = DataPaths()
csv_path = ensure_dataset(
    paths,
    generate_if_missing=True,
    n_rows=20000,
    seed=42,
)
df = load_dataframe_from_csv(csv_path)

# Separate the integer-cast target from the remaining columns.
y = df[TARGET_COLUMN].astype(int).values
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for evaluation; stratify on the target so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
def _fit_and_score(model_key):
    """Build the pipeline for *model_key*, fit it, and score the test split.

    Returns a ``(pipeline, predictions, probabilities)`` tuple, where
    *probabilities* is column 1 of ``predict_proba`` (the second class;
    presumably the fraud label -- confirm against the pipeline's ``classes_``).
    """
    pipeline = build_full_pipeline(model_key)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    probabilities = pipeline.predict_proba(X_test)[:, 1]
    return pipeline, predictions, probabilities


# Train Logistic Regression
pl_log, y_pred_log, y_prob_log = _fit_and_score('logreg')

# Train Decision Tree
pl_tree, y_pred_tree, y_prob_tree = _fit_and_score('tree')


In [None]:
from src.metrics import compute_classification_metrics

# Score both models on the held-out split, keyed by model name.
metrics_by_model = {
    'logreg': compute_classification_metrics(y_test, y_pred_log, y_prob_log),
    'tree': compute_classification_metrics(y_test, y_pred_tree, y_prob_tree),
}

# Keep the per-model names that the export cell reads.
m_log = metrics_by_model['logreg']
m_tree = metrics_by_model['tree']

# Bare trailing expression so the notebook displays the metrics.
metrics_by_model


In [None]:
# ROC Curves
# Draw both models on one shared axes so their curves can be compared directly.
fig, ax = plt.subplots(figsize=(6, 5))
for label, scores in (('LogReg', y_prob_log), ('Tree', y_prob_tree)):
    RocCurveDisplay.from_predictions(y_test, scores, name=label, ax=ax)
ax.set_title('ROC Curves')
fig.tight_layout()
plt.show()


In [None]:
# Export a tidy results CSV like the CLI
from pathlib import Path

# Create the output directory up front: neither DataFrame.to_csv nor open()
# creates missing parent directories, so without this the cell fails on a
# fresh checkout where reports/ does not exist yet.
reports_dir = Path('reports')
reports_dir.mkdir(parents=True, exist_ok=True)

# One row per held-out sample: the test features, the true label, and each
# model's predicted probability.
results = X_test.copy()
results['y_true'] = y_test
results['y_prob_log'] = y_prob_log
results['y_prob_tree'] = y_prob_tree
results.to_csv(reports_dir / 'fraud_results.csv', index=False)

# Persist the metrics of both models next to the per-row results.
with open(reports_dir / 'metrics.json', 'w', encoding='utf-8') as f:
    json.dump({'logreg': m_log, 'tree': m_tree}, f, indent=2)

# Bare trailing expression so the notebook displays a confirmation.
'Exported to reports/'
