# Fraud Detection: EDA and Model Training

This notebook covers:
1. Exploratory Data Analysis (EDA) of the synthetic transaction data.
2. Feature Engineering.
3. Training a Supervised Model (XGBoost).
4. Training an Unsupervised Model (Isolation Forest).
5. Saving models for the Streamlit app.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pickle
import os

# Read the raw transaction log and convert the timestamp column to
# datetime64 in a single chained expression.
df = pd.read_csv('../data/transactions.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

## 1. Exploratory Data Analysis

In [None]:
# Class balance: print the share of each `is_fraud` label, then plot raw counts.
class_share = df['is_fraud'].value_counts(normalize=True)
print(class_share)

# countplot returns the Axes it drew on, so the title is set on that handle.
ax = sns.countplot(data=df, x='is_fraud')
ax.set_title('Class Distribution')
plt.show()

In [None]:
# Time patterns: compare the hour-of-day profile of each `is_fraud` class.
df['hour'] = df['timestamp'].dt.hour
plt.figure(figsize=(10, 6))
# `hour` only takes integer values, so automatic bin edges would not line up
# with the hours and would leave gaps / uneven bars. `discrete=True` gives one
# unit-width bin centred on every hour.
# `common_norm=False` normalises each class separately, so the rarer class is
# not flattened by the more frequent one.
sns.histplot(
    data=df,
    x='hour',
    hue='is_fraud',
    common_norm=False,
    stat='density',
    discrete=True,
)
plt.title('Transaction Hour Distribution by Class')
plt.show()

## 2. Feature Engineering

In [None]:
# Calendar features derived from the transaction timestamp.
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day_of_week'] = timestamp_parts.dayofweek

# Only these numeric columns feed the simple models; identifier columns are
# left out (in a real scenario they would be encoded or turned into graph features).
feature_columns = ['amount', 'hour', 'day_of_week']
X = df[feature_columns]
y = df['is_fraud']

# 80/20 hold-out split, stratified on the label so both parts keep the same
# fraud ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

## 3. Supervised Learning (XGBoost)

In [None]:
# Fit a gradient-boosted tree classifier on the engineered features.
model_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_xgb.fit(X_train, y_train)

# Hard class labels (the classifier's default decision threshold) feed the
# per-class precision/recall report.
y_pred_xgb = model_xgb.predict(X_test)

# Average precision summarises the whole precision-recall curve, so it must be
# computed from a continuous score per row. Passing thresholded 0/1 labels
# collapses the curve to a single operating point and misreports ranking quality.
# Column 1 is taken as the positive-class probability -- assumes `is_fraud`
# is encoded as 0/1 (TODO confirm against the data).
y_score_xgb = model_xgb.predict_proba(X_test)[:, 1]

print("XGBoost Performance:")
print(classification_report(y_test, y_pred_xgb))
print("Average Precision Score:", average_precision_score(y_test, y_score_xgb))

## 4. Unsupervised Learning (Isolation Forest)

In [None]:
# Unsupervised baseline: the forest never sees the labels. It is fitted on the
# mixed (normal + fraud) training features, relying on anomalies being rare;
# `contamination` sets the share of points it will flag as outliers.
iso_forest = IsolationForest(contamination=0.02, random_state=42)
iso_forest.fit(X_train)

# predict() returns -1 for outliers and 1 for inliers.
y_pred_iso = iso_forest.predict(X_test)
# Recode to the label convention used by `is_fraud`: outlier -> 1, inlier -> 0.
y_pred_iso_mapped = np.where(y_pred_iso == -1, 1, 0).tolist()

print("Isolation Forest Performance:")
print(classification_report(y_test, y_pred_iso_mapped))

## 5. Save Models

In [None]:
# Persist both fitted estimators as pickle files under ../models/,
# creating the directory first if it does not exist yet.
os.makedirs('../models', exist_ok=True)

artifacts = (
    ('../models/model_xgb.pkl', model_xgb),
    ('../models/model_iso.pkl', iso_forest),
)
for artifact_path, estimator in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(estimator, handle)

print("Models saved to ../models/")