## 🤖 Model Training - Fraud Detection

In [5]:
import os
import json
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score
)
from imblearn.over_sampling import SMOTE
import shutil

# === Global configs ===
# Apply seaborn's "whitegrid" style to the plots drawn later in this script.
sns.set(style="whitegrid")
# Do not cap the number of columns pandas shows when printing a DataFrame.
pd.set_option('display.max_columns', None)

# === Utility: Get version tag ===
def get_version_tag():
    """Build a version label from the current local time.

    Returns:
        A string shaped like ``vYYYYMMDD_HHMM`` (minute resolution).
    """
    now = datetime.now()
    return f"v{now:%Y%m%d_%H%M}"

# === Utility: Convert NumPy types for JSON logging ===
def convert_np(obj):
    """Turn NumPy values into plain Python objects that ``json`` can write.

    Meant to be passed as the ``default=`` hook of ``json.dump``/``json.dumps``,
    which calls it for every value the encoder cannot serialise on its own.

    Args:
        obj: The value the JSON encoder could not handle natively.

    Returns:
        ``bool`` for NumPy booleans, ``int`` for NumPy integers, ``float`` for
        NumPy floats, a (nested) ``list`` for arrays, and ``str(obj)`` for
        anything else.
    """
    # np.bool_ is neither np.integer nor np.floating; without this branch it
    # would fall through to str() and be stored as the text "True"/"False".
    if isinstance(obj, np.bool_):
        return bool(obj)
    # np.integer / np.floating are the abstract bases of every sized NumPy
    # int / float type, so the concrete widths need not be listed.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Last resort: keep the dump from failing by storing the text form.
    return str(obj)

# === Step 1: Load Data ===
def load_data(filepath=r'D:\Data science\Projects\Fraud_detection\fraud_detection_app\data\Fraud.csv', sample_size=100000):
    """Read the first ``sample_size`` rows of a CSV file into a DataFrame.

    Args:
        filepath: Path (or file-like object) of the CSV to read. The default
            is an absolute path on the original author's machine, so most
            callers will want to pass their own.
        sample_size: Maximum number of data rows to read from the top of the
            file. ``None`` reads the whole file.

    Returns:
        A DataFrame holding at most ``sample_size`` rows.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        RuntimeError: If the file exists but cannot be read or parsed.
            Both are ``Exception`` subclasses and carry the original error
            as ``__cause__``.
    """
    print("📥 Loading data...")
    try:
        # nrows reads only the leading rows and closes the file afterwards,
        # unlike a chunksize iterator that is abandoned after one chunk.
        df = pd.read_csv(filepath, nrows=sample_size)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"❌ File not found: {filepath}") from err
    except Exception as err:
        raise RuntimeError(f"❌ Failed to load data: {err}") from err

    print(f"✅ Loaded {len(df)} rows.")
    return df

# === Step 2: Preprocess Data ===
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare the raw transaction frame for model training.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * cast ``amount``, ``newbalanceOrig`` and ``newbalanceDest`` to float32;
      * label-encode ``type`` into a new ``transaction_type`` column;
      * drop ``nameOrig``, ``nameDest``, ``step``, ``oldbalanceOrg``,
        ``oldbalanceDest``, ``type`` and ``isFlaggedFraud``;
      * scale the three float columns with ``RobustScaler``.

    Args:
        df: Raw data containing every column named above. Any other column
            (e.g. the ``isFraud`` target) is passed through unchanged.

    Returns:
        A new DataFrame with the remaining columns in their original order
        and ``transaction_type`` appended last.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    print("🧹 Preprocessing data...")

    # astype returns a new frame, so the caller's DataFrame is never mutated.
    data = df.astype({
        'amount': 'float32',
        'newbalanceOrig': 'float32',
        'newbalanceDest': 'float32',
        'type': 'category',
    })

    # NOTE(review): the encoder and the scaler below are fitted on the whole
    # frame passed in and are neither returned nor saved, so this function
    # alone cannot reproduce the same mapping on new data - confirm whether
    # the serving code needs them persisted.
    data['transaction_type'] = LabelEncoder().fit_transform(data['type'])

    # isFlaggedFraud is dropped outright, so it is not cast beforehand.
    data = data.drop(columns=[
        'nameOrig', 'nameDest', 'step', 'oldbalanceOrg', 'oldbalanceDest',
        'type', 'isFlaggedFraud',
    ])

    scaler = RobustScaler()
    scaled_cols = ['amount', 'newbalanceOrig', 'newbalanceDest']
    data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

    print("✅ Preprocessing complete.")
    return data

# === Step 3: Generate EDA ===
def generate_eda(df: pd.DataFrame):
    """Write two exploratory plots for ``df`` into the ``eda_outputs`` folder.

    Creates the folder if needed, then saves:
      * ``fraud_distribution.png`` - a count plot of the ``isFraud`` column;
      * ``correlation_matrix.png`` - an annotated heatmap of the pairwise
        correlations between the numeric columns.

    Args:
        df: Frame to plot; must contain an ``isFraud`` column.
    """
    print("📊 Generating EDA plots...")
    os.makedirs("eda_outputs", exist_ok=True)

    # --- Plot 1: how many rows fall into each isFraud class ---
    sns.countplot(data=df, x='isFraud')
    plt.title("Fraud vs Non-Fraud Distribution")
    plt.savefig("eda_outputs/fraud_distribution.png")
    plt.close()

    # --- Plot 2: correlation heatmap over the numeric columns only ---
    numeric_part = df.select_dtypes(include=[np.number])
    correlations = numeric_part.corr()
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlations, cmap='coolwarm', annot=True, fmt=".2f")
    plt.title("Feature Correlation")
    plt.tight_layout()
    plt.savefig("eda_outputs/correlation_matrix.png")
    plt.close()

    print("✅ EDA plots saved to 'eda_outputs/'")

# === Step 4: Train Model and Save ===
def train_and_save_model(df: pd.DataFrame) -> str:
    """Fit a random-forest classifier on ``df``, report its scores and save it.

    ``isFraud`` is the target; every other column is used as a feature. The
    data is split 80/20 (stratified), the training part is oversampled with
    SMOTE, and the fitted model is scored on the untouched test part. The
    model and a JSON metadata file are written to ``../model_pkl`` under a
    timestamp-based version tag.

    Args:
        df: Preprocessed frame containing an ``isFraud`` column.

    Returns:
        Path of the saved model file.

    Raises:
        ValueError: If the least frequent ``isFraud`` class has fewer than
            10 rows.
    """
    print("🤖 Training model...")

    target = df['isFraud']
    features = df.drop(columns=['isFraud'])

    # Guard: the rarest class must have at least 10 rows.
    rarest_class_rows = target.value_counts().min()
    if rarest_class_rows < 10:
        raise ValueError("❌ Not enough positive samples. Training aborted.")

    split_parts = train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=42
    )
    X_train, X_test, y_train, y_test = split_parts

    # Oversampling is applied to the training part only.
    X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)

    forest_settings = {
        "n_estimators": 20,
        "class_weight": 'balanced',
        "random_state": 42,
        "n_jobs": -1,
    }
    model = RandomForestClassifier(**forest_settings)
    model.fit(X_balanced, y_balanced)

    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print("\n📈 Model Evaluation:")
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

    # === Persist the model and its metadata side by side ===
    version = get_version_tag()
    model_dir = "../model_pkl"
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir, f"fraud_model_{version}.pkl")
    joblib.dump(model, model_path)
    print(f"✅ Model saved to {model_path}")

    metadata = {
        "model_version": version,
        "timestamp": datetime.now().isoformat(),
        "sample_size": len(df),
        "class_distribution": dict(target.value_counts()),
        "accuracy": round(acc, 4),
        "f1_score": round(f1, 4),
    }
    meta_path = os.path.join(model_dir, f"model_metadata_{version}.json")
    with open(meta_path, "w") as f:
        # convert_np handles the NumPy scalars left in the metadata values.
        json.dump(metadata, f, indent=2, default=convert_np)
    print(f"📝 Metadata saved to {meta_path}")

    return model_path

# === Main Execution ===
if __name__ == "__main__":
    # Script entry point: load -> EDA -> preprocess -> train -> publish model.
    print("🚀 Starting training pipeline...")

    df = load_data()
    # EDA is generated from the raw frame, before preprocessing.
    generate_eda(df)
    df_processed = preprocess_data(df)
    latest_model_path = train_and_save_model(df_processed)

    # === Copy latest model to fixed path for API ===
    print("📦 Updating API model path...")
    api_model_path = "../app/model/fraud_model.pkl"
    # copyfile does not create missing parent folders, so make sure the
    # destination directory exists before copying.
    os.makedirs(os.path.dirname(api_model_path), exist_ok=True)
    shutil.copyfile(latest_model_path, api_model_path)
    print(f"✅ Copied latest model to: {api_model_path}")


🚀 Starting training pipeline...
📥 Loading data...
✅ Loaded 100000 rows.
📊 Generating EDA plots...
✅ EDA plots saved to 'eda_outputs/'
🧹 Preprocessing data...
✅ Preprocessing complete.
🤖 Training model...





📈 Model Evaluation:
[[19248   729]
 [    6    17]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     19977
           1       0.02      0.74      0.04        23

    accuracy                           0.96     20000
   macro avg       0.51      0.85      0.51     20000
weighted avg       1.00      0.96      0.98     20000

✅ Model saved to ../model_pkl\fraud_model_v20250811_1128.pkl
📝 Metadata saved to ../model_pkl\model_metadata_v20250811_1128.json
📦 Updating API model path...
✅ Copied latest model to: ../app/model/fraud_model.pkl
