# In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    roc_auc_score, average_precision_score,
    balanced_accuracy_score,
    accuracy_score, mean_squared_error, mean_absolute_error
)
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

# xgboost is an optional dependency: HAS_XGB records whether XGBClassifier
# could be imported so the rest of the file can choose a fallback model.
try:
    from xgboost import XGBClassifier
except Exception:
    # Deliberately broader than ImportError: a broken native install can raise
    # other errors at import time. Unlike a bare `except:`, this no longer
    # swallows KeyboardInterrupt / SystemExit.
    HAS_XGB = False
else:
    HAS_XGB = True

# CSV file read by train_model(); resolved relative to the working directory.
DATA_PATH = "creditcard.csv"
# joblib bundle written by train_model() and read back by predict_new().
MODEL_PATH = "credit_fraud_model.joblib"
# Seed passed to both train/test splits and to the classifier.
RANDOM_STATE = 42

def _load_clean_data():
    """Read DATA_PATH and return ``(X, y)`` with numeric features and a 0/1 target."""
    if not Path(DATA_PATH).exists():
        raise FileNotFoundError("Place creditcard.csv in this folder.")

    df = pd.read_csv(DATA_PATH)

    # Build the label series and row mask separately instead of assigning into
    # a filtered slice of `df` (which triggers pandas' chained-assignment
    # warning and is not guaranteed to write through).
    labels = pd.to_numeric(df["Class"], errors="coerce")
    keep = labels.isin([0, 1])
    y = labels[keep].astype(int)

    if y.nunique() < 2:
        # Fail early with a clear message: stratified splitting and the
        # positive-class probability column both need two classes.
        raise ValueError(
            "Column 'Class' must contain both 0 and 1 after cleaning."
        )

    X = df.loc[keep].drop(columns=["Class"])
    # Non-numeric cells and +/-inf become NaN so the imputers can fill them.
    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan)
    return X, y


def _build_pipeline(feature_names, y_train):
    """Return the unfitted preprocessing + classifier pipeline."""
    # Only "Time" and "Amount" are standardised; every other column is just
    # median-imputed.
    scale_cols = [c for c in ("Time", "Amount") if c in feature_names]
    other_cols = [c for c in feature_names if c not in scale_cols]

    preprocess = ColumnTransformer(
        transformers=[
            ("scale", Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("sc", StandardScaler()),
            ]), scale_cols),
            ("other", Pipeline([
                ("imp", SimpleImputer(strategy="median")),
            ]), other_cols),
        ],
        remainder="drop",
    )

    if HAS_XGB:
        # Negative/positive ratio of the training labels, used to re-weight
        # the minority class. Only XGBoost consumes it.
        pos_rate = y_train.mean()
        scale_pos_weight = (1 - pos_rate) / pos_rate
        model = XGBClassifier(
            n_estimators=800, max_depth=4, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9,
            scale_pos_weight=scale_pos_weight,
            objective="binary:logistic", eval_metric="logloss",
            tree_method="hist", random_state=RANDOM_STATE,
        )
    else:
        model = RandomForestClassifier(
            n_estimators=800, class_weight="balanced_subsample",
            random_state=RANDOM_STATE, n_jobs=-1,
        )

    return Pipeline([("prep", preprocess), ("clf", model)])


def _best_f1_threshold(y_true, proba):
    """Return ``(threshold, f1)`` maximising F1 over 50 cut-offs in [0.05, 0.50]."""
    best_t, best_f1 = 0.5, -1.0
    for t in np.linspace(0.05, 0.50, 50):
        preds = (proba >= t).astype(int)
        # zero_division=0 yields the same score as the default but without a
        # warning for every cut-off that predicts no positives.
        f1 = f1_score(y_true, preds, zero_division=0)
        if f1 > best_f1:  # strict '>' keeps the lowest threshold on ties
            best_t, best_f1 = t, f1
    return float(best_t), best_f1


def train_model():
    """Train the fraud classifier, report test metrics and save the model.

    Steps:
      1. Load DATA_PATH, keep rows whose ``Class`` is 0 or 1, and coerce all
         features to numeric (invalid values and +/-inf become NaN).
      2. Split, stratified on the label, into 60% train / 20% validation /
         20% test.
      3. Fit the pipeline (XGBoost when available, otherwise RandomForest)
         on the training split.
      4. Choose the decision threshold that maximises F1 on the validation
         split.
      5. Print metrics for the test split and dump a bundle with the keys
         ``pipeline``, ``threshold`` and ``feature_order`` to MODEL_PATH.

    Returns:
        None. Results are printed and the bundle is written to disk.

    Raises:
        FileNotFoundError: if DATA_PATH does not exist.
        ValueError: if the cleaned target does not contain both classes.
    """
    X, y = _load_clean_data()

    # 20% held out for testing; 25% of the remaining 80% (= 20% overall) is
    # used for threshold selection.
    X_tv, X_test, y_tv, y_test = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_tv, y_tv, test_size=0.25, stratify=y_tv, random_state=RANDOM_STATE)

    pipe = _build_pipeline(list(X.columns), y_train)
    pipe.fit(X_train, y_train)

    valid_proba = pipe.predict_proba(X_valid)[:, 1]
    best_t, best_f1 = _best_f1_threshold(y_valid, valid_proba)
    print(f"Best threshold = {best_t:.3f}, F1 = {best_f1:.4f}")

    test_proba = pipe.predict_proba(X_test)[:, 1]
    test_preds = (test_proba >= best_t).astype(int)

    print("\n====== TEST METRICS ======")
    print("F1:", f1_score(y_test, test_preds))
    print("Precision:", precision_score(y_test, test_preds))
    print("Recall:", recall_score(y_test, test_preds))
    print("Balanced Accuracy:", balanced_accuracy_score(y_test, test_preds))
    print("ROC-AUC:", roc_auc_score(y_test, test_proba))
    print("PR-AUC:", average_precision_score(y_test, test_proba))
    # RMSE / MAE compare predicted probabilities (not hard labels) with the
    # 0/1 targets.
    print("RMSE:", np.sqrt(mean_squared_error(y_test, test_proba)))
    print("MAE:", mean_absolute_error(y_test, test_proba))

    dump({
        "pipeline": pipe,
        "threshold": best_t,
        "feature_order": list(X.columns),
    }, MODEL_PATH)

    print("\nModel saved →", MODEL_PATH)

def predict_new(data_dict):
    """Score a single record with the model bundle saved at MODEL_PATH.

    Args:
        data_dict: mapping of feature name -> value for one record. Keys not
            in the saved ``feature_order`` are ignored; missing features are
            passed to the pipeline as NaN.

    Returns:
        dict with ``fraud_probability`` (float, probability of class 1) and
        ``predicted_class`` (1 if the probability is at or above the saved
        threshold, otherwise 0).
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only point
    # MODEL_PATH at files produced by a trusted train_model() run.
    bundle = load(MODEL_PATH)
    pipe = bundle["pipeline"]
    threshold = bundle["threshold"]
    features = bundle["feature_order"]

    # Align columns with the training layout.
    row = pd.DataFrame([data_dict]).reindex(columns=features)
    # Mirror the training-time cleaning so the pipeline sees the same kind of
    # input it was fitted on: non-numeric values and +/-inf become NaN.
    row = row.apply(pd.to_numeric, errors="coerce")
    row = row.replace([np.inf, -np.inf], np.nan)

    proba = pipe.predict_proba(row)[:, 1][0]

    return {
        "fraud_probability": float(proba),
        "predicted_class": int(proba >= threshold),
    }

# Example:
# train_model()
# print(predict_new({...}))
