# Fraud Detection and Customer Clustering

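This notebook uses `creditcard.csv` to train two fraud classifiers (logistic regression and random forest), evaluate the random forest at a custom decision threshold, log the run to MLflow, explain the model with SHAP, cluster the scaled features with K-Means, and save the trained model to disk.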

In [None]:
import pandas as pd

# The raw CSV is not committed to GitHub, so it must already be present locally.
csv_path = '../data/creditcard.csv'
df = pd.read_csv(csv_path)

# Print the column/dtype summary, then show each Class value's share of rows
# as the cell's output.
df.info()
class_share = df['Class'].value_counts(normalize=True)
class_share

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the label; the label is the 'Class' column.
X = df.drop(columns=['Class'])
y = df['Class']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std applied to the training features (leakage).
# The stratified, seeded split keeps the class ratio equal in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix (using the train-fitted scaler),
# kept because the clustering cell further down works on `X_scaled`.
X_scaled = scaler.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit both classifiers on the training partition.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Seed the forest so the reports, threshold analysis, logged metrics and the
# saved model are reproducible on Restart & Run All (same seed as the split).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out partition for each model.
print("Logistic Regression:")
print(classification_report(y_test, lr.predict(X_test)))

print("Random Forest:")
print(classification_report(y_test, rf.predict(X_test)))

In [None]:
from sklearn.metrics import confusion_matrix

# Decision cut-off applied to the random forest's predicted probability
threshold = 0.3

# Keep column index 1 of predict_proba for every test row
# (presumably the fraud class, Class == 1 -- confirm against rf.classes_).
test_proba = rf.predict_proba(X_test)
y_probs = test_proba[:, 1]

# Rows whose probability is strictly above the cut-off are labelled 1, others 0
above_cutoff = y_probs > threshold
y_pred_thresh = above_cutoff.astype(int)

# Confusion matrix of the thresholded predictions against the true labels
cm = confusion_matrix(y_test, y_pred_thresh)
print("Confusion Matrix at threshold =", threshold)
print(cm)

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision and recall of the random-forest probabilities at each candidate threshold
prec, rec, thresh = precision_recall_curve(y_test, y_probs)

# Draw both curves on one axes; the [:-1] slice drops the final element of each
# score array so it lines up with `thresh` on the x-axis.
fig, ax = plt.subplots()
for scores, curve_label in ((prec, 'Precision'), (rec, 'Recall')):
    ax.plot(thresh, scores[:-1], label=curve_label)

ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision-Recall vs Threshold')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import precision_score, recall_score

# All runs from this notebook are grouped under a single MLflow experiment.
mlflow.set_experiment("Fraud Detection")

# What gets recorded for the run: identifying parameters, plus the scorers
# evaluated on the thresholded test predictions.
run_params = {"model": "RandomForest", "threshold": threshold}
run_scorers = {"precision": precision_score, "recall": recall_score}

with mlflow.start_run():
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    for metric_name, scorer in run_scorers.items():
        mlflow.log_metric(metric_name, scorer(y_test, y_pred_thresh))

    # Store the fitted random forest as a run artifact named "model".
    mlflow.sklearn.log_model(rf, "model")
    print("Model logged in MLflow.")

In [None]:
import shap

# Use a small slice of the training data as the background set instead of the
# whole test set: it keeps the explainer fast and avoids using the rows being
# explained as their own reference distribution.
background = X_train[:200]

# The scaled arrays carry no column names, so pass them explicitly; otherwise
# the plot labels the features generically.
explainer = shap.Explainer(rf, background, feature_names=list(X.columns))
shap_values = explainer(X_test[:100])

# For a classifier the explanation can be 3-D (samples x features x classes),
# which beeswarm rejects. Keep only class index 1 in that case
# (presumably the fraud class -- confirm against rf.classes_).
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

shap.plots.beeswarm(shap_values)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Partition the scaled feature matrix into three clusters (seeded for reproducibility).
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
labels = kmeans.labels_

# The silhouette score needs pairwise distances, so its cost grows quadratically
# with the number of rows. Estimate it on a seeded random subsample instead of
# every row; raise the cap (or drop `sample_size`) for an exact value.
silhouette_sample_size = min(10_000, len(labels))
print(
    "Silhouette Score:",
    silhouette_score(
        X_scaled, labels, sample_size=silhouette_sample_size, random_state=42
    ),
)

In [None]:
import joblib
from pathlib import Path

# Create the output directory if needed so the dump does not fail on a fresh checkout.
model_dir = Path('../models')
model_dir.mkdir(parents=True, exist_ok=True)

# Persist the random forest under the same file name as before.
joblib.dump(rf, model_dir / 'fraud_model.pkl')

# The forest was trained on standardized features, so the fitted scaler must be
# saved with it; without it the model cannot be applied to raw inputs.
joblib.dump(scaler, model_dir / 'scaler.pkl')