# Credit Card Fraud Detection Project – Full Notebook
This notebook covers data loading, preprocessing, SMOTE resampling of the training set, model training (Logistic Regression, Decision Tree, Random Forest, XGBoost, HistGradientBoosting), and evaluation, followed by next-step snippets for hyperparameter tuning, threshold selection, and deployment.

In [None]:
# Optional: install/upgrade xgboost.
# Use the %pip magic (not !pip) so the package is installed into this kernel's environment.
# %pip install --upgrade xgboost --quiet

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
print('Imports loaded')

In [None]:
# Read the transactions file, then report its dimensions and the
# relative frequency of each value in the 'Class' column.
df = pd.read_csv('creditcard.csv')
print(f"Data shape: {df.shape}")
class_share = df['Class'].value_counts(normalize=True)
print(class_share)

In [None]:
# Preprocess and split: check missing, drop Time, split, then scale Amount.
#
# The split happens BEFORE scaling so that the StandardScaler is fitted on
# training rows only; fitting it on the full frame would leak test-set
# statistics (mean/std of Amount) into training.
# `df` is never mutated here, so this cell can be re-run safely.
assert df.isnull().sum().sum() == 0, 'Missing values!'

# Features are every column except Time and the Class target.
X = df.drop(columns=['Time', 'Class'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the Amount mean/std from the training split and reuse them for the
# test split. `.assign` returns new frames (no chained-assignment warnings)
# and keeps the Amount column in its original position.
scaler = StandardScaler()
X_train = X_train.assign(Amount=scaler.fit_transform(X_train[['Amount']]).ravel())
X_test = X_test.assign(Amount=scaler.transform(X_test[['Amount']]).ravel())

print('Preprocessing done')
print('Train:', X_train.shape, 'Test:', X_test.shape)

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility);
# X_test / y_test are not passed through the sampler.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled
print(f"Resampled train: {X_train_res.shape}")

In [None]:
# XGBoost's scale_pos_weight must describe the labels the model is actually
# fitted on. The training loop fits on the SMOTE-resampled labels
# (y_train_res), so the negative/positive ratio is taken from those; using
# the original y_train would up-weight the fraud class a second time on top
# of the oversampling.
n_pos = int((y_train_res == 1).sum())
n_neg = int(len(y_train_res)) - n_pos
# Fall back to a neutral weight if there are no positive rows at all.
xgb_pos_weight = n_neg / n_pos if n_pos else 1.0

# Candidate classifiers, keyed by the display name used in the results table.
# class_weight='balanced' is derived by scikit-learn from the labels passed
# to fit(), so it adapts to whatever training labels are supplied.
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=500, n_jobs=-1, random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=50, class_weight='balanced', n_jobs=-1, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, scale_pos_weight=xgb_pos_weight, n_jobs=-1, use_label_encoder=False, eval_metric='auc', random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(max_iter=100, early_stopping=True, random_state=42)
}

In [None]:
# Fit each candidate on the resampled training data, score it on the held-out
# test split, and collect the fraud-class (label '1') metrics plus ROC AUC.
results = {}
for name, model in models.items():
    print(f"\n=== Training {name} ===")
    model.fit(X_train_res, y_train_res)

    # Positive-class probabilities feed ROC AUC; hard labels feed the report.
    proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)
    auc = roc_auc_score(y_test, proba)

    # Only the row for class '1' of the per-class report is kept.
    fraud_row = classification_report(y_test, preds, output_dict=True)['1']
    results[name] = dict(
        precision_fraud=fraud_row['precision'],
        recall_fraud=fraud_row['recall'],
        f1_fraud=fraud_row['f1-score'],
        roc_auc=auc,
    )
    print(f"{name} ROC AUC: {auc:.4f}")

In [None]:
# Build the comparison table: one row per model, one column per metric,
# ordered from highest to lowest ROC AUC.
# (pandas is already imported as `pd` in the imports cell at the top.)
summary_df = pd.DataFrame(results).T
ranked_summary = summary_df.sort_values('roc_auc', ascending=False)
print('\n=== Model Comparison ===')
print(ranked_summary)

## Hyperparameter Tuning Example
Use `RandomizedSearchCV` to search over Random Forest hyperparameters, scoring each sampled combination by cross-validated F1:
```python
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# SMOTE lives inside the pipeline so it is re-fitted on the training part of
# every CV fold only. Searching on the already-resampled X_train_res would let
# synthetic points interpolated from validation-fold rows leak into training
# and inflate the cross-validated F1.
rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', rf),
])
# Parameter names carry the 'rf__' prefix because they target the pipeline step.
param_dist = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}
# random_state makes the sampled parameter combinations reproducible.
search = RandomizedSearchCV(pipe, param_dist, n_iter=20, cv=3, scoring='f1', n_jobs=-1, random_state=42)
search.fit(X_train, y_train)
print(search.best_params_)
```

## Precision-Recall Threshold Selection
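Adjust the probability cutoff used to flag a transaction as fraud, instead of relying on the default decision made by `predict`: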
```python
from sklearn.metrics import precision_recall_curve
probs = models['Random Forest'].predict_proba(X_test)[:,1]
prec, rec, thresh = precision_recall_curve(y_test, probs)

# prec and rec have one more entry than thresh (the final curve point has no
# threshold), so drop that last point to keep the three arrays aligned.
denom = prec[:-1] + rec[:-1]
# Guard the 0/0 case where both precision and recall are zero.
f1 = np.divide(2 * prec[:-1] * rec[:-1], denom, out=np.zeros_like(denom), where=denom > 0)

# Example trade-off: take the cutoff that maximises F1. Swap in another rule
# (e.g. highest recall subject to a minimum precision) as the use case requires.
best_idx = int(np.argmax(f1))
best_thresh = thresh[best_idx]
print(f"Best F1 {f1[best_idx]:.4f} at threshold {best_thresh:.4f}")
preds_at_best = (probs >= best_thresh).astype(int)

# NOTE: picking the cutoff on X_test makes the reported test metrics optimistic;
# for a final estimate, select the threshold on a separate validation split.
```

## Deployment with Streamlit
```bash
streamlit run app.py
```
Alternatively, wrap the trained model in a Flask or FastAPI service to expose predictions through an API endpoint.