<a href="https://colab.research.google.com/github/Vivek-Kasturi/credit-card-fraud-detection/blob/colab-pipeline/fraud_detection_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Full pipeline: classification + detection of fraud times + 1-year forecast
# Run in Colab or local Python (requirements below)
# Requirements: pandas, numpy, scikit-learn, matplotlib, seaborn, statsmodels (optional), pmdarima (optional)
# (In Colab: pip install pmdarima if you want auto ARIMA. statsmodels often preinstalled.)

import os, warnings, math
warnings.filterwarnings('ignore')
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, classification_report
import matplotlib.pyplot as plt, seaborn as sns
sns.set(style='whitegrid')

# Params
CSV_PATH = '/content/creditcard.csv'  # update path if not in same folder
OUTPUT_DIR = '/content/outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)
ANCHOR_DATE = pd.to_datetime('2013-01-01')   # arbitrary anchor date for Time -> datetime
FORECAST_DAYS = 365

# 1) Load & quick checks
df = pd.read_csv(CSV_PATH)
print("Rows,cols:", df.shape)
print(df['Class'].value_counts())
print("Fraud ratio:", df['Class'].mean())

# 2) Preprocessing: scale Time and Amount
X = df.drop('Class', axis=1).copy()
y = df['Class'].copy()
scaler = StandardScaler()
X[['Time','Amount']] = scaler.fit_transform(X[['Time','Amount']])

# 3) Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# 4) Fit a solid, fast baseline: LogisticRegression with class_weight
clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')
clf.fit(X_train, y_train)

# 5) Evaluate on test
proba_test = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, proba_test)
aupr = average_precision_score(y_test, proba_test)
print("Logistic AUC:", round(auc,4), "AUPR:", round(aupr,4))
print("\nClassification report (threshold=0.5):")
print(classification_report(y_test, (proba_test>=0.5).astype(int), digits=4))

# Save PR curve
precision, recall, thresh = precision_recall_curve(y_test, proba_test)
plt.figure(figsize=(6,4)); plt.plot(recall, precision, lw=2); plt.xlabel('Recall'); plt.ylabel('Precision')
plt.title('Precision-Recall curve (Logistic)'); plt.grid(True)
plt.savefig(os.path.join(OUTPUT_DIR,'pr_curve_logistic.png'), bbox_inches='tight'); plt.close()

# 6) Fit on full dataset for scoring every transaction
clf_full = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')
clf_full.fit(X, y)
df['pred_proba'] = clf_full.predict_proba(X)[:,1]

# 7) Convert Time to datetime (relative) & aggregate by day
df['datetime'] = ANCHOR_DATE + pd.to_timedelta(df['Time'], unit='s')
df['date'] = pd.to_datetime(df['datetime'].dt.date)
daily = df.groupby('date').agg(
    total_transactions = ('Class','count'),
    actual_frauds = ('Class','sum'),
    expected_frauds = ('pred_proba','sum'),   # sum of predicted probabilities ~ expected number of frauds
    mean_pred_prob = ('pred_proba','mean')
).reset_index().sort_values('date')
daily['fraud_rate'] = daily['actual_frauds'] / daily['total_transactions']

# Save daily CSV and plot
daily.to_csv(os.path.join(OUTPUT_DIR,'daily_agg.csv'), index=False)
plt.figure(figsize=(12,4))
plt.plot(daily['date'], daily['actual_frauds'], label='actual frauds (daily)', alpha=0.8)
plt.plot(daily['date'], daily['expected_frauds'], label='expected_frauds (sum probs)', alpha=0.8)
plt.legend(); plt.title('Daily frauds: actual vs expected'); plt.xlabel('date'); plt.ylabel('count')
plt.savefig(os.path.join(OUTPUT_DIR,'daily_frauds.png'), bbox_inches='tight'); plt.close()

# 8) Detect suspicious time windows (simple statistical rule + rolling)
# flags: daily frauds above mean + 3*std OR days where fraud_rate is in top 1%
mean_f = daily['actual_frauds'].mean(); std_f = daily['actual_frauds'].std()
daily['anomaly_mean3std'] = daily['actual_frauds'] > (mean_f + 3*std_f)
p99 = daily['fraud_rate'].quantile(0.99)
daily['anomaly_p99_rate'] = daily['fraud_rate'] >= p99
# rolling-window burst detection (7-day rolling sum)
daily['frauds_7d'] = daily['actual_frauds'].rolling(7, min_periods=1).sum()
thr_7d = daily['frauds_7d'].mean() + 3*daily['frauds_7d'].std()
daily['anomaly_7d_burst'] = daily['frauds_7d'] > thr_7d

daily[['date','total_transactions','actual_frauds','expected_frauds','anomaly_mean3std','anomaly_p99_rate','anomaly_7d_burst']].to_csv(os.path.join(OUTPUT_DIR,'daily_anomaly_flags.csv'), index=False)

# 9) Forecasting daily fraud counts (two options)
# Option A: try SARIMAX / ARIMA (requires statsmodels)
forecast_df = None
try:
    import statsmodels.api as sm
    # Fit a simple SARIMAX(1,0,1) without seasonal terms for speed
    daily_idx = daily.set_index('date')['actual_frauds']
    model = sm.tsa.SARIMAX(daily_idx, order=(1,0,1), enforce_stationarity=False, enforce_invertibility=False)
    res = model.fit(disp=False)
    fc = res.get_forecast(steps=FORECAST_DAYS)
    fc_mean = fc.predicted_mean
    fc_ci = fc.conf_int(alpha=0.05)
    forecast_df = pd.DataFrame({
        'date': fc_mean.index,
        'predicted_frauds': fc_mean.values,
        'lower': fc_ci.iloc[:,0].values,
        'upper': fc_ci.iloc[:,1].values
    })
    print("Used SARIMAX forecasting (statsmodels).")
except Exception as e:
    # Option B: robust fallback — linear trend on day index + hold MA(7) baseline
    print("SARIMAX not available or failed:", str(e))
    daily = daily.reset_index(drop=True)
    daily['day_idx'] = (daily['date'] - daily['date'].min()).dt.days
    # Linear regression on actual_frauds
    from sklearn.linear_model import LinearRegression
    lr_time = LinearRegression()
    lr_time.fit(daily[['day_idx']], daily['actual_frauds'])
    future_dates = pd.date_range(daily['date'].max() + pd.Timedelta(days=1), periods=FORECAST_DAYS)
    future_idx = ((future_dates - daily['date'].min()).days).astype(int)
    preds = lr_time.predict(future_idx.reshape(-1,1))
    resid_std = (daily['actual_frauds'] - lr_time.predict(daily[['day_idx']])).std()
    lower = preds - 1.96*resid_std
    upper = preds + 1.96*resid_std
    preds = np.clip(preds, 0, None); lower = np.clip(lower, 0, None)
    forecast_df = pd.DataFrame({'date': future_dates, 'predicted_frauds': preds, 'lower': lower, 'upper': upper})

# Save forecast and plot
forecast_df.to_csv(os.path.join(OUTPUT_DIR,'forecast_365d.csv'), index=False)
plt.figure(figsize=(12,5))
plt.plot(daily['date'], daily['actual_frauds'], label='actual daily frauds')
plt.plot(forecast_df['date'], forecast_df['predicted_frauds'], label='forecast next 365d', color='C1')
plt.fill_between(forecast_df['date'], forecast_df['lower'], forecast_df['upper'], color='C1', alpha=0.2)
plt.legend(); plt.title('Daily frauds and 1-year forecast'); plt.savefig(os.path.join(OUTPUT_DIR,'forecast_plot.png'), bbox_inches='tight'); plt.close()

# 10) Save scored transactions (top suspicious transactions)
df = df.sort_values('pred_proba', ascending=False)
df[['datetime','Amount','Class','pred_proba']].head(200).to_csv(os.path.join(OUTPUT_DIR,'top_200_suspects.csv'), index=False)

print("All outputs saved to", OUTPUT_DIR)
print("Top 5 days flagged as anomalies (by mean+3std):")
print(daily[daily['anomaly_mean3std']].sort_values('actual_frauds', ascending=False).head(10))

Rows,cols: (284807, 31)
Class
0    284315
1       492
Name: count, dtype: int64
Fraud ratio: 0.001727485630620034
Logistic AUC: 0.968 AUPR: 0.704

Classification report (threshold=0.5):
              precision    recall  f1-score   support

           0     0.9998    0.9788    0.9892     85295
           1     0.0669    0.8784    0.1244       148

    accuracy                         0.9786     85443
   macro avg     0.5334    0.9286    0.5568     85443
weighted avg     0.9982    0.9786    0.9877     85443



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


Used SARIMAX forecasting (statsmodels).
All outputs saved to /content/outputs
Top 5 days flagged as anomalies (by mean+3std):
Empty DataFrame
Columns: [date, total_transactions, actual_frauds, expected_frauds, mean_pred_prob, fraud_rate, anomaly_mean3std, anomaly_p99_rate, frauds_7d, anomaly_7d_burst]
Index: []
