# 01 — EDA & Risk Framing

Risk-first EDA (no Kaggle fluff):
- quantify class imbalance
- compare amount distributions
- inspect time dynamics


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Location of the transactions CSV, relative to the notebook's directory.
DATA_PATH = '../data/creditcard.csv'
df = pd.read_csv(DATA_PATH)
cols = list(df.columns)

# Resolve column names, accepting either the capitalised or lowercase header.
label_col = 'Class' if 'Class' in cols else 'class'
time_col = 'Time' if 'Time' in cols else 'time'
amount_col = 'Amount' if 'Amount' in cols else 'amount'

# Coerce labels to integers. Values that cannot be parsed become NaN and are
# then mapped to 0 (non-fraud); report how many rows took that fallback so
# the relabelling is never silent.
parsed_labels = pd.to_numeric(df[label_col], errors='coerce')
n_unparsed = int(parsed_labels.isna().sum())
if n_unparsed:
    print(f'WARNING: {n_unparsed} rows had missing/non-numeric labels and were set to 0')
df[label_col] = parsed_labels.fillna(0).astype(int)

print('Shape:', df.shape)
print('Label:', label_col, 'Time:', time_col, 'Amount:', amount_col)
df.head()

In [None]:
# Count rows per label value; sort_index keeps the printed dict ordered by label.
class_counts = df[label_col].value_counts().sort_index()
fraud = int(class_counts.get(1, 0))
nonfraud = int(class_counts.get(0, 0))

# Guard the ratio: with no rows labelled 0 or 1 the denominator is zero,
# which would otherwise abort the cell with ZeroDivisionError.
total = fraud + nonfraud
rate = fraud / total if total else float('nan')
print('Counts:', class_counts.to_dict())
print(f'Fraud rate: {rate:.4%}')

# Bar chart of the two class sizes to make the imbalance visible.
plt.figure()
plt.bar(['Non-fraud (0)', 'Fraud (1)'], [nonfraud, fraud])
plt.title('Class Imbalance')
plt.ylabel('Count')
plt.show()

In [None]:
# Split the transactions by label so each class's amounts are plotted separately.
labels = df[label_col]
fraud_df = df[labels == 1]
nonfraud_df = df[labels == 0]

# One 100-bin histogram per class, non-fraud first, each in its own figure.
for panel_title, subset in (
    ('Non-fraud Amount Distribution', nonfraud_df),
    ('Fraud Amount Distribution', fraud_df),
):
    amounts = subset[amount_col].values
    plt.figure()
    plt.hist(amounts, bins=100)
    plt.title(panel_title)
    plt.xlabel('Amount')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Width of one time bucket in seconds (30 minutes).
bucket_seconds = 60 * 30
df['time_bucket'] = (df[time_col] // bucket_seconds).astype(int)
fraud_by_bucket = df[df[label_col]==1].groupby('time_bucket').size()

# groupby only returns buckets that contain at least one fraud row. Without
# the zero-count buckets the line plot would join neighbouring non-empty
# buckets with a straight segment and hide fraud-free periods, so expand the
# series to every bucket observed in the data and fill the gaps with 0.
if not df.empty:
    first_bucket = int(df['time_bucket'].min())
    last_bucket = int(df['time_bucket'].max())
    all_buckets = pd.RangeIndex(first_bucket, last_bucket + 1, name='time_bucket')
    fraud_by_bucket = fraud_by_bucket.reindex(all_buckets, fill_value=0)

plt.figure()
plt.plot(fraud_by_bucket.index, fraud_by_bucket.values)
plt.title('Fraud Count by 30-min Time Bucket')
plt.xlabel('Time bucket (30-min)')
plt.ylabel('Fraud count')
plt.show()

## Key takeaways
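- Extreme class imbalance → accuracy is uninformative; evaluate with PR-AUC and tune the decision threshold.
- Have the model output probabilities, then apply an explicit decision policy on top of them.
- The transaction amount column supports cost/impact framing (e.g., expected loss).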
