# 01 - EDA: Transaction Pattern Analysis


In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.data import DataPaths, ensure_dataset, load_dataframe_from_csv

# Build the path configuration object that ensure_dataset expects.
paths = DataPaths()
# NOTE(review): presumably returns the CSV location, generating a 20,000-row
# dataset with a fixed seed when the file is missing — confirm against
# src.data.ensure_dataset.
csv_path = ensure_dataset(paths, generate_if_missing=True, n_rows=20000, seed=42)
# `df` is the working frame that every later cell in this notebook reads.
df = load_dataframe_from_csv(csv_path)
# Last expression of the cell: shows the first rows as the cell output.
df.head()


## Overview


In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each column of df becomes one row of the summary.
df.describe(include='all').transpose()


## Class Balance


In [None]:
# Relative frequency of each distinct is_fraud value (normalize=True returns
# proportions that sum to 1 rather than raw counts).
df['is_fraud'].value_counts(normalize=True)


## Amount Distribution and Outliers


In [None]:
# Apply the seaborn whitegrid theme (this also affects later plots).
sns.set_theme(style='whitegrid')

# Two side-by-side panels: the left one shows the shape of the distribution,
# the right one highlights outliers.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left: 50-bin histogram of amount with a KDE curve overlaid.
sns.histplot(data=df, x='amount', bins=50, kde=True, ax=ax[0])

# Right: horizontal box plot of the same column.
sns.boxplot(data=df, x='amount', ax=ax[1])

plt.show()


## Fraud Rate by Category, Time, and Location


In [None]:
# One bar chart per grouping column: mean of is_fraud within each group,
# ordered from the highest to the lowest value.
for col in ('merchant_category', 'time_of_day', 'location'):
    per_group = df.groupby(col)['is_fraud']
    rate = per_group.agg('mean').sort_values(ascending=False)

    # A fresh figure per column so the charts do not draw over each other.
    plt.figure(figsize=(8, 3))
    sns.barplot(x=rate.index, y=rate.to_numpy())
    plt.title(f'Fraud rate by {col}')
    # Tilt the tick labels so longer group names stay readable.
    plt.xticks(rotation=30)
    plt.show()


## Heatmap: Amount Deciles vs Time of Day (Mean Fraud Rate)


In [None]:
# Bucket transaction amounts into equal-frequency bins. q=10 asks for deciles;
# duplicates='drop' merges repeated quantile edges, so fewer than ten bins can
# come back when many amounts are identical.
# NOTE: this adds an 'amount_bin' column to df, visible to every later cell.
df['amount_bin'] = pd.qcut(df['amount'], q=10, duplicates='drop')

# Mean of is_fraud for each (amount bin, time_of_day) pair — the fraud rate,
# assuming is_fraud is a 0/1 flag.
# 'amount_bin' is categorical, so `observed` is passed explicitly: relying on
# the default raises a FutureWarning on pandas >= 2.2 and the default value is
# changing. observed=False pins the behaviour this notebook was written
# against.
pivot = df.pivot_table(
    index='amount_bin',
    columns='time_of_day',
    values='is_fraud',
    aggfunc='mean',
    observed=False,
)

plt.figure(figsize=(8, 5))
# annot/fmt print each cell's value with two decimals on top of the colour.
sns.heatmap(pivot, annot=True, fmt='.2f', cmap='Reds')
plt.title('Fraud rate heatmap: amount deciles x time_of_day')
plt.show()


## Save a few figures to reports/figures


In [None]:
# Create the output directory if needed; exist_ok=True makes re-running the
# cell harmless.
os.makedirs('reports/figures', exist_ok=True)

# Draw the amount histogram on its own figure.
plt.figure(figsize=(6, 4))
sns.histplot(data=df, x='amount', bins=50, kde=True)
plt.title('Amount distribution')

# Fit title and axis labels inside the canvas before writing the PNG.
plt.tight_layout()
plt.savefig('reports/figures/amount_distribution.png', dpi=150)

# Close the figure so it is not rendered inline and its memory is released.
plt.close()
