In [None]:
# 01_eda.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
data = pd.read_csv('data/raw/fraud_dataset.csv')

# Quick look
print(data.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()
print(data.describe())

# Class distribution
print("Fraud vs Non-Fraud Counts:")
print(data['Class'].value_counts())

sns.countplot(x='Class', data=data)
plt.title('Class Distribution (0 = Normal, 1 = Fraud)')
plt.show()

# Fraud percentage
# Share of rows whose Class equals 1. A boolean mask is used instead of
# value_counts()[1] so that a file containing no class-1 rows reports 0%
# rather than raising KeyError.
fraud_pct = (data['Class'] == 1).mean() * 100
print(f"Fraud percentage: {fraud_pct:.4f}%")

# Missing values
print("Missing values per column:")
print(data.isnull().sum())

# Feature exploration
# Every numeric column except the Class label. Filtering (rather than
# list.remove) keeps this working when Class is not stored as a numeric
# column and therefore is absent from the numeric selection.
numeric_features = [
    name
    for name in data.select_dtypes(include=[np.number]).columns
    if name != 'Class'
]

# One histogram (with KDE overlay) per numeric feature.
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(data[col], bins=50, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()
    # Release the figure explicitly so figures do not pile up in memory
    # when this runs outside an inline notebook backend.
    plt.close(fig)

# Fraud vs Normal comparison
# One box plot per numeric feature, grouped by the Class column.
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x='Class', y=col, data=data, ax=ax)
    ax.set_title(f'{col} vs Class')
    plt.show()
    plt.close(fig)

# Correlation heatmap
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises if any are present.
corr_matrix = data.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()


: 