# Notebook 1: Data Ingestion & Initial Processing

**Purpose**: Load the raw dataset, handle missing values, remove duplicates, perform basic sanity checks, draw a stratified subsample, and verify that the labels are normalized (0 = normal, 1 = anomaly).

**Outputs**:
- `cleaned_data.csv` (the cleaned, subsampled dataset) → `data/processed/`
- `data_summary.json` → `results/`
- `class_distribution_raw.png` → `figures/`

---

In [None]:
# Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed so the random sampling done with RANDOM_SEED is repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Project layout, anchored at the parent of the current working directory
BASE_DIR = Path('.').resolve().parent
RAW_DATA_DIR = BASE_DIR.joinpath('data', 'raw')
PROCESSED_DIR = BASE_DIR.joinpath('data', 'processed')
RESULTS_DIR = BASE_DIR.joinpath('results')

# Only the two output folders are created; the raw data folder is left untouched
for _output_dir in (PROCESSED_DIR, RESULTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

print(f"Base Directory: {BASE_DIR}")
print(f"Raw Data Directory: {RAW_DATA_DIR}")

## 1. Load Raw Dataset

In [None]:
# Read the raw credit card fraud CSV into a DataFrame
raw_file = RAW_DATA_DIR.joinpath('creditcard.csv')
print(f"Loading data from: {raw_file}")

df_raw = pd.read_csv(raw_file)

# Report the size of the loaded table, then its column names
print(f"\nDataset Shape: {df_raw.shape}")
column_names = df_raw.columns.tolist()
print(f"Columns: {column_names}")

In [None]:
# Print a banner, then let pandas print its own per-column overview of the raw frame
print("Dataset Info:", "=" * 50, sep="\n")
df_raw.info()

In [None]:
# Statistical summary
# describe() is deliberately left as the last bare expression of the cell so the
# notebook renders the returned summary table as the cell output.
df_raw.describe()

## 2. Handle Missing Values

In [None]:
# Count the nulls in every column and express them as a percentage of all rows
missing_counts = df_raw.isna().sum()
missing_pct = missing_counts.div(len(df_raw)).mul(100)

missing_df = pd.DataFrame(
    {
        'Missing Count': missing_counts,
        'Missing %': missing_pct,
    }
)

# Only columns that actually contain nulls are listed in the report
has_missing = missing_df['Missing Count'] > 0
print("Missing Values Summary:")
print(missing_df[has_missing])

if not has_missing.any():
    print("\n✅ No missing values found!")

In [None]:
# Discard every row that contains at least one null and report how many were lost
rows_before = len(df_raw)
df_clean = df_raw.dropna(axis=0, how='any')
rows_dropped = rows_before - len(df_clean)
print(f"Rows dropped due to missing values: {rows_dropped}")

## 3. Remove Duplicates

In [None]:
# Flag every repeat of an earlier row (the first occurrence itself is not flagged)
duplicate_mask = df_clean.duplicated(keep='first')
duplicates_count = duplicate_mask.sum()
print(f"Duplicate rows found: {duplicates_count}")

# Keep only the unflagged rows, i.e. one copy of each distinct row
df_clean = df_clean[~duplicate_mask]
print(f"Shape after removing duplicates: {df_clean.shape}")

## 4. Basic Sanity Checks

In [None]:
# Label column: 0 is reported as "Normal", 1 as "Fraud"
TARGET_COLUMN = 'Class'

print("Target Column Distribution (Full Dataset):", "=" * 50, sep="\n")

# Absolute and percentage frequency of each label in the cleaned data
class_labels = df_clean[TARGET_COLUMN]
target_counts = class_labels.value_counts()
target_pct = class_labels.value_counts(normalize=True).mul(100)

normal_total = target_counts[0]
print(f"Normal (0): {normal_total:,} ({target_pct[0]:.4f}%)")
fraud_total = target_counts[1]
print(f"Fraud (1):  {fraud_total:,} ({target_pct[1]:.4f}%)")

# Number of normal rows per fraud row
print(f"\nClass Imbalance Ratio: 1:{normal_total / fraud_total:.1f}")

In [None]:
# Visualize class distribution: a log-scale bar chart next to a percentage pie chart
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

class_names = ['Normal', 'Fraud']
class_sizes = [target_counts[0], target_counts[1]]
colors = ['#2ecc71', '#e74c3c']

# Bar plot (log y-axis so a much smaller class still shows up as a visible bar)
ax1 = axes[0]
ax1.bar(class_names, class_sizes, color=colors)
ax1.set_ylabel('Count')
ax1.set_title('Class Distribution (Full Dataset)')
ax1.set_yscale('log')

# Pie chart (the fraud slice is pulled out slightly via explode)
ax2 = axes[1]
ax2.pie(class_sizes, labels=class_names,
        autopct='%1.2f%%', colors=colors, explode=[0, 0.1])
ax2.set_title('Class Distribution (%)')

plt.tight_layout()

# Make sure the figures folder exists before writing: savefig fails with
# FileNotFoundError when the target directory is missing.
figures_dir = RESULTS_DIR.parent / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'class_distribution_raw.png', dpi=150)
plt.show()

In [None]:
# Show the dtype of every column in the cleaned frame
print("Data Types:", df_clean.dtypes, sep="\n")

## 5. Stratified Subsampling (2000 samples with ~5% anomalies)

In [None]:
# Configuration
MAX_SAMPLES = 2000
TARGET_ANOMALY_RATIO = 0.05  # 5% anomalies

# Calculate sample sizes.
# round() rather than int(): int() truncates, so a product that lands just below a
# whole number because of floating-point error (e.g. 100 * 0.29 is
# 28.999999999999996) would silently lose one anomaly.
n_anomalies = round(MAX_SAMPLES * TARGET_ANOMALY_RATIO)  # 100 anomalies
n_normal = MAX_SAMPLES - n_anomalies  # 1900 normal

print(f"Target samples: {MAX_SAMPLES}")
print(f"Normal samples: {n_normal}")
print(f"Anomaly samples: {n_anomalies}")
print(f"Target anomaly ratio: {TARGET_ANOMALY_RATIO*100}%")

In [None]:
# Split the cleaned data by label so each class can be sampled on its own
is_normal = df_clean[TARGET_COLUMN] == 0
is_fraud = df_clean[TARGET_COLUMN] == 1
df_normal = df_clean[is_normal]
df_fraud = df_clean[is_fraud]

print(f"Available normal samples: {len(df_normal)}")
print(f"Available fraud samples: {len(df_fraud)}")

In [None]:
# Stratified sampling: draw each class separately, then combine and shuffle
np.random.seed(RANDOM_SEED)

# Sample normal class (use all if fewer than required, else sample).
# DataFrame.sample raises ValueError when n exceeds the number of available rows,
# so a shortage is handled the same way as for the fraud class below.
if len(df_normal) < n_normal:
    df_normal_sampled = df_normal.copy()
    print(f"⚠️ Using all {len(df_normal)} normal samples (less than target {n_normal})")
else:
    df_normal_sampled = df_normal.sample(n=n_normal, random_state=RANDOM_SEED)

# Sample fraud class (use all if not more than required, else sample)
if len(df_fraud) <= n_anomalies:
    df_fraud_sampled = df_fraud.copy()
    # Warn only on a real shortfall; an exact match already meets the target
    if len(df_fraud) < n_anomalies:
        print(f"⚠️ Using all {len(df_fraud)} fraud samples (less than target {n_anomalies})")
else:
    df_fraud_sampled = df_fraud.sample(n=n_anomalies, random_state=RANDOM_SEED)

# Combine
df_subsampled = pd.concat([df_normal_sampled, df_fraud_sampled], ignore_index=True)

# Shuffle so the two classes are interleaved instead of stacked one after the other
df_subsampled = df_subsampled.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print(f"\nSubsampled dataset shape: {df_subsampled.shape}")

In [None]:
# Class balance of the subsample, as counts and as percentages
print("Subsampled Dataset Distribution:", "=" * 50, sep="\n")

sub_labels = df_subsampled[TARGET_COLUMN]
sub_counts = sub_labels.value_counts()
sub_pct = sub_labels.value_counts(normalize=True).mul(100)

print(f"Normal (0): {sub_counts[0]:,} ({sub_pct[0]:.2f}%)")
print(f"Fraud (1):  {sub_counts[1]:,} ({sub_pct[1]:.2f}%)")

## 6. Label Normalization

In [None]:
# Both labels 0 and 1 must be present in the subsample, and nothing else
observed_labels = set(df_subsampled[TARGET_COLUMN].unique())
assert observed_labels == {0, 1}, "Labels should be 0 and 1"
print("✅ Labels are already normalized (0 = Normal, 1 = Anomaly)")

## 7. Save Outputs

In [None]:
# Write the subsampled frame to disk without the pandas index column
output_path = PROCESSED_DIR.joinpath('cleaned_data.csv')
df_subsampled.to_csv(output_path, index=False)
print(f"✅ Saved cleaned data to: {output_path}")

In [None]:
# Assemble the run summary and write it as JSON.
# Values coming from numpy/pandas are cast to built-in int/float/str/list first so
# that the json module can serialize them.
feature_names = [name for name in df_subsampled.columns if name != TARGET_COLUMN]

class_distribution = {
    "normal_count": int(sub_counts[0]),
    "anomaly_count": int(sub_counts[1]),
    "normal_percentage": float(round(sub_pct[0], 2)),
    "anomaly_percentage": float(round(sub_pct[1], 2)),
}

preprocessing_steps = [
    "Loaded raw creditcard.csv",
    "Dropped missing values",
    "Removed duplicates",
    f"Stratified subsampling to {MAX_SAMPLES} samples",
    f"Target anomaly ratio: {TARGET_ANOMALY_RATIO*100}%",
]

data_summary = {
    "source_file": str(raw_file),
    "original_shape": list(df_raw.shape),
    "cleaned_shape": list(df_subsampled.shape),
    "random_seed": int(RANDOM_SEED),
    "target_column": TARGET_COLUMN,
    "class_distribution": class_distribution,
    "features": {
        # every column except the target counts as a feature
        "total": int(df_subsampled.shape[1] - 1),
        "names": feature_names,
    },
    "missing_values_dropped": int(rows_dropped),
    "duplicates_dropped": int(duplicates_count),
    "preprocessing_steps": preprocessing_steps,
}

summary_path = RESULTS_DIR.joinpath('data_summary.json')
with summary_path.open('w') as f:
    json.dump(data_summary, f, indent=2)

print(f"✅ Saved data summary to: {summary_path}")

In [None]:
# Echo the summary as indented JSON under a banner
print("\nData Summary:", "=" * 50, sep="\n")
summary_text = json.dumps(data_summary, indent=2)
print(summary_text)

## 8. Verification

In [None]:
# Round-trip check: read the saved CSV back from disk and report what it contains
saved_file = PROCESSED_DIR.joinpath('cleaned_data.csv')
df_verify = pd.read_csv(saved_file)
print(f"Verification - Loaded shape: {df_verify.shape}")

class_tally = df_verify[TARGET_COLUMN].value_counts().to_dict()
print(f"Verification - Class distribution: {class_tally}")
print("\n✅ Notebook 1 Complete!")