# Fraud Detection - Data Exploration

This notebook explores the fraud detection dataset and prepares it for model training.

## Objectives:
1. Load and explore the PaySim dataset
2. Analyze data distribution and characteristics  
3. Preprocess and clean the data
4. Create train/validation/test splits
5. Save processed datasets for model training

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import kagglehub

In [None]:
# =========================
# Paths
# =========================
# All locations are relative to the current working directory and are
# created up front so later cells can write to them without checking.
RAW_DIR = "notebooks/data/raw/"
RAW_CSV_PATH = os.path.join(RAW_DIR, "paysim.csv")

# Rooted at "notebooks/" like every other path here; the earlier spelling
# "notebook/" sent the processed splits to a stray sibling directory.
PROCESSED_DATA_PATH = "notebooks/data/processed/"
FIGURE_PATH = "notebooks/output/figures/"

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
os.makedirs(FIGURE_PATH, exist_ok=True)

In [None]:
# =========================
# Data Download
# =========================
def download_dataset_if_needed():
    """
    Ensure the raw PaySim CSV is available at RAW_CSV_PATH.

    If RAW_CSV_PATH already exists, nothing is downloaded. Otherwise the
    dataset is fetched with KaggleHub and the first CSV file (by sorted
    file name) in the returned directory is copied to RAW_CSV_PATH.

    Raises:
        FileNotFoundError: If the downloaded dataset contains no CSV file.
    """
    # Local import: only the download path needs it.
    import shutil

    if os.path.exists(RAW_CSV_PATH):
        print("PaySim dataset already exists. Skipping download.")
        return

    print("Downloading PaySim dataset from KaggleHub...")
    dataset_path = kagglehub.dataset_download("ealaxi/paysim1")

    # os.listdir returns entries in arbitrary order; sort so the same file
    # is picked on every run if the dataset ever ships more than one CSV.
    csv_names = sorted(
        name for name in os.listdir(dataset_path) if name.endswith(".csv")
    )
    if not csv_names:
        raise FileNotFoundError("No CSV file found in downloaded PaySim dataset.")

    src = os.path.join(dataset_path, csv_names[0])

    # Copy rather than move: os.replace(src, dst) can fail when the download
    # directory and RAW_DIR are on different filesystems, and moving the file
    # would remove it from the location KaggleHub returned.
    # The copy goes to a temporary name first and is renamed afterwards, so an
    # interrupted copy never leaves a partial file at RAW_CSV_PATH (whose mere
    # existence is what makes later runs skip the download).
    partial_path = RAW_CSV_PATH + ".part"
    shutil.copy2(src, partial_path)
    os.replace(partial_path, RAW_CSV_PATH)

    print(f"Dataset saved to {RAW_CSV_PATH}")

In [None]:
# =========================
# Data Loading & Processing
# =========================
def load_data():
    """
    Return the raw PaySim data as a DataFrame.

    Makes sure the CSV is present on disk (downloading it if required)
    before reading it from RAW_CSV_PATH.
    """
    download_dataset_if_needed()
    print("Loading raw PaySim data...")
    raw_frame = pd.read_csv(RAW_CSV_PATH)
    return raw_frame

def clean_data(df):
    """
    Drop incomplete rows and add the integer ``is_fraud`` label.

    Rows containing any missing value are removed, then the ``isFraud``
    column is cast to int and stored as ``is_fraud``. The input frame is
    left untouched; a new DataFrame is returned.
    """
    print("Cleaning data...")
    complete_rows = df.dropna()
    fraud_labels = complete_rows["isFraud"].astype(int)
    return complete_rows.assign(is_fraud=fraud_labels)

def create_text_narratives(df):
    """
    Add a ``text`` column holding a one-sentence description of each row.

    The sentence is assembled from the account names, amount, transaction
    type and the before/after balances of sender and receiver. The column
    is written onto ``df`` itself, and that same frame is returned.
    """
    print("Creating enriched transaction narratives...")

    # Alternating literal fragments and per-row column values, in reading order.
    pieces = [
        "A financial transaction where account ",
        df["nameOrig"],
        " sent ",
        df["amount"].astype(str),
        " units to account ",
        df["nameDest"],
        " using transaction type ",
        df["type"],
        ". Sender balance changed from ",
        df["oldbalanceOrg"].astype(str),
        " to ",
        df["newbalanceOrig"].astype(str),
        ". Receiver balance changed from ",
        df["oldbalanceDest"].astype(str),
        " to ",
        df["newbalanceDest"].astype(str),
        ".",
    ]

    # Concatenate strictly left to right so each step is str/Series + next piece.
    narrative = pieces[0] + pieces[1]
    for piece in pieces[2:]:
        narrative = narrative + piece

    df["text"] = narrative
    return df

In [None]:
# =========================
# Visualization
# =========================
def visualize_data(df):
    """
    Plot the ``is_fraud`` class distribution and save it as two bar charts.

    One chart shows absolute counts per class, the other shows each class
    as a percentage of all rows with the value written above each bar.
    Both images are written into FIGURE_PATH.
    """
    print("Generating visualizations...")

    class_axis_label = "Class (0 = Non-Fraud, 1 = Fraud)"
    labels = df["is_fraud"]

    # ---------- Absolute Count Plot ----------
    counts_by_class = labels.value_counts().sort_index()
    counts_file = os.path.join(FIGURE_PATH, "fraud_distribution_counts.png")

    fig, ax = plt.subplots(figsize=(6, 4))
    counts_by_class.plot(kind="bar", ax=ax)
    ax.set_title("Fraud vs Non-Fraud Transactions (Count)")
    ax.set_xlabel(class_axis_label)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(counts_file)
    plt.close(fig)

    # ---------- Percentage Plot ----------
    share_by_class = labels.value_counts(normalize=True).sort_index() * 100
    percentage_file = os.path.join(FIGURE_PATH, "fraud_distribution_percentage.png")

    fig, ax = plt.subplots(figsize=(6, 4))
    share_by_class.plot(kind="bar", ax=ax)
    ax.set_title("Fraud vs Non-Fraud Transactions (Percentage)")
    ax.set_xlabel(class_axis_label)
    ax.set_ylabel("Percentage (%)")

    # Fixed 0-100 scale keeps the y-axis readable as a percentage.
    ax.set_ylim(0, 100)

    # Write each bar's value just above its top edge.
    for position, pct in enumerate(share_by_class.values):
        ax.text(position, pct + 0.5, f"{pct:.2f}%", ha="center")

    fig.tight_layout()
    fig.savefig(percentage_file)
    plt.close(fig)

    print("Saved visualization files:")
    print(" - fraud_distribution_counts.png")
    print(" - fraud_distribution_percentage.png")

In [None]:
# =========================
# Train / Val / Test Split
# =========================
def split_and_save(df):
    """
    Split ``df`` into train/validation/test sets and write them as CSV.

    70% of the rows go to train; the remaining 30% is halved into
    validation and test. Both splits are stratified on ``is_fraud`` and
    use a fixed random seed. The files train.csv, val.csv and test.csv
    are written into PROCESSED_DATA_PATH without the index column.
    """
    print("Splitting dataset...")

    seed = 42

    # First cut: hold out 30% of the rows for validation + test.
    train_df, holdout_df = train_test_split(
        df,
        test_size=0.3,
        stratify=df["is_fraud"],
        random_state=seed,
    )

    # Second cut: divide the hold-out evenly between validation and test.
    val_df, test_df = train_test_split(
        holdout_df,
        test_size=0.5,
        stratify=holdout_df["is_fraud"],
        random_state=seed,
    )

    # (file stem, report label, frame) for each split, in output order.
    splits = (
        ("train", "Train:", train_df),
        ("val", "Val:  ", val_df),
        ("test", "Test: ", test_df),
    )

    for stem, _, part in splits:
        part.to_csv(os.path.join(PROCESSED_DATA_PATH, f"{stem}.csv"), index=False)

    print("Processed data saved:")
    for _, label, part in splits:
        print(f"  {label} {len(part)} samples")

In [None]:
# =========================
# Main Pipeline
# =========================
def main():
    """
    Run the whole pipeline: load the raw data, clean it, add the text
    narratives, save the class-distribution plots, then split and save.
    """
    prepared = create_text_narratives(clean_data(load_data()))
    visualize_data(prepared)
    split_and_save(prepared)

if __name__ == "__main__":
    main()
