In [None]:
# 01 – Data Collection & Initial Cleaning
#
# In this notebook we:
# - Load sample stock price files from data/stocks_sample/
# - Load the fraud transactions dataset from data/fraud_dataset.csv
# - Apply basic cleaning using src/data_cleaning.py
# - Save intermediate cleaned datasets for later notebooks

import os
import pandas as pd

from src.data_cleaning import (
    list_stock_files,
    load_all_stocks,
    clean_stock_data,
    load_fraud_data,
    clean_fraud_data,
    save_cleaned_data,
)

# -------------------------------------------------------------------
# Paths
# -------------------------------------------------------------------
# The data folder is addressed relative to the parent of the current
# working directory; the sample stock files live in a subfolder of it.
DATA_DIR = os.path.join(os.pardir, "data")
STOCK_DIR = os.path.join(DATA_DIR, "stocks_sample")

# Make sure the data folder exists (no error if it is already there);
# the cleaned outputs further down are written under DATA_DIR.
os.makedirs(DATA_DIR, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"DATA_DIR: {DATA_DIR}")
print(f"STOCK_DIR: {STOCK_DIR}")

# -------------------------------------------------------------------
# Load Raw Stock Data
# -------------------------------------------------------------------
# List what list_stock_files() finds in STOCK_DIR, printing file names
# only (directory part stripped) to keep the output short.
stock_files = list_stock_files(STOCK_DIR)
print("\nFound stock files:")
for stock_path in stock_files:
    print(f" - {os.path.basename(stock_path)}")

# Load the contents of STOCK_DIR; "date" is passed as the date column name.
stocks_raw = load_all_stocks(STOCK_DIR, date_col="date")
print(f"\nRaw stocks shape: {stocks_raw.shape}")
display(stocks_raw.head())

# -------------------------------------------------------------------
# Clean Stock Data
# -------------------------------------------------------------------
# Destination for the cleaned stock table (inside DATA_DIR).
clean_stocks_path = os.path.join(DATA_DIR, "cleaned_stocks.csv")

# Options handed to clean_stock_data(): the four OHLC price columns,
# the volume column, and a minimum-volume setting of 0.0.
stock_cleaning_options = {
    "price_cols": ["open", "high", "low", "close"],
    "volume_col": "volume",
    "min_volume": 0.0,
}
stocks_clean = clean_stock_data(stocks_raw, **stock_cleaning_options)

# Quick look at the result before persisting it.
print(f"\nCleaned stocks shape: {stocks_clean.shape}")
display(stocks_clean.head())

# Save cleaned stocks
save_cleaned_data(stocks_clean, clean_stocks_path)
print(f"\nSaved cleaned stocks to: {clean_stocks_path}")

# -------------------------------------------------------------------
# Load Fraud Dataset
# -------------------------------------------------------------------
# The raw fraud file sits directly in DATA_DIR.
fraud_raw_path = os.path.join(DATA_DIR, "fraud_dataset.csv")
fraud_raw = load_fraud_data(fraud_raw_path)

# Report the size and preview the first rows of the raw data.
print(f"\nRaw fraud dataset shape: {fraud_raw.shape}")
display(fraud_raw.head())

# -------------------------------------------------------------------
# Clean Fraud Dataset
# -------------------------------------------------------------------
# Destination for the cleaned fraud table (inside DATA_DIR).
clean_fraud_path = os.path.join(DATA_DIR, "cleaned_fraud_dataset.csv")

# Customer-identifying columns to remove; per the original note they are
# dropped only if they exist (behaviour lives in clean_fraud_data).
cols_to_drop = ["customer_name", "customer_email"]

# "is_fraud" is passed as the target column.
fraud_clean = clean_fraud_data(fraud_raw, target_col="is_fraud", drop_cols=cols_to_drop)

# Report the size and preview the first rows of the cleaned data.
print(f"\nCleaned fraud dataset shape: {fraud_clean.shape}")
display(fraud_clean.head())

# Save cleaned fraud data
save_cleaned_data(fraud_clean, clean_fraud_path)
print(f"\nSaved cleaned fraud dataset to: {clean_fraud_path}")

# Final marker so it is obvious the whole cell ran to the end.
print("\n=== 01_data_collection completed ===")
