# 02 — Cleaning, Alignment & Feature Engineering
This notebook loads the raw price CSVs in `data/raw/` plus the Fear & Greed Index, aligns everything to the shared date window, and engineers per-asset features (returns, moving averages, rolling volatility).

Output: `data/processed/merged_clean.csv` (the dashboard reads this file).

### Load raw inputs
Read all `*_prices.csv` files written by Notebook 01 and the Fear & Greed CSV. The loader supports both the older export format and the current clean format.

In [None]:
import pandas as pd
from pathlib import Path

# Folder holding the raw CSV inputs (the *_prices.csv files and the Fear & Greed
# CSV). The path is relative, so it is resolved against the current working
# directory at run time.
RAW_DIR = Path("../data/raw")

def load_price_csv(path: str) -> pd.DataFrame:
    """Load one raw price CSV into a date-indexed OHLCV DataFrame.

    Two layouts are supported:

    * Old export: the first header cell is ``Price`` and the first three rows
      are header/ticker/date artifacts. Columns are assigned positionally as
      Date, Close, High, Low, Open, Volume.
    * Clean format: a regular header row that contains a ``Date`` column.
      Only the Date/Open/High/Low/Close/Volume columns that are present are
      kept (extras such as ``Adj Close`` are dropped).

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame indexed by ``Date`` (normalized to midnight, ascending).
        Rows whose date cannot be parsed are dropped; price/volume cells that
        are not numeric become NaN.

    Raises:
        ValueError: If a clean-format file has no ``Date`` column.
    """
    price_cols = ["Open", "High", "Low", "Close", "Volume"]

    # Peek at the first few rows: the header alone tells the layouts apart.
    preview = pd.read_csv(path, nrows=5)
    is_old_format = (
        len(preview.columns) > 0
        and str(preview.columns[0]).strip().lower() == "price"
    )

    if is_old_format:
        # Old format: first 3 rows are header/ticker/date artifacts
        df = pd.read_csv(
            path,
            skiprows=3,
            header=None,
            names=["Date", "Close", "High", "Low", "Open", "Volume"],
        )
    else:
        # New format (recommended): Date + yfinance columns
        df = pd.read_csv(path)
        if "Date" not in df.columns:
            raise ValueError(f"Expected a 'Date' column in {path}. Got columns: {list(df.columns)}")
        # Keep only the columns we care about (some exports include 'Adj Close')
        keep = [c for c in ["Date", *price_cols] if c in df.columns]
        # .copy() detaches the selection from the frame it was taken from, so
        # the column assignments below cannot raise SettingWithCopyWarning.
        df = df[keep].copy()

    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = df.dropna(subset=["Date"]).set_index("Date").sort_index()
    df.index = df.index.normalize()

    # Ensure numerics
    for col in price_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

def load_fear_greed_csv(path: str) -> pd.DataFrame:
    """Load the Fear & Greed CSV into a date-indexed DataFrame.

    The file must contain ``timestamp`` and ``value`` columns. ``timestamp``
    is parsed as a date, renamed to ``Date`` and used as the (sorted,
    midnight-normalized) index. ``value`` is coerced to a numeric
    ``FG_Value`` column, with unparseable entries becoming NaN.

    When the file has no ``value_classification`` column, one is derived from
    ``FG_Value``: below 25 "Extreme Fear", below 45 "Fear", below 55
    "Neutral", below 75 "Greed", otherwise "Extreme Greed". Rows whose score
    is NaN are left unclassified (NaN) rather than being labeled.

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame indexed by ``Date`` that keeps the original columns and
        adds ``FG_Value`` (and ``value_classification`` when it was absent).
    """
    fg = pd.read_csv(path, parse_dates=["timestamp"])
    fg = fg.rename(columns={"timestamp": "Date"}).set_index("Date").sort_index()
    fg.index = fg.index.normalize()
    fg["FG_Value"] = pd.to_numeric(fg["value"], errors="coerce")
    if "value_classification" not in fg.columns:
        # Half-open bins [low, high) reproduce the <25 / <45 / <55 / <75
        # cut-offs. Unlike a chain of "<" comparisons, pd.cut leaves NaN
        # scores unlabeled instead of letting them fall through to the last
        # label.
        labels = pd.cut(
            fg["FG_Value"],
            bins=[float("-inf"), 25, 45, 55, 75, float("inf")],
            labels=["Extreme Fear", "Fear", "Neutral", "Greed", "Extreme Greed"],
            right=False,
        )
        # Store plain strings (object dtype) rather than a Categorical.
        fg["value_classification"] = labels.astype(object)
    return fg

# Load all *_prices.csv saved by 01_data_collection.ipynb
price_files = sorted(RAW_DIR.glob("*_prices.csv"))
if not price_files:
    raise FileNotFoundError(f"No *_prices.csv files found in {RAW_DIR.resolve()}. Run 01_data_collection.ipynb first.")

# Check the sentiment file up front so a missing input fails with the same
# actionable message as missing price files, before any loading work is done.
fg_path = RAW_DIR / "fear_greed_index.csv"
if not fg_path.is_file():
    raise FileNotFoundError(f"{fg_path.name} not found in {RAW_DIR.resolve()}. Run 01_data_collection.ipynb first.")

# Key each frame by ticker: the file stem without "_prices", upper-cased.
prices = {
    path.stem.replace("_prices", "").upper(): load_price_csv(str(path))
    for path in price_files
}

fg = load_fear_greed_csv(str(fg_path))

# Summary of what was loaded: shape and covered date span per input
print("Loaded assets:", sorted(prices.keys()))
for t, df_ in prices.items():
    print(f"  {t}: {df_.shape}  ({df_.index.min().date()} → {df_.index.max().date()})")
print(f"  F&G: {fg.shape}  ({fg.index.min().date()} → {fg.index.max().date()})")

### Align to a common date window
Find the overlapping range across all selected assets and the Fear & Greed series so comparisons are apples-to-apples.

In [None]:
# First and last available date of every price series plus the sentiment series
mins = [df_.index.min() for df_ in prices.values()] + [fg.index.min()]
maxs = [df_.index.max() for df_ in prices.values()] + [fg.index.max()]

# An empty series yields NaT here, which would make max()/min() meaningless.
if any(pd.isna(ts) for ts in mins + maxs):
    raise ValueError("At least one input series has no valid dates; cannot compute a common date window.")

# The shared window runs from the latest start to the earliest end.
common_start = max(mins)
common_end = min(maxs)

# Without this guard, inputs that do not overlap would be sliced to empty
# frames and the notebook would go on to write an empty dataset.
if common_start > common_end:
    raise ValueError(
        f"No overlapping date range across inputs: latest start {common_start} is after earliest end {common_end}."
    )

for t in list(prices.keys()):
    prices[t] = prices[t].loc[common_start:common_end]
fg = fg.loc[common_start:common_end]

print("Aligned date window:", common_start, "→", common_end)
print("Aligned shapes:")
for t, df_ in prices.items():
    print(f"  {t}: {df_.shape}")
print(f"  F&G: {fg.shape}")

# Show a sample
first_ticker = sorted(prices.keys())[0]
prices[first_ticker].head()

### Clean duplicates and missing values
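Remove duplicate dates (keeping the first occurrence), forward-fill missing values, and re-check that the numeric columns are ready for feature engineering. Timestamps were already normalized to midnight when the files were loaded.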

In [None]:
price_cols = ["Open", "High", "Low", "Close", "Volume"]

for t in list(prices.keys()):
    # Drop duplicates (keep first occurrence). .copy() gives an independent
    # frame so the column assignments below are safe.
    frame = prices[t][~prices[t].index.duplicated(keep="first")].copy()

    # Numeric enforcement runs BEFORE the forward fill: any cell that fails
    # to parse becomes NaN here and is then filled like other missing values.
    for col in price_cols:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors="coerce")

    # Forward fill missing values (common for daily financial series)
    prices[t] = frame.ffill()

# Same order for the sentiment series: de-duplicate, coerce, then fill
fg = fg[~fg.index.duplicated(keep="first")].copy()
fg["FG_Value"] = pd.to_numeric(fg["FG_Value"], errors="coerce")
fg = fg.ffill()

# Ensure classification exists
if "value_classification" not in fg.columns:
    # Half-open bins [low, high) reproduce the <25 / <45 / <55 / <75 cut-offs.
    # A score that is still NaN (e.g. a leading gap that forward fill cannot
    # reach) stays unlabeled instead of being tagged "Extreme Greed".
    fg["value_classification"] = pd.cut(
        fg["FG_Value"],
        bins=[float("-inf"), 25, 45, 55, 75, float("inf")],
        labels=["Extreme Fear", "Fear", "Neutral", "Greed", "Extreme Greed"],
        right=False,
    ).astype(object)

print("Duplicates removed and types fixed.")

### Feature engineering (per asset)
For each asset, compute daily returns, 7/30-day moving averages, and 30-day rolling volatility.

In [None]:
# Add the engineered feature columns to every asset frame, visiting tickers
# in alphabetical order. Columns are prefixed with the ticker symbol.
for t in sorted(prices):
    df_ = prices[t]
    close = df_["Close"]

    # Day-over-day percentage change of the close
    daily_return = close.pct_change()
    df_[f"{t}_Return"] = daily_return

    # 7-row and 30-row rolling means of the close
    for window in (7, 30):
        df_[f"{t}_MA{window}"] = close.rolling(window).mean()

    # 30-row rolling standard deviation of the daily return
    df_[f"{t}_Vol30"] = daily_return.rolling(30).std()

    prices[t] = df_
    print(f"{t} features created.")

### Merge assets + sentiment
Inner-join sentiment with each asset’s engineered features on date to produce one tidy analysis table.

In [None]:
# Start from sentiment and inner-join each asset's engineered features
merged = fg[["FG_Value", "value_classification"]].copy()

for t in sorted(prices.keys()):
    df_ = prices[t]
    subset = df_[["Close", "Volume", f"{t}_Return", f"{t}_MA7", f"{t}_MA30", f"{t}_Vol30"]].copy()
    subset = subset.rename(columns={"Close": f"Close_{t}", "Volume": f"Volume_{t}"})
    merged = merged.join(subset, how="inner")

print(f"Merged shape: {merged.shape}")
merged.head()

### Save processed dataset
Drop the initial rows lost to rolling windows (e.g., the first ~30 days) and write the final dataset to `data/processed/merged_clean.csv`.

In [None]:
# Drop NaN rows created by the rolling windows (first ~30 days)
merged_clean = merged.dropna()

# Save to processed folder
merged_clean.to_csv("../data/processed/merged_clean.csv")

print(f"Final cleaned data saved. Shape: {merged_clean.shape}")
print("Columns:", list(merged_clean.columns))
merged_clean.head()