# 03 — Exploratory Data Analysis (EDA)
This notebook explores the processed dataset from Notebook 02 (`data/processed/merged_clean.csv`). It’s written to adapt to whatever assets are present in the file (BTC/ETH/BNB/etc.).

We focus on:
- Price trends and context
- Fear & Greed distribution and regimes
- Sentiment vs returns and volatility
- Simple lead/lag checks

### Load the dataset and set plot defaults
Read `merged_clean.csv`, detect which tickers are available, and set a consistent plotting style for the notebook.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Relative path to the processed dataset produced by Notebook 02.
DATA_PATH = "../data/processed/merged_clean.csv"

def load_df(path: str = DATA_PATH) -> pd.DataFrame:
    """Read the processed CSV at *path* into a DataFrame.

    The first CSV column becomes the index, and pandas is asked to parse
    that index as dates.
    """
    frame = pd.read_csv(path, parse_dates=True, index_col=0)
    return frame

def ensure_df() -> pd.DataFrame:
    """Return the module-level ``df``, loading it first when needed.

    The frame is (re)loaded through ``load_df()`` when the global ``df`` is
    undefined, is ``None``, or reports itself as empty; otherwise the
    existing object is returned untouched.
    """
    global df
    cached = globals().get("df")
    needs_load = cached is None or getattr(cached, "empty", False)
    if needs_load:
        df = load_df()
    return df

def detect_tickers(df_: pd.DataFrame) -> list[str]:
    """Return the sorted, de-duplicated tickers found in ``Close_<TICKER>`` columns.

    Every column whose name starts with ``Close_`` contributes the text after
    that prefix; columns without the prefix are ignored.
    """
    prefix = "Close_"
    unique_tickers = {
        name[len(prefix):] for name in df_.columns if name.startswith(prefix)
    }
    return sorted(unique_tickers)

# Notebook-wide plot defaults: seaborn white grid and a 12x6 default figure.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Load (or reuse) the dataset and discover which assets it contains.
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    # Nothing downstream can run without at least one Close_<TICKER> column.
    raise ValueError("No assets detected (expected columns like 'Close_<TICKER>' in merged_clean.csv).")

print("Data loaded:", df.shape)
print("Detected tickers:", tickers)
# Last expression of the cell: the notebook renders the first rows.
df.head()

### Quick summary + sentiment distribution
Start with a quick `describe()` for a small set of columns and a histogram of the Fear & Greed Index to understand the sample.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
assets = tickers[:2]  # keep the summary to the first two detected assets

# Collect the close-price columns that actually exist for those assets.
price_cols = []
for ticker in assets:
    column = f"Close_{ticker}"
    if column in df.columns:
        price_cols.append(column)

# Summary statistics for prices and sentiment
cols = [*price_cols, "FG_Value"]
print("Summary columns:", cols)
summary = df[cols].describe()
print(summary)

# Distribution of sentiment (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["FG_Value"], bins=20, kde=True, color="orange", ax=ax)
ax.set_title("Distribution of Fear & Greed Index")
ax.set_xlabel("Index Value (0-100)")
plt.show()

### Price vs sentiment over time
Plot one asset’s price (the first detected ticker) alongside the Fear & Greed Index to see how they move through the same period.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like 'Close_<TICKER>').")
asset = tickers[0]  # tickers are sorted, so this is the alphabetically first asset
price_col = f"Close_{asset}"

# Plot Asset Price vs Fear & Greed Index on a shared time axis
fig, ax1 = plt.subplots(figsize=(14, 7))

# Price on the left y-axis
price_color = "tab:blue"
ax1.set_xlabel("Date")
ax1.set_ylabel(f"{asset} Price (USD)", color=price_color)
ax1.plot(df.index, df[price_col], color=price_color, label=f"{asset} Price")
ax1.tick_params(axis="y", labelcolor=price_color)

# Sentiment on a second y-axis that shares the same x-axis
ax2 = ax1.twinx()
sentiment_color = "tab:orange"
ax2.set_ylabel("Fear & Greed Index", color=sentiment_color)
ax2.plot(df.index, df["FG_Value"], color=sentiment_color, alpha=0.6, label="Sentiment")
ax2.tick_params(axis="y", labelcolor=sentiment_color)

# Highlight fear/greed zones
ax2.axhline(25, color="red", linestyle="--", alpha=0.3, label="Extreme Fear (<25)")
ax2.axhline(75, color="green", linestyle="--", alpha=0.3, label="Extreme Greed (>75)")

# The labels above are only shown if a legend is drawn. Merge the entries of
# both axes into a single legend so the price line, the sentiment line and
# the two threshold lines are all identified.
price_handles, price_labels = ax1.get_legend_handles_labels()
sent_handles, sent_labels = ax2.get_legend_handles_labels()
ax2.legend(price_handles + sent_handles, price_labels + sent_labels, loc="upper left")

plt.title(f"{asset} Price vs Market Sentiment Over Time")
fig.tight_layout()
plt.show()

### Correlations: returns, volatility, sentiment
Build a small correlation matrix (returns/volatility + Fear & Greed) to see what tends to move together in this sample.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
assets = tickers[:2]

# Columns for the correlation matrix: each asset's return and 30-day
# volatility (when present in the data), followed by the sentiment index.
cols = [
    name
    for t in assets
    for name in (f"{t}_Return", f"{t}_Vol30")
    if name in df.columns
]
cols.append("FG_Value")

corr_matrix = df[cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt=".2f", ax=ax)
asset_label = ", ".join(assets) if assets else "Assets"
ax.set_title(f"Correlation Matrix: Returns, Volatility & Sentiment ({asset_label})")
plt.show()

### Sentiment vs returns (scatter)
A quick scatter + trendline to see whether days with higher sentiment tend to have higher daily returns (uses the first detected asset).

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like '<TICKER>_Return').")
asset = tickers[0]
ret_col = f"{asset}_Return"

# Scatter of daily return against the sentiment index, with a fitted
# regression line drawn in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x="FG_Value",
    y=ret_col,
    scatter_kws=dict(alpha=0.5),
    line_kws=dict(color="red"),
    ax=ax,
)
ax.set_title(f"Does Sentiment Drive Returns? ({asset})")
ax.set_xlabel("Fear & Greed Index")
ax.set_ylabel(f"Daily {asset} Return")
plt.show()

### Returns by sentiment regime (boxplot)
Compare daily return distributions across sentiment buckets (Extreme Fear → Extreme Greed) for the first detected asset.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like '<TICKER>_Return').")
asset = tickers[0]
ret_col = f"{asset}_Return"

# Fixed left-to-right ordering of the sentiment buckets on the x-axis.
regime_order = ["Extreme Fear", "Fear", "Neutral", "Greed", "Extreme Greed"]

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="value_classification", y=ret_col, order=regime_order, ax=ax)
ax.set_title(f"{asset} Daily Returns by Sentiment Regime")
ax.set_xlabel("Sentiment Regime")
ax.set_ylabel(f"{asset} Daily Return")
plt.xticks(rotation=15)  # slight tilt keeps the two-word labels from colliding
plt.show()

### Rolling volatility vs sentiment
Overlay rolling volatility with Fear & Greed to see whether volatility spikes tend to cluster around fear/greed shifts.

In [None]:
# Work on a copy because a derived column is added below.
df = ensure_df().copy()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like '<TICKER>_Return').")
asset = tickers[0]
ret_col = f"{asset}_Return"

# 30-row rolling standard deviation of the asset's returns.
vol_col = f"{asset}_RollVol30"
df[vol_col] = df[ret_col].rolling(window=30).std()

# Volatility on the primary axis, sentiment on a secondary y-axis.
ax1 = df[vol_col].plot(figsize=(12, 6), label=f"{asset} 30D Volatility", color="purple")
ax2 = df["FG_Value"].plot(secondary_y=True, label="Fear & Greed", color="orange", alpha=0.6)

ax1.set_title(f"{asset} Rolling Volatility vs Market Sentiment")
ax1.set_ylabel("30D Volatility")
ax1.legend(loc="upper left")
ax2.set_ylabel("Fear & Greed Index")
ax2.legend(loc="upper right")
plt.show()

## Lead/lag checks
A quick look at how sentiment correlates with returns across a range of lags (negative = returns lead, positive = sentiment leads).

### Lagged sentiment features
Create 1-day and 3-day lag versions of the Fear & Greed Index for quick correlation checks.

In [None]:
# Copy first: lag columns are added to the frame below.
df = ensure_df().copy()

# New column name -> number of rows the sentiment index is shifted by.
lag_days = {"FG_lag1": 1, "FG_lag3": 3}
for lag_name, n_rows in lag_days.items():
    df[lag_name] = df["FG_Value"].shift(n_rows)

# Drop initial NaNs created by lagging
df_lag = df.dropna(subset=list(lag_days))
df_lag[["FG_Value", *lag_days]].head()

### Lagged correlation (sentiment vs returns)
Compute sentiment/return correlations across a window of leads and lags to see where the relationship is strongest.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like '<TICKER>_Return').")
asset = tickers[0]
ret_col = f"{asset}_Return"

sentiment = df["FG_Value"]
returns = df[ret_col]

# Lag convention (matches the x-axis label and the FG_lag1/FG_lag3 features):
#   lag > 0  -> sentiment from `lag` rows earlier is paired with the current
#               return, i.e. sentiment leads;
#   lag < 0  -> sentiment from later rows is paired with the current return,
#               i.e. returns lead.
# Shifting the *returns* by `lag` instead would pair current sentiment with
# past returns for positive lags, which is the opposite of what the plot
# label states.
lags = range(-30, 31)  # 30 rows of lead/lag on each side
lag_corr = [sentiment.shift(lag).corr(returns) for lag in lags]

plt.figure(figsize=(12, 5))
plt.plot(lags, lag_corr, marker="o")
plt.axvline(0, color="black", linestyle="--")  # marks the contemporaneous correlation
plt.title(f"Lagged Correlation: Sentiment vs {asset} Returns")
plt.xlabel("Lag (days) — positive means sentiment leads")
plt.ylabel("Correlation")
plt.grid(True)
plt.show()

### Best lag (quick readout)
Print the lag with the strongest correlation (for the selected asset) as a simple summary.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
if not tickers:
    raise ValueError("No assets detected (expected columns like '<TICKER>_Return').")
asset = tickers[0]

# NOTE: `lag_corr` and `lags` come from the lagged-correlation cell, which
# must have been run before this one.
lag_corr_series = pd.Series(lag_corr, index=lags)

# Lags whose correlation could not be computed are NaN; ignore them.
valid_corr = lag_corr_series.dropna()
if valid_corr.empty:
    raise ValueError("No valid lagged correlations available (all values are NaN).")

# "Strongest" means largest magnitude: a strongly negative correlation is as
# informative as a strongly positive one, so rank by absolute value and then
# report the signed correlation at that lag.
best_lag = valid_corr.abs().idxmax()
best_corr = valid_corr.loc[best_lag]

print(f"Best lag for {asset}: {best_lag} days")
print(f"Correlation at best lag: {best_corr:.3f}")

### Cross-asset check (if multiple assets exist)
If at least two assets are present, compute the return correlation between the first two detected tickers (in alphabetical order) to see whether they move together over this period.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
assets = tickers[:2]  # only the first two detected assets are compared

if len(assets) >= 2:
    a, b = assets
    returns_a = df[f"{a}_Return"]
    returns_b = df[f"{b}_Return"]
    # Correlation between the two assets' return series.
    corr = returns_a.corr(returns_b)
    print(f"Return Correlation ({a} vs {b}): {corr:.3f}")
else:
    print("Only one asset detected; skipping cross-asset return correlation.")

### Sentiment vs returns (multi-asset view, if available)
If the dataset contains more than one asset, compare how Fear & Greed relates to returns across the first two detected tickers.

In [None]:
df = ensure_df()
tickers = detect_tickers(df)
assets = tickers[:2]

# Return columns that exist for the selected assets.
corr_targets = []
for ticker in assets:
    column = f"{ticker}_Return"
    if column in df.columns:
        corr_targets.append(column)

# Correlation of the sentiment index with each available return series.
corr = df[["FG_Value", *corr_targets]].corr()

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt=".2f", ax=ax)
asset_label = ", ".join(assets) if assets else "Assets"
ax.set_title(f"Correlation: Sentiment vs Returns ({asset_label})")
plt.show()

### Notes on interpretation
- Correlations can change a lot by market regime and time period.
- A peak at a positive lag suggests sentiment may lead returns in this sample; a peak near zero/negative suggests it’s mostly contemporaneous or reactive.
- Cross-asset return correlation helps separate market-wide moves from coin-specific behavior.

## Final summary chart
A compact heatmap comparing sentiment vs returns correlations (current and lagged) for the detected assets.

In [None]:
# Copy first: lagged sentiment columns are added to the frame below.
df = ensure_df().copy()
tickers = detect_tickers(df)
assets = tickers[:2]

# Sentiment shifted by one and three rows.
df["FG_lag1"] = df["FG_Value"].shift(1)
df["FG_lag3"] = df["FG_Value"].shift(3)

sent_cols = ["FG_Value", "FG_lag1", "FG_lag3"]
corr_targets = []
for ticker in assets:
    column = f"{ticker}_Return"
    if column in df.columns:
        corr_targets.append(column)

if not corr_targets:
    raise ValueError("No return columns found (expected columns like '<TICKER>_Return').")

# Full correlation matrix, then keep only the returns-by-sentiment block:
# rows are the return series, columns the current and lagged sentiment.
full_corr = df[corr_targets + sent_cols].corr()
corr_table = full_corr.loc[corr_targets, sent_cols]

fig, ax = plt.subplots(figsize=(7, 2.6))
heatmap_style = dict(
    annot=True,
    fmt=".3f",
    cmap="coolwarm",
    center=0,
    cbar=True,
    linewidths=0.5,
    linecolor="white",
)
sns.heatmap(corr_table, ax=ax, **heatmap_style)
asset_label = ", ".join(assets) if assets else "Assets"
ax.set_title(f"Sentiment vs Returns (Current and Lagged) — {asset_label}")
# Blank axis labels: the tick labels already name the columns.
ax.set_ylabel("")
ax.set_xlabel("")
fig.tight_layout()
plt.show()