# 01 — Data Collection (Raw Inputs)
This notebook downloads raw market data for the cryptocurrencies you choose and pulls the Crypto Fear & Greed Index. It writes CSVs to `data/raw/` so the next notebook (`02_data_cleaning.ipynb`) can merge and engineer features.

This notebook can be run in two ways:
- Run it directly in Jupyter/VS Code, or
- Run it headlessly from the dashboard (via papermill) with injected parameters.

### Setup & imports
Load the libraries needed to download price data and call the sentiment API.

In [None]:
import pandas as pd
import yfinance as yf
import requests
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from typing import Dict, List, Optional
import datetime

class CryptoDataCollector:
    """Gather daily price history and Fear & Greed sentiment for a set of assets.

    The symbols to download are kept on ``crypto_symbols`` (Yahoo Finance style
    tickers such as ``"BTC-USD"``).
    """

    def __init__(self, crypto_symbols: Optional[List[str]] = None):
        # A missing or empty selection falls back to BTC and ETH.
        if crypto_symbols:
            self.crypto_symbols = crypto_symbols
        else:
            self.crypto_symbols = ["BTC-USD", "ETH-USD"]

    @staticmethod
    def _flatten_yf_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` whose MultiIndex columns are reduced to level 0.

        Frames with ordinary (single-level) columns are copied unchanged.
        """
        flat = df.copy()
        columns = flat.columns
        if not isinstance(columns, pd.MultiIndex):
            return flat
        flat.columns = columns.get_level_values(0)
        return flat

    def download_price_data(self, start_date: str, end_date: str) -> Dict[str, pd.DataFrame]:
        """Fetch daily OHLCV from Yahoo Finance for every configured symbol.

        Returns a dict keyed by the part of each symbol before the first ``-``
        (``"BTC-USD"`` -> ``"BTC"``), mapping to the flattened price frame.
        """
        frames: Dict[str, pd.DataFrame] = {}
        for symbol in self.crypto_symbols:
            raw = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False, progress=False)
            base = symbol.partition("-")[0]
            frames[base] = self._flatten_yf_columns(raw)
        return frames

    def fetch_fear_greed_index(self, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pd.DataFrame:
        """Load the Fear & Greed Index from Alternative.me, optionally bounded by date.

        The result is indexed by ``timestamp`` (ascending) with ``value`` cast to
        int. ``start_date`` / ``end_date`` are inclusive bounds and are skipped
        when empty or ``None``.
        """
        response = requests.get("https://api.alternative.me/fng/?limit=0&format=json", timeout=20)
        response.raise_for_status()
        records = response.json()["data"]

        sentiment = pd.DataFrame(records)
        # The API delivers epoch seconds and scores as strings.
        sentiment["timestamp"] = pd.to_datetime(sentiment["timestamp"].astype(int), unit="s")
        sentiment["value"] = sentiment["value"].astype(int)
        sentiment = sentiment.set_index("timestamp").sort_index()

        if start_date:
            lower = pd.to_datetime(start_date)
            sentiment = sentiment[sentiment.index >= lower]
        if end_date:
            upper = pd.to_datetime(end_date)
            sentiment = sentiment[sentiment.index <= upper]

        return sentiment

## Download daily OHLCV for selected assets
Pull daily OHLCV (open, high, low, close, volume) from Yahoo Finance for the selected tickers over a shared date window.

In [None]:
# ============================================================================
# DATE RANGE + SELECTED CRYPTOS
# - If papermill injected `start_date`, `end_date`, and `selected_cryptos`, use them.
# - Otherwise (manual notebook run), pull values from the running Flask dashboard.
# - If the dashboard cannot be reached either, the helpers below fall back to
#   2015-01-01..today and to BTC-USD / ETH-USD.
# ============================================================================

# Local dashboard endpoints queried by the helper functions in this cell.
# The date-range response is read for `current_start` / `default_start` and
# `current_end` / `default_end`; the selection response for `selected_cryptos`.
API_DATE_RANGE = "http://127.0.0.1:5000/api/date-range"
API_SELECTED_CRYPTOS = "http://127.0.0.1:5000/api/selected-cryptos"

def get_dates_from_dashboard():
    """Return ``(start, end)`` date strings, preferring what the dashboard reports.

    For each bound the dashboard's ``current_*`` value wins, then ``default_*``,
    then the built-in fallback (``"2015-01-01"`` for the start, today's date for
    the end). Any failure while asking the dashboard is reported with a printed
    message and answered with the built-in fallbacks.
    """
    earliest = "2015-01-01"
    today = datetime.date.today().strftime("%Y-%m-%d")
    try:
        response = requests.get(API_DATE_RANGE, timeout=10)
        response.raise_for_status()
        info = response.json()
        start = info.get("current_start") or info.get("default_start") or earliest
        end = info.get("current_end") or info.get("default_end") or today
    except Exception as e:
        # Best effort: the notebook must still run when the dashboard is down.
        print(f"Dashboard not reachable, using fallback dates. Details: {e}")
        return earliest, today
    return start, end

def get_selected_cryptos_from_dashboard():
    """Return the asset symbols chosen in the dashboard, or BTC/ETH as a fallback.

    The dashboard's ``selected_cryptos`` value is used only when it is a
    non-empty list. Anything else, including any failure while asking the
    dashboard, yields ``["BTC-USD", "ETH-USD"]``.
    """
    try:
        response = requests.get(API_SELECTED_CRYPTOS, timeout=10)
        response.raise_for_status()
        chosen = response.json().get("selected_cryptos")
    except Exception as e:
        # Best effort: the notebook must still run when the dashboard is down.
        print(f"Dashboard not reachable, using fallback cryptos. Details: {e}")
        return ["BTC-USD", "ETH-USD"]

    if isinstance(chosen, list) and chosen:
        return chosen
    return ["BTC-USD", "ETH-USD"]

# Work out the run parameters. papermill injects them as notebook globals;
# anything missing or empty is requested from the dashboard helpers instead.
_has_dates = all(globals().get(_name) for _name in ("start_date", "end_date"))
if not _has_dates:
    start_date, end_date = get_dates_from_dashboard()

if not globals().get("selected_cryptos"):
    selected_cryptos = get_selected_cryptos_from_dashboard()

print(f"Using date range: {start_date} to {end_date}")
print(f"Selected assets: {selected_cryptos}")

# Download one price frame per selected symbol, keyed by ticker (e.g. "BTC").
collector = CryptoDataCollector(crypto_symbols=selected_cryptos)
crypto = collector.download_price_data(start_date, end_date)

# Shortcuts for the two default assets; each is None when that asset was not selected.
btc, eth = crypto.get("BTC"), crypto.get("ETH")

# Report the covered date span for every asset that returned rows.
for ticker, df_ in crypto.items():
    if df_ is not None and not df_.empty:
        print(f"Downloaded {ticker} data from {df_.index.min().date()} to {df_.index.max().date()} ({len(df_)} rows)")

# Cell output: the head of the first two assets (None fills any missing slot).
_preview = list(crypto.values())[:2]
tuple(_preview[i].head() if i < len(_preview) else None for i in range(2))

### Save raw price CSVs
Write one raw CSV per asset into `data/raw/` (e.g., `btc_prices.csv`, `sol_prices.csv`). The cleaning notebook reads these files to build the merged dataset.

In [None]:
from pathlib import Path

# Output directory for raw CSVs, relative to the notebook's working directory.
# Created on demand (including missing parents); an existing directory is fine.
RAW_DIR = Path("../data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Remove old per-asset price CSVs so 02 only processes the latest selected assets
# NOTE: this deletes every *_prices.csv in RAW_DIR before any new file is written.
for old in RAW_DIR.glob("*_prices.csv"):
    old.unlink()

# Keep the CSV header simple (Date + OHLCV) by collapsing multi-level columns
def flatten_yf_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with MultiIndex columns reduced to their first level.

    Frames whose columns are already single-level are copied unchanged; the
    input frame is never modified.
    """
    flat = df.copy()
    columns = flat.columns
    if not isinstance(columns, pd.MultiIndex):
        return flat
    flat.columns = columns.get_level_values(0)
    return flat

# Write <ticker>_prices.csv for every asset that returned rows (e.g., btc_prices.csv, sol_prices.csv)
for ticker, df_ in crypto.items():
    if df_ is not None and not df_.empty:
        out = flatten_yf_columns(df_)
        # Turn the index into an ordinary column so it is written with index=False.
        out = out.reset_index()
        path = RAW_DIR / f"{ticker.lower()}_prices.csv"
        out.to_csv(path, index=False)
        print(f"Saved {ticker} → {path} ({len(out)} rows)")
    else:
        # Nothing was downloaded for this asset; report it and write no file.
        print(f"Skipping {ticker}: empty")

## Fetch the Crypto Fear & Greed Index
This index tracks overall crypto market sentiment on a 0–100 scale (lower = fear, higher = greed). We fetch the full daily history from the Alternative.me API.

In [None]:
# Get all available history from the Alternative.me API.
# This goes through the `collector` created in the download cell rather than
# repeating the request and parsing inline, so the fetch logic lives in one place.
# With no dates passed, fetch_fear_greed_index() applies no filtering and returns
# a DataFrame indexed by `timestamp` (ascending) with `value` cast to int.
fg = collector.fetch_fear_greed_index()

fg.head()

### Save sentiment history
Persist the Fear & Greed history to `data/raw/fear_greed_index.csv` so downstream steps are reproducible.

In [None]:
# Persist the sentiment frame (timestamp index included) for the cleaning step.
fg.to_csv("../data/raw/fear_greed_index.csv")
# Cell output: (rows, columns) as a quick sanity check.
fg.shape

### Optional: trimmed sentiment copy
Create a trimmed version of the Fear & Greed series that matches the overlapping price window. This is mainly for convenience/inspection; the cleaning step reads `fear_greed_index.csv`.

In [None]:
# Trim sentiment to the same shared date range as the downloaded price data.
# Only assets that actually returned rows define the window. Checking the
# filtered list (not just `len(crypto)`) also covers the case where every
# download came back empty, which would otherwise fail inside max()/min().
_price_frames = [df_ for df_ in crypto.values() if df_ is not None and not df_.empty]
if not _price_frames:
    raise ValueError("No price data was downloaded. Check selected assets and date range.")

starts = [frame.index.min() for frame in _price_frames]
ends = [frame.index.max() for frame in _price_frames]

# Shared window = latest first date .. earliest last date across assets.
# NOTE: this rebinds `start_date` / `end_date` (now Timestamps) and `fg` (now the
# trimmed frame); re-running earlier cells afterwards will see these values.
start_date = max(starts)
end_date = min(ends)

fg = fg.loc[start_date:end_date]
fg.to_csv("../data/raw/fear_greed_index_trimmed.csv")

print(f"Trimmed Fear & Greed Index from {start_date.date()} to {end_date.date()}")
print(f"Total sentiment records: {fg.shape[0]}")
fg.shape