# Market Data Collection: BTC & NIFTY 50

This notebook collects historical daily price data for:
- Bitcoin (BTC-USD)
- NIFTY 50 Index (^NSEI)

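The data will be used for downstream sentiment alignment
and correlation analysis. Alongside the prices, the notebook also gathers
the inputs for that analysis: news headlines (GDELT, Yahoo Finance),
Google Trends interest, a Fear & Greed index, and India VIX.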

In [None]:
# %% Imports
import os
import time
import warnings
from datetime import datetime, timedelta

import pandas as pd
import requests
import yfinance as yf
from pytrends.request import TrendReq

In [None]:
# %% Global Config
# Folder that receives every raw CSV written by this notebook.
DATA_RAW = "../data/raw"

# Download windows as ISO date strings: one shared end date,
# and a separate start date per market.
END_DATE = "2025-12-31"
NIFTY_START = "2010-01-01"
BTC_START = "2015-01-01"

In [None]:
# %% Price Data
def fetch_price_data(ticker, start, end):
    """Download price bars for one ticker and append a simple return column.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download`` (e.g. ``"BTC-USD"``).
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, open, high, low, close, volume, return``, where
        ``return`` is the percentage change of ``close`` versus the previous
        row. Rows containing any missing value are dropped, which always
        removes the first row because its return is undefined.

    Raises
    ------
    ValueError
        If the download is empty or an expected column is missing.
    """
    raw = yf.download(ticker, start=start, end=end, progress=False)
    if raw is None or raw.empty:
        raise ValueError(
            f"No price data returned for {ticker!r} between {start} and {end}"
        )

    # NOTE(review): some yfinance versions appear to return two-level columns
    # (field, ticker) even for a single ticker. Keep the level that holds the
    # field names so the selection below works for both layouts.
    if isinstance(raw.columns, pd.MultiIndex):
        for level in range(raw.columns.nlevels):
            level_names = raw.columns.get_level_values(level)
            if "Close" in level_names:
                raw.columns = level_names
                break

    raw = raw.reset_index()

    wanted = ["Date", "Open", "High", "Low", "Close", "Volume"]
    missing = [name for name in wanted if name not in raw.columns]
    if missing:
        raise ValueError(
            f"Price data for {ticker!r} is missing expected columns: {missing}"
        )

    # Explicit copy so that adding the return column never writes into a view.
    prices = raw[wanted].copy()
    prices.columns = ["date", "open", "high", "low", "close", "volume"]
    prices["return"] = prices["close"].pct_change()
    return prices.dropna()

# Pull both price histories using the windows defined in the config cell.
btc_prices = fetch_price_data("BTC-USD", BTC_START, END_DATE)
nifty_prices = fetch_price_data("^NSEI", NIFTY_START, END_DATE)

# A fresh checkout may not contain the output folder yet, and to_csv does not
# create missing directories.
os.makedirs(DATA_RAW, exist_ok=True)
btc_prices.to_csv(f"{DATA_RAW}/btc_prices.csv", index=False)
nifty_prices.to_csv(f"{DATA_RAW}/nifty_prices.csv", index=False)

btc_prices.head(), nifty_prices.head()

In [None]:
# %% GDELT News Fetcher
def fetch_gdelt(query, start_date, end_date, window_days=30, max_records=250,
                pause=1.5, timeout=20):
    """Collect article titles from the GDELT DOC API, one time window at a time.

    The range ``[start_date, end_date)`` is walked in consecutive windows of
    ``window_days`` days (the last window is clipped to ``end_date``), and one
    request is sent per window.

    Parameters
    ----------
    query : str
        Search expression sent to GDELT. It is also stored unchanged in the
        ``asset`` column of every returned row.
    start_date, end_date : datetime.datetime
        Bounds of the overall period to cover.
    window_days : int, default 30
        Length of each request window in days; must be positive.
    max_records : int, default 250
        Value sent as ``maxrecords`` with each request.
    pause : float, default 1.5
        Seconds to sleep before each request.
    timeout : float, default 20
        Per-request timeout in seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp, text, source, asset``. The frame is empty, but
        still has these columns, when nothing could be fetched.

    Raises
    ------
    ValueError
        If ``window_days`` is not positive.

    Notes
    -----
    A failed window emits a warning and is skipped, so one bad request does
    not abort a long collection run.
    """
    if window_days <= 0:
        raise ValueError("window_days must be a positive number of days")

    columns = ["timestamp", "text", "source", "asset"]
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    stamp_format = "%Y%m%d%H%M%S"  # layout GDELT expects for start/enddatetime

    # GDELT requires OR-lists to be wrapped in parentheses. Only wrap a bare
    # list; a query that already uses parentheses is passed through untouched.
    search = query
    if " OR " in query and "(" not in query:
        search = f"({query})"

    records = []
    current = start_date
    while current < end_date:
        window_end = min(current + timedelta(days=window_days), end_date)
        # NOTE(review): the earlier separate "theme" request parameter is not
        # sent any more; theme filters belong inside the query string
        # (e.g. "theme:..."). Confirm against the GDELT DOC API reference.
        params = {
            "query": search,
            "mode": "artlist",
            "format": "json",
            "maxrecords": max_records,
            "startdatetime": current.strftime(stamp_format),
            "enddatetime": window_end.strftime(stamp_format),
        }

        time.sleep(pause)
        articles = []
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
            if isinstance(payload, dict):
                articles = payload.get("articles", [])
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a body that is not valid JSON.
            warnings.warn(
                f"GDELT request for {params['startdatetime']}-"
                f"{params['enddatetime']} failed: {exc!r}"
            )

        records.extend(
            {
                "timestamp": article.get("seendate"),
                "text": article.get("title", ""),
                "source": "gdelt",
                "asset": query,
            }
            for article in articles
        )
        current = window_end

    return pd.DataFrame(records, columns=columns)

# BTC headlines from GDELT for 2020-01-01 through 2025-12-31.
btc_gdelt = fetch_gdelt(
    query="bitcoin OR cryptocurrency",
    start_date=datetime(2020, 1, 1),
    end_date=datetime(2025, 12, 31),
)
# TODO: the NIFTY GDELT pull is currently switched off. Its intended call was
# query "NIFTY OR Sensex OR Indian stock market", from 2023-01-01 to 2025-12-31.

btc_gdelt.head()

In [None]:
def fetch_yahoo_news(ticker, asset):
    """Return the headlines Yahoo Finance currently lists for ``ticker``.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.
    asset : str
        Label written to the ``asset`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp, text, source, asset``; ``timestamp`` is a
        timezone-naive UTC value. Items without a usable timestamp are
        skipped and rows with any missing value are dropped. The frame is
        empty, but keeps these columns, when there is no news.
    """
    columns = ["timestamp", "text", "source", "asset"]
    items = yf.Ticker(ticker).news or []
    records = []

    for item in items:
        # NOTE(review): newer yfinance releases appear to nest each article
        # under a "content" dict (title, pubDate, ...) instead of flat keys.
        # Both layouts are read here; confirm the key names against the
        # installed yfinance version.
        nested = item.get("content")
        if not isinstance(nested, dict):
            nested = {}

        raw_ts = (
            item.get("providerPublishTime")
            or item.get("time_published")
            or nested.get("pubDate")
            or nested.get("displayTime")
        )
        if not raw_ts:
            continue

        # Epoch seconds may arrive as a digit-only string.
        if isinstance(raw_ts, str) and raw_ts.isdigit():
            raw_ts = int(raw_ts)

        if isinstance(raw_ts, (int, float)):
            stamp = pd.to_datetime(raw_ts, unit="s", utc=True, errors="coerce")
        else:
            stamp = pd.to_datetime(raw_ts, utc=True, errors="coerce")
        if pd.isna(stamp):
            continue

        title = item["title"] if "title" in item else nested.get("title", "")
        records.append({
            # Drop the tz marker so both layouts yield the same naive-UTC dtype.
            "timestamp": stamp.tz_convert(None),
            "text": title,
            "source": "yahoo",
            "asset": asset,
        })

    return pd.DataFrame(records, columns=columns).dropna()

In [None]:
# %% Google Trends
# One shared Google Trends client; fetch_trends() reads this module-level name.
# NOTE(review): tz=360 is presumably a timezone offset in minutes - confirm
# against the pytrends documentation.
pytrends = TrendReq(
    hl="en-US",
    tz=360,
)

def fetch_trends(keyword):
    """Fetch the full-history Google Trends series for one keyword.

    Uses the module-level ``pytrends`` client with ``timeframe="all"``.

    Parameters
    ----------
    keyword : str
        Search term to query.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``trend``. The frame is empty, but keeps these
        columns, when Trends returns no data for the keyword.
    """
    pytrends.build_payload([keyword], timeframe="all")
    interest = pytrends.interest_over_time()

    # With no data the response lacks the keyword column, which would
    # otherwise surface as a KeyError in the selection below.
    if interest is None or interest.empty or keyword not in interest.columns:
        return pd.DataFrame(columns=["date", "trend"])

    trend = interest.reset_index()[["date", keyword]].copy()
    trend.columns = ["date", "trend"]
    return trend

# Search-interest series for each market, fetched in the same order as before.
btc_trends = fetch_trends(keyword="Bitcoin")
nifty_trends = fetch_trends(keyword="NIFTY 50")

In [None]:
# %% Fear & Greed Index (alternative.me)
# The result is named btc_fear_greed because that is the name the save cell
# writes to btc_fear_greed.csv.
fear_greed_url = "https://api.alternative.me/fng/?limit=0&format=json"

fear_greed_response = requests.get(fear_greed_url, timeout=20)
fear_greed_response.raise_for_status()  # fail loudly on an HTTP error
fear_greed_raw = pd.DataFrame(fear_greed_response.json()["data"])

btc_fear_greed = pd.DataFrame({
    # Make the epoch numeric before the unit-based conversion, whether the
    # API sends it as a string or as a number.
    "date": pd.to_datetime(fear_greed_raw["timestamp"].astype("int64"), unit="s"),
    "fear_greed": fear_greed_raw["value"].astype(int),
})

In [None]:
# %% India VIX
# India VIX over the same window as NIFTY; only the closing level is kept.
vix = (
    fetch_price_data("^INDIAVIX", NIFTY_START, END_DATE)
    .loc[:, ["date", "close"]]
    .rename(columns={"close": "india_vix"})
)

vix.head()

In [None]:
# %% Twitter API (to be enabled later)
def fetch_twitter_placeholder():
    """Return an empty frame with the columns timestamp, text, source, asset.

    Stands in for the Twitter source until the API is enabled.
    """
    schema = ("timestamp", "text", "source", "asset")
    return pd.DataFrame(columns=list(schema))

# Two independent empty frames, one per asset.
btc_twitter, nifty_twitter = (
    fetch_twitter_placeholder(),
    fetch_twitter_placeholder(),
)

In [None]:
# %% Combine Text Sources
# No earlier cell calls fetch_yahoo_news, so the Yahoo frames are built here;
# without this the concat below fails with NameError on a fresh kernel.
# NOTE(review): the "BTC" / "NIFTY" asset labels are an assumption - confirm
# the labels that downstream code expects.
btc_yahoo = fetch_yahoo_news("BTC-USD", "BTC")
nifty_yahoo = fetch_yahoo_news("^NSEI", "NIFTY")

# nifty_gdelt is left out on purpose: its fetch is disabled in the GDELT cell,
# so the name does not exist. Add it back here once that fetch is enabled.
text_sources = [
    btc_gdelt,
    btc_yahoo, nifty_yahoo,
    btc_twitter, nifty_twitter,
]
text_df = pd.concat(text_sources, ignore_index=True)

# utc=True lets timezone-aware and timezone-naive timestamps from different
# sources be parsed into a single consistent column.
text_df["timestamp"] = pd.to_datetime(
    text_df["timestamp"], errors="coerce", utc=True
)
text_df = text_df.dropna(subset=["timestamp"])

text_df.to_csv(f"{DATA_RAW}/text_data.csv", index=False)
text_df.head()

In [None]:
# Write the auxiliary series to the raw-data folder, one CSV per frame.
raw_exports = [
    (btc_trends, f"{DATA_RAW}/btc_trends.csv"),
    (nifty_trends, f"{DATA_RAW}/nifty_trends.csv"),
    (btc_fear_greed, f"{DATA_RAW}/btc_fear_greed.csv"),
    (vix, f"{DATA_RAW}/india_vix.csv"),
]
for frame, csv_path in raw_exports:
    frame.to_csv(csv_path, index=False)

## Summary

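- BTC and NIFTY 50 historical price data are downloaded and saved as CSV
- News headlines, Google Trends, Fear & Greed and India VIX data are collected alongside the prices
- Data is stored in `../data/raw/` (relative to this notebook)
- Only minimal processing is applied at this stage: columns are renamed, a daily `return` column is added, and rows with missing values are dropped
- Next step: text data preprocessing and sentiment alignment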