# Text Data Collection

This notebook collects financial news articles and social media posts
relevant to BTC and NIFTY 50 during the study period.
The resulting dataset serves as input for sentiment analysis.


In [1]:
import pandas as pd
import requests
import yfinance as yf
from datetime import datetime, timedelta

In [2]:
import os
# Create the raw-data output folder up front; an already existing folder is fine.
os.makedirs(name="data/raw", exist_ok=True)

In [None]:
# Study-period bounds as ISO "YYYY-MM-DD" strings.
START_DATE, END_DATE = "2023-01-01", "2024-12-31"

In [5]:
def fetch_gdelt_news(keyword, start_date, end_date, asset, max_records=1000):
    """Collect article headlines from the GDELT DOC API, one calendar day at a time.

    Each request asks for up to 250 articles seen on a single day. Days are
    walked from ``start_date`` to ``end_date`` (both inclusive) and the walk
    stops early once ``max_records`` rows have been gathered.

    Args:
        keyword: GDELT query string; it is percent-encoded before being sent.
        start_date: First day to query, formatted ``YYYY-MM-DD``.
        end_date: Last day to query, formatted ``YYYY-MM-DD``.
        asset: Label stored in the ``asset`` column of every returned row.
        max_records: Upper bound on the number of rows returned.

    Returns:
        A DataFrame with the columns ``timestamp``, ``text``, ``source``,
        ``asset`` and ``channel``. The columns are present even when no
        article was found, so callers can index them unconditionally.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYY-MM-DD``.
    """
    from urllib.parse import quote  # local import: keeps this cell self-contained

    columns = ["timestamp", "text", "source", "asset", "channel"]
    records = []

    curr = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    # Escape the query once so characters such as '&', '#' or quotes in the
    # keyword cannot corrupt the rest of the query string.
    encoded_query = quote(keyword, safe="")

    while curr <= end and len(records) < max_records:
        date_str = curr.strftime("%Y%m%d")
        # Advance first so every `continue` below still moves to the next day.
        curr += timedelta(days=1)

        url = (
            "https://api.gdeltproject.org/api/v2/doc/doc"
            f"?query={encoded_query}&mode=artlist"
            f"&maxrecords=250"
            f"&format=json"
            f"&startdatetime={date_str}000000"
            f"&enddatetime={date_str}235959"
        )

        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            # Best effort: report the failed day and keep going instead of
            # silently dropping it (a non-JSON body surfaces as ValueError).
            print(f"GDELT fetch failed for {date_str}: {exc}")
            continue

        articles = data.get("articles", []) if isinstance(data, dict) else []
        for a in articles:
            # Skip only the article with a missing/unparseable date rather
            # than losing the remainder of the day's results.
            ts = pd.to_datetime(a.get("seendate"), errors="coerce")
            if pd.isna(ts):
                continue
            records.append({
                "timestamp": ts,
                "text": a.get("title", ""),
                # NOTE(review): artlist entries appear to carry the outlet
                # under "domain" rather than "source" - confirm against a
                # live response.
                "source": a.get("source") or a.get("domain", ""),
                "asset": asset,
                "channel": "gdelt",
            })

    # The last day fetched may overshoot the cap; trim to honour max_records.
    return pd.DataFrame(records[:max_records], columns=columns)

In [None]:
# GDELT's query syntax requires OR'd terms to be wrapped in parentheses and
# multi-word phrases to be double-quoted; without that the API rejects the
# query instead of returning articles.
# NOTE(review): START_DATE / END_DATE are defined earlier in this notebook but
# are not used here, and both windows below start before START_DATE - confirm
# that the wider window is intentional.
btc_gdelt = fetch_gdelt_news(
    keyword="(bitcoin OR cryptocurrency)",
    start_date="2016-01-01",
    end_date="2024-12-31",
    asset="BTC",
    max_records=3000
)

nifty_gdelt = fetch_gdelt_news(
    keyword='(NIFTY OR "Indian stock market")',
    start_date="2015-01-01",
    end_date="2024-12-31",
    asset="NIFTY",
    max_records=3000
)

btc_gdelt.head(), nifty_gdelt.head()

In [None]:
# Stack both assets, strip timezone info from the timestamps, derive a plain
# calendar-date column, then drop rows repeating the same headline for the
# same asset at the same instant.
gdelt_df = (
    pd.concat([btc_gdelt, nifty_gdelt], ignore_index=True)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]).dt.tz_localize(None))
    .assign(date=lambda frame: frame["timestamp"].dt.date)
    .drop_duplicates(subset=["timestamp", "text", "asset"])
)
gdelt_df.head(), len(gdelt_df)

In [None]:
# Persist the cleaned GDELT headlines (without the index) and report the row count.
gdelt_df.to_csv(path_or_buf="data/raw/gdelt_data.csv", index=False)
print(f"Saved {len(gdelt_df)} total gdelt records")

In [None]:
def fetch_yahoo_news(ticker, asset):
    """Collect the news headlines Yahoo Finance exposes for ``ticker``.

    This is a best-effort fetch: any failure while retrieving the news list
    is printed and yields an empty result, and individual items that lack the
    expected fields are skipped instead of aborting the remaining items.

    Args:
        ticker: Yahoo Finance symbol passed to ``yf.Ticker``.
        asset: Label stored in the ``asset`` column of every returned row.

    Returns:
        A DataFrame with the columns ``timestamp``, ``text``, ``source``,
        ``asset`` and ``channel``. The columns are present even when nothing
        was fetched, so callers can index them unconditionally.
    """
    columns = ["timestamp", "text", "source", "asset", "channel"]
    records = []

    try:
        # `or []` guards against the attribute coming back empty/None.
        news_items = yf.Ticker(ticker).news or []
    except Exception as e:
        print(f"Yahoo news fetch failed for {ticker}: {e}")
        news_items = []

    skipped = 0
    for n in news_items:
        try:
            records.append({
                # providerPublishTime is read as epoch seconds.
                "timestamp": pd.to_datetime(n["providerPublishTime"], unit="s"),
                "text": n["title"],
                "source": n["publisher"],
                "asset": asset,
                "channel": "yahoo_finance",
            })
        except (KeyError, TypeError, ValueError):
            # One malformed item should not discard the rest of the feed.
            skipped += 1

    if skipped:
        print(f"Yahoo news fetch for {ticker}: skipped {skipped} malformed item(s)")

    return pd.DataFrame(records, columns=columns)


In [None]:
# Fetch Yahoo Finance headlines for each asset (BTC first, then NIFTY).
btc_yahoo, nifty_yahoo = [
    fetch_yahoo_news(symbol, label)
    for symbol, label in (("BTC-USD", "BTC"), ("^NSEI", "NIFTY"))
]

# Stack both assets, strip timezone info and derive a plain calendar-date column.
yahoo_df = (
    pd.concat([btc_yahoo, nifty_yahoo], ignore_index=True)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]).dt.tz_localize(None))
    .assign(date=lambda frame: frame["timestamp"].dt.date)
)

yahoo_df.head(), len(yahoo_df)

In [None]:
# Merge both news channels and discard rows that have no headline text.
news_df = pd.concat([gdelt_df, yahoo_df], ignore_index=True).dropna(subset=["text"])

# Write the combined news table (without the index) and report its size.
news_df.to_csv(path_or_buf="data/raw/news_data.csv", index=False)
print(f"Total news records: {len(news_df)}")

In [None]:
# NOTE(review): `text_df` was never assigned anywhere in this notebook, so this
# cell raised NameError. The combined news table is the only text corpus built
# above, so it is used here - confirm whether further sources were meant to be
# merged in first.
text_df = news_df.copy()

# This path points one level above the working directory, which no earlier
# cell creates (only "data/raw" is), so make sure it exists before writing.
os.makedirs("../data/raw", exist_ok=True)
text_df.to_csv("../data/raw/text_data.csv", index=False)
print(f"Saved {len(text_df)} total text records")