# Time Alignment: Sentiment and Market Data

This notebook aligns sentiment scores extracted from financial text
with corresponding market price data for BTC and NIFTY 50.
Lagged sentiment features are constructed to enable lead–lag analysis
in downstream correlation and statistical evaluation.


In [17]:
import pandas as pd
import numpy as np

In [18]:
btc_prices = pd.read_csv("../data/raw/btc_prices.csv", parse_dates=["date"])
nifty_prices = pd.read_csv("../data/raw/nifty_prices.csv", parse_dates=["date"])

btc_prices.sort_values("date", inplace=True)
nifty_prices.sort_values("date", inplace=True)

btc_prices.head()
nifty_prices.head()

Unnamed: 0,date,open,high,low,close,volume
0,2023-01-02 00:00:00+05:30,18131.699219,18215.150391,18086.5,18197.449219,256100
1,2023-01-03 00:00:00+05:30,18163.199219,18251.949219,18149.800781,18232.550781,208700
2,2023-01-04 00:00:00+05:30,18230.650391,18243.0,18020.599609,18042.949219,235200
3,2023-01-05 00:00:00+05:30,18101.949219,18120.300781,17892.599609,17992.150391,269900
4,2023-01-06 00:00:00+05:30,18008.050781,18047.400391,17795.550781,17859.449219,238200


In [19]:
btc_prices["return"] = np.log(btc_prices["close"]).diff()
nifty_prices["return"] = np.log(nifty_prices["close"]).diff()

btc_prices.dropna(inplace=True)
nifty_prices.dropna(inplace=True)

In [20]:
sentiment_df = pd.read_csv(
    "../data/processed/text_with_sentiment.csv",
    parse_dates=["timestamp"]
)
sentiment_df.head()

Unnamed: 0,timestamp,text,source,asset,channel,clean_text,finbert_score,vader_score
0,2024-10-03 01:15:00+00:00,Momentum Funds : Momentum funds with 4x rise i...,,NIFTY,news_gdelt,momentum funds momentum funds with x rise in a...,0.693322,0.1779
1,2024-10-03 03:00:00+00:00,Indian stock market : 10 key things that chang...,,NIFTY,news_gdelt,indian stock market key things that changed fo...,0.856076,0.1779
2,2024-10-03 03:00:00+00:00,"Nifty 50 , Sensex today : What to expect from ...",,NIFTY,news_gdelt,nifty sensex today what to expect from indian ...,0.852286,-0.296
3,2024-10-03 06:00:00+00:00,Bitcoin Price Decline Forces $450M in Long Liq...,,BTC,news_gdelt,bitcoin price decline forces m in long liquida...,0.010625,0.0
4,2024-10-03 07:00:00+00:00,Stock Market : शेयर बाजार में बड़ी गिरावट ... ...,,NIFTY,news_gdelt,stock market,0.895819,0.0


In [26]:
daily_sentiment = (
    sentiment_df
    .groupby([sentiment_df["timestamp"].dt.date, "asset"])
    .agg({
        "finbert_score": "mean",
        "vader_score": "mean"
    })
    .reset_index()
)

daily_sentiment.rename(columns={"timestamp": "date"}, inplace=True)
daily_sentiment["date"] = pd.to_datetime(daily_sentiment["date"])
daily_sentiment["date"] = daily_sentiment["date"].dt.tz_localize(None)

daily_sentiment.head()

Unnamed: 0,date,asset,finbert_score,vader_score
0,2024-10-03,BTC,0.639643,0.220693
1,2024-10-03,NIFTY,0.397322,0.00398
2,2024-10-04,BTC,0.482686,-0.075309
3,2024-10-04,NIFTY,0.277918,-0.022869
4,2024-10-05,BTC,0.666548,0.220475


In [27]:
# Force BOTH sides to timezone-naive datetime
btc_prices["date"] = pd.to_datetime(btc_prices["date"]).dt.tz_localize(None)
daily_sentiment["date"] = pd.to_datetime(daily_sentiment["date"]).dt.tz_localize(None)

In [28]:
btc_merged = pd.merge(
    btc_prices,
    daily_sentiment[daily_sentiment["asset"] == "BTC"],
    on="date",
    how="inner"
)

btc_merged.head()

Unnamed: 0,date,close,volume,return,asset,finbert_score,vader_score
0,2024-10-03,60759.402344,36106447279,0.002086,BTC,0.639643,0.220693
1,2024-10-04,62067.476562,29585472513,0.0213,BTC,0.482686,-0.075309
2,2024-10-05,62089.949219,13305410749,0.000362,BTC,0.666548,0.220475
3,2024-10-06,62818.953125,14776233667,0.011673,BTC,0.768489,0.083333
4,2024-10-07,62236.660156,34253562610,-0.009313,BTC,0.610018,-0.254433


In [29]:
for lag in [1, 2, 3, 5]:
    btc_merged[f"finbert_lag_{lag}"] = btc_merged["finbert_score"].shift(lag)
    btc_merged[f"vader_lag_{lag}"] = btc_merged["vader_score"].shift(lag)

btc_merged.dropna(inplace=True)
btc_merged.head()

Unnamed: 0,date,close,volume,return,asset,finbert_score,vader_score,finbert_lag_1,vader_lag_1,finbert_lag_2,vader_lag_2,finbert_lag_3,vader_lag_3,finbert_lag_5,vader_lag_5
5,2024-10-08,62131.96875,28134475157,-0.001684,BTC,0.380054,0.039746,0.610018,-0.254433,0.768489,0.083333,0.666548,0.220475,0.639643,0.220693
6,2024-10-09,60582.101562,27670982363,-0.025261,BTC,0.263143,0.001271,0.380054,0.039746,0.610018,-0.254433,0.768489,0.083333,0.482686,-0.075309
7,2024-10-10,60274.5,30452813570,-0.00509,BTC,0.414146,-0.015659,0.263143,0.001271,0.380054,0.039746,0.610018,-0.254433,0.666548,0.220475
8,2024-10-11,62445.089844,30327141594,0.035378,BTC,0.178457,0.193143,0.414146,-0.015659,0.263143,0.001271,0.380054,0.039746,0.768489,0.083333
9,2024-10-12,63193.023438,16744110886,0.011906,BTC,0.526123,0.0,0.178457,0.193143,0.414146,-0.015659,0.263143,0.001271,0.610018,-0.254433


In [30]:
btc_merged.to_csv("../data/processed/btc_sentiment_aligned.csv", index=False)
print("BTC sentiment-aligned dataset saved.")

BTC sentiment-aligned dataset saved.


In [33]:
def normalize_date(df, col="date"):
    df[col] = pd.to_datetime(df[col], errors="coerce")
    df[col] = df[col].dt.tz_localize(None)
    return df

btc_prices = normalize_date(btc_prices)
nifty_prices = normalize_date(nifty_prices)
daily_sentiment = normalize_date(daily_sentiment)

In [34]:
nifty_merged = pd.merge(
    nifty_prices,
    daily_sentiment[daily_sentiment["asset"] == "NIFTY"],
    on="date",
    how="inner"
)

for lag in [1, 2, 3, 5]:
    nifty_merged[f"finbert_lag_{lag}"] = nifty_merged["finbert_score"].shift(lag)
    nifty_merged[f"vader_lag_{lag}"] = nifty_merged["vader_score"].shift(lag)

nifty_merged.dropna(inplace=True)
nifty_merged.to_csv("../data/processed/nifty_sentiment_aligned.csv", index=False)

## Notes

- Returns are computed using log differences
- Sentiment is aggregated at daily frequency
- Lagged sentiment features enable lead–lag analysis
- No modeling or correlation is performed at this stage
