In [5]:
# Asset registry: short asset key -> lookup info for that asset.
#   "symbol"  - ticker string that fetch_price() passes to yf.download().
#   "keyword" - free-text phrase describing the asset; it is not read by the
#               price-download code in this section (presumably consumed by a
#               news/text collection step elsewhere - TODO confirm).
mapping = {
    # ===== EQUITIES =====
    "RELIANCE": {
        "keyword": "Reliance Industries India",
        "symbol": "RELIANCE.NS"
    },
    "ITC": {
        "keyword": "ITC Limited India",
        "symbol": "ITC.NS"
    },
    "HDFCBANK": {
        "keyword": "HDFC Bank India",
        "symbol": "HDFCBANK.NS"
    },
    "LIC": {
        "keyword": "Life Insurance Corporation of India insurance sector",
        "symbol": "LICI.NS"
    },
    "TATASTEEL": {
        "keyword": "Tata Steel India steel sector metal stocks",
        "symbol": "TATASTEEL.NS"
    },
    "APOLLO": {
        "keyword": "Apollo Hospitals India healthcare hospital sector",
        "symbol": "APOLLOHOSP.NS"
    },
    "VBL": {
        "keyword": "Varun Beverages India FMCG beverage industry",
        "symbol": "VBL.NS"
    },
    # ===== COMMODITIES / MACRO =====
    "GOLD": {
        "keyword": "precious metals market gold prices",
        "symbol": "GC=F"
    },
    "SILVER": {
        "keyword": "precious metals market silver prices",
        "symbol": "SI=F"
    },
    "COPPER": {
        "keyword": "base metals market copper prices",
        "symbol": "HG=F"
    },
    "CRUDEOIL": {
        "keyword": "crude oil prices global energy market",
        "symbol": "CL=F"
    }
}

## Data Collection

In [3]:
from pathlib import Path

import pandas as pd
import yfinance as yf

In [30]:
def fetch_price(asset, start, end):
    """Download price history for one asset registered in ``mapping``.

    Parameters
    ----------
    asset : str
        Key into the module-level ``mapping`` dict; its ``"symbol"`` entry is
        the ticker sent to yfinance. An unknown key raises ``KeyError``.
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        The downloaded frame with the date index moved into a ``"Date"``
        column (converted with ``pd.to_datetime``) and an ``"asset"`` column
        holding ``asset``. ``auto_adjust=False`` is requested, so the raw
        close and the adjusted close are both kept. Column labels are left
        exactly as yfinance returned them (they may be a MultiIndex);
        ``standardize_df`` flattens them.
    """
    ticker = mapping[asset]["symbol"]

    prices = yf.download(ticker, start=start, end=end, auto_adjust=False)

    # Turn the date index into an ordinary column, then tag every row.
    prices = prices.reset_index()
    prices["asset"] = asset
    prices["Date"] = pd.to_datetime(prices["Date"])

    return prices

In [175]:
re_df = fetch_price("RELIANCE","2023-01-01","2026-01-01")
re_df.head()

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume,asset
Ticker,Unnamed: 1_level_1,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,Unnamed: 8_level_1
0,2023-01-02,1175.889648,1188.777466,1190.20813,1175.993896,1176.824585,5316175,RELIANCE
1,2023-01-03,1167.284668,1180.078125,1187.439087,1175.809326,1183.770142,7658932,RELIANCE
2,2023-01-04,1149.709595,1162.310425,1181.924194,1160.210571,1180.055054,9264891,RELIANCE
3,2023-01-05,1147.655396,1160.233643,1170.548218,1155.595581,1164.594849,13637099,RELIANCE
4,2023-01-06,1158.086304,1170.778931,1175.87854,1162.195068,1166.048584,6349597,RELIANCE


In [176]:
re_df.shape

(740, 8)

In [37]:
re_df.dtypes

Price      Ticker     
Date                      datetime64[ns]
Adj Close  RELIANCE.NS           float64
Close      RELIANCE.NS           float64
High       RELIANCE.NS           float64
Low        RELIANCE.NS           float64
Open       RELIANCE.NS           float64
Volume     RELIANCE.NS             int64
asset                             object
dtype: object

In [84]:
def standardize_df(df):
    """Return a flat, lower-cased, date-sorted copy of a price frame.

    Steps, applied to a copy so the caller's frame is untouched:

    1. If the columns are a ``MultiIndex``, drop level 1 (the ticker level in
       the yfinance output shown in this notebook).
    2. Reset the index; the helper ``"index"`` column that a plain positional
       index produces is discarded, while a named date index becomes a column.
    3. Parse ``"Date"`` as datetimes (unparseable values become ``NaT``).
    4. Lower-case every column name and sort the rows by ``"date"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a ``"Date"`` column or a ``"Date"``-named index.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index.
    """
    frame = df.copy()

    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.droplevel(1)

    # errors="ignore" makes the drop a no-op when no "index" column was created.
    frame = frame.reset_index().drop(columns=["index"], errors="ignore")

    frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
    frame.columns = frame.columns.str.lower()

    return frame.sort_values("date").reset_index(drop=True)

In [177]:
re_df = standardize_df(re_df)
re_df.head()

Price,date,adj close,close,high,low,open,volume,asset
0,2023-01-02,1175.889648,1188.777466,1190.20813,1175.993896,1176.824585,5316175,RELIANCE
1,2023-01-03,1167.284668,1180.078125,1187.439087,1175.809326,1183.770142,7658932,RELIANCE
2,2023-01-04,1149.709595,1162.310425,1181.924194,1160.210571,1180.055054,9264891,RELIANCE
3,2023-01-05,1147.655396,1160.233643,1170.548218,1155.595581,1164.594849,13637099,RELIANCE
4,2023-01-06,1158.086304,1170.778931,1175.87854,1162.195068,1166.048584,6349597,RELIANCE


In [40]:
def save(df, asset, out_dir="../data/raw/market", index=True):
    """Write ``df`` to ``<out_dir>/<asset>.csv``, creating the folder if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    asset : str
        Used as the file stem.
    out_dir : str or path-like, optional
        Destination folder. Defaults to the notebook's raw-market folder.
    index : bool, optional
        Forwarded to ``DataFrame.to_csv``. The default ``True`` keeps the
        existing on-disk format, in which the row index is written as a
        leading unnamed column (``pd.read_csv`` reads it back as
        ``"Unnamed: 0"``). Pass ``False`` to omit it.
    """
    target_dir = Path(out_dir)
    # Without this, a fresh checkout with no data folder fails with OSError.
    target_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(target_dir / f"{asset}.csv", index=index)

In [112]:
# Download, standardise and persist the price history of every asset in
# `mapping`. Only the key is needed here: fetch_price() looks the ticker up
# itself, so the per-asset config dict is not unpacked.
for asset in mapping:
    df = fetch_price(asset, "2023-01-01", "2026-01-01")
    df = standardize_df(df)
    save(df, asset)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [127]:
def load(asset, data_dir="../data/raw/market"):
    """Read ``<data_dir>/<asset>.csv`` back into a DataFrame.

    CSV stores dates as text, so the ``"date"`` column (the lower-cased name
    produced by ``standardize_df``) is converted back to datetimes when it is
    present; unparseable values become ``NaT``. Without this, the column is
    plain strings and cannot be merged against datetime-typed frames.

    Parameters
    ----------
    asset : str
        File stem of the CSV to read.
    data_dir : str or path-like, optional
        Folder holding the CSV. Defaults to the notebook's raw-market folder.

    Returns
    -------
    pandas.DataFrame
        All columns of the file as read by ``pd.read_csv`` (including any
        ``"Unnamed: 0"`` index column the file contains).
    """
    df = pd.read_csv(Path(data_dir) / f"{asset}.csv")
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    return df

## Feature Engineering

In [45]:
import numpy as np

In [46]:
def returns(df):
    """Add a log-return column to ``df`` in place.

    ``"close"`` is first coerced to numeric (non-numeric entries become NaN)
    and written back; ``"return"`` is then ``log(close_t / close_{t-1})``.
    The first row has no predecessor, so its return is NaN.

    Modifies ``df`` and returns ``None``.
    """
    close = pd.to_numeric(df["close"], errors="coerce")
    df["close"] = close

    previous_close = close.shift(1)
    df["return"] = np.log(close / previous_close)

In [178]:
returns(re_df)
re_df.head()

Price,date,adj close,close,high,low,open,volume,asset,return
0,2023-01-02,1175.889648,1188.777466,1190.20813,1175.993896,1176.824585,5316175,RELIANCE,
1,2023-01-03,1167.284668,1180.078125,1187.439087,1175.809326,1183.770142,7658932,RELIANCE,-0.007345
2,2023-01-04,1149.709595,1162.310425,1181.924194,1160.210571,1180.055054,9264891,RELIANCE,-0.015171
3,2023-01-05,1147.655396,1160.233643,1170.548218,1155.595581,1164.594849,13637099,RELIANCE,-0.001788
4,2023-01-06,1158.086304,1170.778931,1175.87854,1162.195068,1166.048584,6349597,RELIANCE,0.009048


In [55]:
# relative strength index
def rsi(df, window=14, center=60, scale=10):
    """Add ``"rsi"`` and ``"rsi_sentiment"`` columns to ``df`` in place.

    The RSI here uses simple rolling means of gains and losses over
    ``window`` rows (no exponential/Wilder smoothing):

        rs  = mean(gains) / mean(losses)
        rsi = 100 - 100 / (1 + rs)

    The first ``window`` rows are NaN. A window with no losses yields 100;
    a window with neither gains nor losses yields NaN (0 / 0).

    ``rsi_sentiment`` squashes the RSI into (-1, 1) with
    ``tanh((rsi - center) / scale)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``"close"`` column.
    window : int, optional
        Look-back length in rows.
    center : float, optional
        RSI level mapped to a sentiment of 0. The default of 60 reproduces
        the previously hard-coded value.
    scale : float, optional
        RSI points per unit of the tanh argument; must be non-zero.

    Raises
    ------
    ValueError
        If ``scale`` is zero.

    Modifies ``df`` and returns ``None``.
    """
    if scale == 0:
        raise ValueError("scale must be non-zero")

    delta = df["close"].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean()

    rs = avg_gain / avg_loss
    # Named rsi_values so the local does not shadow this function's own name.
    rsi_values = 100 - (100 / (1 + rs))

    df["rsi"] = rsi_values
    df["rsi_sentiment"] = np.tanh((df["rsi"] - center) / scale)

In [179]:
rsi(re_df)
re_df.value_counts()

date        adj close    close        high         low          open         volume    asset     return     rsi        rsi_sentiment
2025-12-31  1570.400024  1570.400024  1577.000000  1541.000000  1541.000000  5771830   RELIANCE   0.019678  62.398237   0.235329        1
2023-01-20  1115.061523  1127.282593  1142.212036  1124.790527  1142.212036  14930286  RELIANCE  -0.011964  31.902747  -0.992773        1
2023-01-23  1109.423706  1121.583008  1138.150879  1119.137085  1130.213135  10954118  RELIANCE  -0.005069  32.476142  -0.991898        1
2023-01-24  1102.873047  1114.960571  1127.744019  1101.761597  1126.059570  16488755  RELIANCE  -0.005922  34.800001  -0.987136        1
2023-01-25  1087.625977  1099.546387  1114.383667  1098.369629  1113.345337  12385018  RELIANCE  -0.013921  32.055119  -0.992550        1
                                                                                                                                       ..
2023-02-27  1080.915527  1092.762329  1

In [57]:
# exponential moving average
def ema(df):
    """Add EMA trend columns to ``df`` in place.

    Columns written, in this order:

    * ``ema_5`` / ``ema_13`` - exponential moving averages of ``"close"``
      with spans 5 and 13 (recursive form, ``adjust=False``).
    * ``ema_diff`` - gap between the two, relative to ``ema_13``.
    * ``ema_sentiment`` - ``ema_diff`` standardised with the mean and
      standard deviation of the whole column.

    Modifies ``df`` and returns ``None``.
    """
    close = df["close"]
    fast = close.ewm(span=5, adjust=False).mean()
    slow = close.ewm(span=13, adjust=False).mean()

    df["ema_5"] = fast
    df["ema_13"] = slow

    spread = (fast - slow) / slow
    df["ema_diff"] = spread
    df["ema_sentiment"] = (spread - spread.mean()) / spread.std()

In [180]:
ema(re_df)
re_df.sample(10)

Price,date,adj close,close,high,low,open,volume,asset,return,rsi,rsi_sentiment,ema_5,ema_13,ema_diff,ema_sentiment
37,2023-02-23,1080.755737,1092.60083,1105.176758,1089.762695,1097.446655,9851910,RELIANCE,-0.004804,55.683493,-0.4067,1104.300767,1103.081397,0.001105,-0.022034
313,2024-04-12,1456.370728,1467.150024,1486.474976,1463.074951,1475.875,15551012,RELIANCE,-0.008433,56.169273,-0.365373,1471.402539,1469.464826,0.001319,-0.003598
674,2025-09-25,1372.400024,1372.400024,1384.5,1369.0,1381.300049,11411143,RELIANCE,-0.007694,48.50919,-0.817449,1385.797371,1389.947249,-0.002986,-0.375757
366,2024-07-02,1553.675659,1565.175049,1573.5,1557.0,1569.5,7749246,RELIANCE,0.003216,73.773011,0.880345,1542.899667,1507.72126,0.023332,1.899744
99,2023-05-30,1150.348755,1162.956543,1171.148193,1160.37207,1163.25647,11342504,RELIANCE,-0.000258,55.903924,-0.388139,1151.907507,1140.872587,0.009672,0.718683
499,2025-01-13,1234.917847,1239.849976,1245.25,1226.400024,1230.0,13764861,RELIANCE,-0.001652,55.215428,-0.445007,1243.708883,1241.146461,0.002065,0.060895
411,2024-09-05,1487.035889,1492.974976,1525.775024,1487.050049,1519.050049,16264168,RELIANCE,-0.014348,55.934926,-0.385503,1505.369929,1502.824273,0.001694,0.028848
580,2025-05-14,1418.733765,1424.400024,1429.900024,1415.5,1420.0,5570160,RELIANCE,0.006127,72.325902,0.843329,1416.067363,1390.371704,0.018481,1.480312
201,2023-10-26,1105.070801,1113.25,1129.0,1110.150024,1125.5,15222534,RELIANCE,-0.014026,30.099897,-0.994955,1133.013116,1149.943975,-0.014723,-1.390615
705,2025-11-12,1511.5,1511.5,1524.699951,1500.0,1505.0,12384459,RELIANCE,0.012047,62.983763,0.289826,1495.769284,1480.10881,0.010581,0.797214


In [91]:
# volume
def vol(df):
    """Add volume-based features to ``df`` in place.

    Requires ``"volume"`` and ``"return"`` columns. Columns written:

    * ``volume_mean`` / ``volume_std`` - 20-row rolling mean and standard
      deviation of volume (NaN for the first 19 rows).
    * ``volume_z`` - volume's deviation from that mean in rolling-std units.
    * ``volume_sent`` - ``volume_z`` clipped to [-3, 3] and divided by 3, so
      it lies in [-1, 1].
    * ``volume_price_pressure`` - ``return`` multiplied by ``volume_sent``.

    Modifies ``df`` and returns ``None``.
    """
    window = df["volume"].rolling(20)
    df["volume_mean"] = window.mean()
    df["volume_std"] = window.std()

    z_score = (df["volume"] - df["volume_mean"]) / df["volume_std"]
    df["volume_z"] = z_score

    df["volume_sent"] = z_score.clip(-3, 3) / 3
    df["volume_price_pressure"] = df["return"] * df["volume_sent"]

In [181]:
vol(re_df)
re_df.sample(8)

Price,date,adj close,close,high,low,open,volume,asset,return,rsi,rsi_sentiment,ema_5,ema_13,ema_diff,ema_sentiment,volume_mean,volume_std,volume_z,volume_sent,volume_price_pressure
621,2025-07-10,1511.164551,1517.199951,1524.699951,1507.5,1519.699951,10047129,RELIANCE,-0.001186,64.072851,0.386165,1522.904954,1508.606381,0.009478,0.701879,10317007.7,5155736.0,-0.052345,-0.017448,2.1e-05
608,2025-06-23,1451.004883,1456.800049,1463.800049,1442.0,1453.0,5989078,RELIANCE,-0.006432,67.319384,0.62425,1448.509145,1440.274241,0.005718,0.376745,10430764.55,5019062.0,-0.884963,-0.294988,0.001897
237,2023-12-19,1269.65271,1279.050049,1286.949951,1262.599976,1277.5,16820604,RELIANCE,0.014609,77.594654,0.942443,1253.843934,1232.810475,0.017061,1.357557,12718769.3,5078787.0,0.807641,0.269214,0.003933
416,2024-09-12,1473.91333,1479.800049,1486.0,1445.875,1457.0,22349376,RELIANCE,0.019309,44.113811,-0.919937,1471.471361,1482.353547,-0.007341,-0.752346,14271000.8,9636863.0,0.838278,0.279426,0.005396
449,2024-10-30,1338.553955,1343.900024,1350.0,1325.349976,1335.0,11984423,RELIANCE,0.002906,38.205895,-0.974736,1340.646828,1353.239953,-0.009306,-0.922223,18750557.3,8532735.0,-0.792962,-0.264321,-0.000768
490,2024-12-31,1210.614868,1215.449951,1219.099976,1206.150024,1208.0,6405475,RELIANCE,0.003916,28.121015,-0.996601,1217.901081,1234.165671,-0.013179,-1.257066,13876762.75,5783368.0,-1.291857,-0.430619,-0.001686
603,2025-06-16,1432.080444,1437.800049,1442.400024,1424.400024,1426.900024,8458093,RELIANCE,0.006909,56.156886,-0.366446,1437.241264,1433.382867,0.002692,0.115129,9370961.55,2771174.0,-0.329416,-0.109805,-0.000759
419,2024-09-17,1466.443237,1472.300049,1477.474976,1466.625,1474.0,5935328,RELIANCE,0.000646,41.55023,-0.951271,1471.891525,1478.548885,-0.004503,-0.50692,13696099.7,9940430.0,-0.780728,-0.260243,-0.000168


In [173]:
# volatility features: rolling std of the asset's returns and of the ^NSEI index's returns (no VIX quote is read)
def vix(df, market=None, start="2023-01-01", end="2026-02-01"):
    """Attach asset-level and market-level volatility features.

    Despite the name, no VIX quote is used: both measures are the 20-row
    rolling standard deviation of log returns. Each is also turned into a
    "sentiment" by taking the negated z-score over the whole column, so
    lower-than-average volatility gives a positive value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``"date"`` (datetime) and ``"return"`` columns.
    market : pandas.DataFrame, optional
        Index price history in the shape ``yf.download`` returns (a date
        index or ``"Date"``/``"date"`` column plus a ``"Close"``/``"close"``
        column; MultiIndex columns are flattened). When omitted, ``^NSEI`` is
        downloaded for ``start``..``end``. Supplying it avoids one network
        call per asset. The passed frame is not modified.
    start, end : str, optional
        Download range, used only when ``market`` is None. The defaults are
        the previously hard-coded dates.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` left-joined on ``"date"`` with
        ``market_volatility`` and ``market_vol_sentiment``, forward-filled
        across dates missing from the market data.

    Notes
    -----
    ``asset_volatility`` and ``asset_vol_sentiment`` are written into the
    caller's ``df`` as well as appearing in the returned frame; that in-place
    write is kept for backward compatibility. Any ``market_*`` columns
    already on ``df`` are replaced, so re-running this on its own output
    works instead of failing on ``_x``/``_y`` suffixed duplicates.
    """
    market_cols = ["market_volatility", "market_vol_sentiment"]

    df["asset_volatility"] = df["return"].rolling(20).std()
    df["asset_vol_sentiment"] = -(df["asset_volatility"] - df["asset_volatility"].mean()) / df["asset_volatility"].std()

    if market is None:
        market = yf.download("^NSEI", start=start, end=end, progress=False, auto_adjust=True)

    # Work on a copy so a caller-supplied (possibly cached) frame stays intact.
    nifty = market.copy()
    if isinstance(nifty.columns, pd.MultiIndex):
        nifty.columns = nifty.columns.droplevel(1)

    nifty = nifty.reset_index()
    nifty.columns = nifty.columns.str.lower()
    nifty["date"] = pd.to_datetime(nifty["date"])

    nifty["close"] = pd.to_numeric(nifty["close"], errors="coerce")
    nifty["return"] = np.log(nifty["close"] / nifty["close"].shift(1))
    nifty["market_volatility"] = nifty["return"].rolling(20).std()

    nifty["market_vol_sentiment"] = -(nifty["market_volatility"] - nifty["market_volatility"].mean()) / nifty["market_volatility"].std()
    nifty = nifty[["date"] + market_cols]

    # Dropping stale market columns first keeps the merge free of _x/_y suffixes.
    merged = df.drop(columns=market_cols, errors="ignore").merge(
        nifty,
        on="date",
        how="left"
    )

    merged[market_cols] = merged[market_cols].ffill()

    return merged

In [182]:
re_df = vix(re_df)
re_df.shape

(740, 24)

In [149]:
# Feature columns that must all be non-NaN before a row counts as usable;
# passed to drop_initial_nans() and used for the NaN count check.
cols = [
    "return", "rsi", "ema_5", "ema_13", "volume_sent",
    "asset_volatility", "asset_vol_sentiment", "market_vol_sentiment"
]

def drop_initial_nans(df, cols):
    """Trim the leading rows that come before the first fully populated row.

    A row is "fully populated" when every column listed in ``cols`` is
    non-NaN. Everything before the first such row is removed; rows after it
    are kept even if they contain NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of str
        Columns that must all be non-NaN.

    Returns
    -------
    pandas.DataFrame
        A copy starting at the first fully populated row, re-indexed from 0.

    Raises
    ------
    ValueError
        If no row has all of ``cols`` populated.
    """
    is_complete = df[cols].notna().all(axis=1)

    if not is_complete.any():
        raise ValueError("No row has all required columns non-NaN")

    # idxmax on a boolean Series gives the label of the first True.
    first_complete = is_complete.idxmax()
    trimmed = df.copy().loc[first_complete:]

    return trimmed.reset_index(drop=True)

In [184]:
re_df = drop_initial_nans(re_df, cols)
re_df.head()

Price,date,adj close,close,high,low,open,volume,asset,return,rsi,...,ema_sentiment,volume_mean,volume_std,volume_z,volume_sent,volume_price_pressure,asset_volatility,asset_vol_sentiment,market_volatility,market_vol_sentiment
0,2023-01-31,1074.524536,1086.301392,1101.900146,1080.371094,1099.292603,22691594,RELIANCE,-0.002503,17.913746,...,-1.842711,14015668.35,5337007.0,1.625616,0.541872,-0.001357,0.011529,0.151048,0.00749,-0.149232
1,2023-02-01,1068.156494,1079.863525,1098.346558,1063.75708,1098.346558,19347800,RELIANCE,-0.005944,19.004846,...,-1.828352,14600111.75,5243449.0,0.905451,0.301817,-0.001794,0.011515,0.154699,0.00745,-0.134524
2,2023-02-02,1062.244751,1073.887085,1083.601562,1066.526123,1069.756592,13662663,RELIANCE,-0.00555,22.059145,...,-1.830836,14820000.35,5098134.0,-0.227012,-0.075671,0.00042,0.011241,0.226767,0.007164,-0.030548
3,2023-02-03,1063.180664,1074.83313,1084.06311,1058.219116,1084.06311,24699575,RELIANCE,0.000881,23.024532,...,-1.73842,15373124.15,5543685.0,1.682356,0.560785,0.000494,0.011284,0.215509,0.007889,-0.293837
4,2023-02-06,1055.169067,1066.733765,1071.141113,1064.126343,1068.37207,14837879,RELIANCE,-0.007564,23.570426,...,-1.732833,15797538.25,5125664.0,-0.187226,-0.062409,0.000472,0.010891,0.318684,0.007795,-0.259681


In [185]:
re_df.shape

(720, 24)

In [183]:
re_df[cols].isna().sum()

Price
return                   1
rsi                     14
ema_5                    0
ema_13                   0
volume_sent             19
asset_volatility        20
asset_vol_sentiment     20
market_vol_sentiment    20
dtype: int64