## 6-Stock Indian Equity Universe
1. RELIANCE
2. HDFCBANK
3. INFY
4. TAMO
5. BHARTIARTL
6. HUL

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import os
import time

In [2]:
STOCKS = ['RELIANCE', 'HDFCBANK', 'INFY', 'M&M', 'BHARTIARTL', 'HINDUNILVR']
YF_TICKERS = {
    'RELIANCE': 'RELIANCE.NS',
    'HDFCBANK': 'HDFCBANK.NS',
    'INFY': 'INFY.NS',
    'M&M': 'M&M.NS',
    'BHARTIARTL': 'BHARTIARTL.NS',
    'HINDUNILVR': 'HINDUNILVR.NS'
}

In [3]:
start_date = '2020-01-01'
end_date = '2025-12-31'
train_end_date = '2025-09-30'
test_start_date = '2025-10-01'

In [4]:
OUTPUT_DIR = '../data/scrapped'
os.makedirs(OUTPUT_DIR, exist_ok=True)
itc = pd.read_csv('../data/scrapped/ITC_Dataset_With_Fundamentals_And_News.csv')

### 1. Market Data (OHLCV): Fetch daily adjusted prices and volume for the period Jan 1, 2020, to Dec 31, 2025
- Keep Oct 2025 -Dec 2025 data separate for final forward testing.
- Source: Yahoo Finance

In [5]:
# Download data
print("Downloading data...")
stocks = []
for stock in STOCKS:
    stocks.append(YF_TICKERS[stock])
data = yf.download(stocks, start=start_date, end=end_date, group_by='ticker', auto_adjust=True, threads=False, progress=False)

output_dir = '../data/scrapped'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'market_data_raw.csv')
data.to_csv(output_file)
print(f"Data saved to {output_file}")

train_data = data.loc[:train_end_date]
test_data = data.loc[test_start_date:]

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Downloading data...
Data saved to ../data/scrapped\market_data_raw.csv
Train data shape: (1425, 30)
Test data shape: (61, 30)


In [6]:
data.describe()

Ticker,HINDUNILVR.NS,HINDUNILVR.NS,HINDUNILVR.NS,HINDUNILVR.NS,HINDUNILVR.NS,HDFCBANK.NS,HDFCBANK.NS,HDFCBANK.NS,HDFCBANK.NS,HDFCBANK.NS,...,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS,M&M.NS,M&M.NS,M&M.NS,M&M.NS,M&M.NS
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
count,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,...,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0
mean,2284.113399,2304.927895,2260.855239,2282.385852,2065980.0,743.424476,750.166428,736.594483,743.42086,25589240.0,...,1344.146778,1357.116583,1330.938198,1344.051624,7554138.0,1563.437918,1583.294949,1543.27901,1563.094857,3747611.0
std,225.620664,225.525354,226.006597,225.547314,4991731.0,133.878114,133.728243,134.771494,134.226086,22985060.0,...,324.362286,326.139731,321.878707,324.143801,4968788.0,995.112181,1006.221337,984.216678,994.960322,2695251.0
min,1671.722135,1716.000046,1587.955221,1661.149536,0.0,364.875772,383.606187,349.86305,363.573425,0.0,...,437.581948,479.471236,437.581948,452.361328,0.0,250.20938,268.827826,233.110789,255.528946,0.0
25%,2137.346521,2156.53863,2115.43461,2134.764587,1189353.0,679.598457,684.823525,672.581181,678.58226,13605460.0,...,1204.212061,1214.378647,1195.999885,1206.897583,4769261.0,761.681896,771.688715,749.051936,760.001465,2136603.0
50%,2309.222175,2330.714966,2290.050993,2307.547485,1597431.0,740.701338,746.807106,734.095403,740.398102,20368010.0,...,1396.88369,1406.068341,1382.294377,1395.131775,6452694.0,1234.557393,1245.960255,1220.060177,1232.194031,2961688.0
75%,2446.26242,2465.973229,2424.930921,2445.61615,2227237.0,810.919091,816.933336,804.216909,810.681793,32078810.0,...,1544.929185,1561.862905,1528.797752,1546.171173,8786073.0,2643.003468,2691.104527,2613.044032,2647.194763,4333060.0
max,2931.155855,2945.714548,2882.578181,2939.454346,185669900.0,1017.5,1020.5,1008.5,1012.900024,445342100.0,...,1938.093361,1948.777171,1920.756459,1942.221191,90432110.0,3790.0,3795.0,3740.0,3757.300049,32986480.0


### 2. Fundamental Data: Extract quarterly metrics (P/E, Debt/Equity, ROE, EPS), etc. 
- Use a suitable method to align quarterly data with daily market timestamps.
- Source: MoneyControl


In [7]:
def get_yahoo_data(ticker_symbol):
    print(f"Fetching Yahoo data for {ticker_symbol}...")
    try:
        ticker = yf.Ticker(ticker_symbol)
        
        # Quarterly Financials (Income Statement)
        fin = ticker.quarterly_financials.T
        bs = ticker.quarterly_balance_sheet.T
        cf = ticker.quarterly_cashflow.T
        
        # Combine
        combined = fin.join(bs, how='outer', lsuffix='_fin', rsuffix='_bs')
        combined = combined.join(cf, how='outer', rsuffix='_cf')
        
        # Convert index to datetime
        combined.index = pd.to_datetime(combined.index).tz_localize(None)
        
        # Get historical price data for valuation ratios
        hist = ticker.history(period="5y")
        hist.index = pd.to_datetime(hist.index).tz_localize(None)
        
        return combined, hist, ticker.info
    except Exception as e:
        print(f"Error fetching Yahoo data for {ticker_symbol}: {e}")
        return pd.DataFrame(), pd.DataFrame(), {}

In [8]:
def calculate_metrics(stock, yf_combined, yf_hist, yf_info):
    rows = []
    
    # helper
    def get_val(series, keys, default=np.nan):
        for k in keys:
            if k in series and not pd.isna(series[k]):
                try:
                    return float(series[k])
                except:
                    pass
        return default

    dates = sorted(yf_combined.index.tolist())
    for date in dates:
        row_data = {'Ticker': stock, 'Date': date}
        yf_row = yf_combined.loc[date]
        
        # Extract Fundamentals
        eps = get_val(yf_row, ['Basic EPS', 'Diluted EPS'])
        net_income = get_val(yf_row, ['Net Income', 'Net Income Common Stockholders'])
        revenue = get_val(yf_row, ['Total Revenue', 'Operating Revenue'])
        total_debt = get_val(yf_row, ['Total Debt', 'Long Term Debt And Capital Lease Obligation']) 
        equity = get_val(yf_row, ['Stockholders Equity', 'Total Equity Gross Minority Interest'])
        assets = get_val(yf_row, ['Total Assets'])
        ebit = get_val(yf_row, ['EBIT', 'Operating Income'])
        
        # EBITDA
        dep_amort = get_val(yf_row, ['Depreciation And Amortization', 'Reconciled Depreciation'])
        if pd.isna(dep_amort): dep_amort = 0
        ebitda = (ebit if not pd.isna(ebit) else 0) + dep_amort
        if ebitda == 0: ebitda = get_val(yf_row, ['EBITDA', 'Normalized EBITDA'])

        fcf = get_val(yf_row, ['Free Cash Flow'])
        inventory = get_val(yf_row, ['Inventory'])
        
        current_assets = get_val(yf_row, ['Current Assets', 'Total Current Assets'])
        current_liabilities = get_val(yf_row, ['Current Liabilities', 'Total Current Liabilities'])
        
        # Price matching
        try:
            # Find price on nearest trading day to quarter end
            idx = yf_hist.index.get_indexer([date], method='nearest')[0]
            # Check difference in days
            match_date = yf_hist.index[idx]
            diff = abs((match_date - date).days)
            if diff < 10:
                close_price = yf_hist.iloc[idx]['Close']
            else:
                close_price = np.nan
        except:
            close_price = np.nan
            
        shares_val = yf_info.get('sharesOutstanding', np.nan)
        shares_outstanding = float(shares_val) if shares_val and not pd.isna(shares_val) else np.nan
        
        metrics = {}
        
        # P/E
        if not pd.isna(close_price) and not pd.isna(eps) and eps != 0:
            metrics['P/E Ratio'] = close_price / (eps * 4) # Annualized EPS estimate
        
        # Debt/Equity
        if not pd.isna(total_debt) and not pd.isna(equity) and equity != 0:
            metrics['Debt-to-Equity'] = total_debt / equity
            
        # ROE
        if not pd.isna(net_income) and not pd.isna(equity) and equity != 0:
            metrics['ROE'] = (net_income / equity) * 4 # Annualized
            
        # ROA
        if not pd.isna(net_income) and not pd.isna(assets) and assets != 0:
            metrics['Return on Assets'] = (net_income / assets) * 4
            
        # Net Profit Margin
        if not pd.isna(net_income) and not pd.isna(revenue) and revenue != 0:
            metrics['Net Profit Margin'] = net_income / revenue
            
        # Operating Margin
        if not pd.isna(ebit) and not pd.isna(revenue) and revenue != 0:
            metrics['Operating Margin'] = ebit / revenue
            
        # Price to Book
        if not pd.isna(close_price) and not pd.isna(equity) and not pd.isna(shares_outstanding) and shares_outstanding != 0:
             book_value_per_share = equity / shares_outstanding
             metrics['Price-to-Book'] = close_price / book_value_per_share
             
        # Current Ratio
        if not pd.isna(current_assets) and not pd.isna(current_liabilities) and current_liabilities != 0:
            metrics['Current Ratio'] = current_assets / current_liabilities
            
        # Asset Turnover
        if not pd.isna(revenue) and not pd.isna(assets) and assets != 0:
             metrics['Asset Turnover'] = (revenue * 4) / assets
             
        # Inventory Turnover
        if not pd.isna(revenue) and not pd.isna(inventory) and inventory != 0:
             metrics['Inventory Turnover'] = (revenue * 4) / inventory
             
        # FCF Yield
        if not pd.isna(fcf) and not pd.isna(close_price) and not pd.isna(shares_outstanding):
             market_cap = close_price * shares_outstanding
             metrics['FCF Yield'] = (fcf * 4) / market_cap if market_cap != 0 else np.nan

        # Raw Data for Growth Calc
        metrics['Revenue'] = revenue
        metrics['EPS'] = eps
        metrics['Net Profit'] = net_income
        
        row_data.update(metrics)
        rows.append(row_data)
        
    return pd.DataFrame(rows)



In [9]:
final_dfs = []
    
for stock in STOCKS:
    yf_sym = YF_TICKERS.get(stock)
    yf_df, yf_hist, yf_info = get_yahoo_data(yf_sym)
    
    if yf_df.empty:
        print(f"Skipping {stock} - No data.")
        continue
        
    df = calculate_metrics(stock, yf_combined=yf_df, yf_hist=yf_hist, yf_info=yf_info)
    
    # Calculate Growth Metrics (Quarter over Quarter or Year over Year)
    # Let's do pct_change() which is sequential (QoQ).
    if not df.empty and 'Date' in df.columns:
        df = df.sort_values('Date')
        df['Revenue Growth'] = df['Revenue'].pct_change(fill_method=None, periods=1)
        df['Earnings Growth'] = df['EPS'].pct_change(fill_method=None, periods=1)
        
    final_dfs.append(df)
    
if final_dfs:
    all_metrics = pd.concat(final_dfs, ignore_index=True)
    # Filter relevant columns
    cols = ['Ticker', 'Date', 'EPS', 'P/E Ratio', 'Debt-to-Equity', 'Revenue Growth', 'ROE', 
            'Return on Assets', 'Net Profit Margin', 'Operating Margin', 'Price-to-Book', 
            'Current Ratio', 'Asset Turnover', 'Inventory Turnover', 'Earnings Growth', 'Net Profit', 'Revenue']
    
    # Keep only cols that exist
    cols = [c for c in cols if c in all_metrics.columns]
    all_metrics = all_metrics[cols]
    
    output_path = os.path.join(OUTPUT_DIR, 'fundamental_data.csv')
    all_metrics.to_csv(output_path, index=False)
    print(f"Successfully saved data to {output_path}")

Fetching Yahoo data for RELIANCE.NS...
Fetching Yahoo data for HDFCBANK.NS...
Fetching Yahoo data for INFY.NS...
Fetching Yahoo data for M&M.NS...
Fetching Yahoo data for BHARTIARTL.NS...
Fetching Yahoo data for HINDUNILVR.NS...
Successfully saved data to ../data/scrapped\fundamental_data.csv


In [10]:
all_metrics.head()

Unnamed: 0,Ticker,Date,EPS,P/E Ratio,Debt-to-Equity,Revenue Growth,ROE,Return on Assets,Net Profit Margin,Operating Margin,Price-to-Book,Current Ratio,Asset Turnover,Inventory Turnover,Earnings Growth,Net Profit,Revenue
0,RELIANCE,2024-06-30,11.19,34.59981,,,,,,,,,,,,,
1,RELIANCE,2024-09-30,12.24,30.038831,0.436297,,0.080849,0.0365,0.071536,0.134122,2.428721,1.080522,0.510235,5.754209,0.093834,165630000000.0,2315350000000.0
2,RELIANCE,2024-12-31,13.7,22.091512,,0.0365,,,0.077255,0.1451,,,,,0.119281,185400000000.0,2399860000000.0
3,RELIANCE,2025-03-31,,,0.438301,0.08918,0.092064,0.039807,0.074246,0.12667,2.002294,1.100351,0.536147,7.158275,,194070000000.0,2613880000000.0
4,RELIANCE,2025-06-30,19.95,18.729707,,-0.06793,,,0.110798,0.181347,,,,,,269940000000.0,2436320000000.0


### 3. Macro Indicators: Inflation, Integrate daily/monthly USD-INR rates, India 10Y Bond Yields, and Crude Oil prices, etc.
- Sources: RBI data, Yahoo Finance


In [5]:
yf_tickers = ['USDINR=X', 'GBPINR=X', 'EURINR=X', 'JPYINR=X', 'BZ=F']
print(f"Downloading Yahoo Finance data for: {', '.join(yf_tickers)}")
macro_daily_raw = yf.download(yf_tickers, start=start_date, end=end_date, progress=False)

if isinstance(macro_daily_raw.columns, pd.MultiIndex):
    macro_daily = macro_daily_raw['Close'].copy()
else:
    macro_daily = macro_daily_raw[['Close']].copy()

rename_map = {
    'USDINR=X': 'USD_INR',
    'GBPINR=X': 'GBP_INR',
    'EURINR=X': 'EUR_INR',
    'JPYINR=X': 'JPY_INR',
    'BZ=F': 'Brent_Crude'
}
macro_daily = macro_daily.rename(columns=rename_map)

print("Downloading FRED data (CSV)...")
# INDCPIALLMINMEI: Inflation (Monthly)
# INDIRLTLT01STM: Central Govt 10Y Bond Yield (Monthly)
# INDIR3TIB01STM: 91-Day T-Bill (Interbank Proxy, Monthly)
# IRSTCB01INM156N: Repo Rate (Monthly, note data may end early)
fred_series = {
    'Inflation': 'INDCPIALLMINMEI',
    'Central_10Y_Yield': 'INDIRLTLT01STM',
    '91D_TBill_Proxy': 'INDIR3TIB01STM',
    'RBI_Repo_Rate_Proxy': 'IRSTCB01INM156N'
}

fred_data_list = []
for name, series_id in fred_series.items():
    url = f'https://fred.stlouisfed.org/graph/fredgraph.csv?id={series_id}'
    try:
        df = pd.read_csv(url)
        df['observation_date'] = pd.to_datetime(df['observation_date'])
        df.set_index('observation_date', inplace=True)
        df.columns = [name]
        fred_data_list.append(df)
    except Exception as e:
        print(f"Error fetching {name} ({series_id}) from FRED: {e}")

if fred_data_list:
    macro_fred = pd.concat(fred_data_list, axis=1)
    daily_index = pd.date_range(start=start_date, end=end_date, freq='D')
    macro_fred_daily = macro_fred.reindex(daily_index).ffill()
else:
    print("Warning: No FRED data downloaded.")
    macro_fred_daily = pd.DataFrame()

print("Combining all macro indicators...")
macro_daily.index = pd.to_datetime(macro_daily.index).tz_localize(None)
macro_combined = pd.concat([macro_daily, macro_fred_daily], axis=1)

macro_combined = macro_combined.loc[start_date:end_date]

macro_output_path = os.path.join(OUTPUT_DIR, 'macro_indicators.csv')
macro_combined.to_csv(macro_output_path)
print(f"Macro indicators saved to {macro_output_path}")

Downloading Yahoo Finance data for: USDINR=X, GBPINR=X, EURINR=X, JPYINR=X, BZ=F
Downloading FRED data (CSV)...
Combining all macro indicators...
Macro indicators saved to ../data/scrapped\macro_indicators.csv


In [6]:
macro_combined.head()

Unnamed: 0,Brent_Crude,EUR_INR,GBP_INR,JPY_INR,USD_INR,Inflation,Central_10Y_Yield,91D_TBill_Proxy,RBI_Repo_Rate_Proxy
2020-01-01,,79.932999,94.487213,0.655997,71.275803,126.2353,6.82,5.07,5.4
2020-01-02,66.25,79.93,94.132767,0.652031,71.025002,126.2353,6.82,5.07,5.4
2020-01-03,68.599998,79.622002,93.837059,0.656595,71.415001,126.2353,6.82,5.07,5.4
2020-01-04,,,,,,126.2353,6.82,5.07,5.4
2020-01-05,,,,,,126.2353,6.82,5.07,5.4


In [7]:
macro = pd.read_csv('../data/scrapped/macro_indicators.csv')

first_col = macro.columns[0]
macro.rename(columns={first_col: 'date'}, inplace=True)
macro['date'] = pd.to_datetime(macro['date'])
itc['date'] = pd.to_datetime(itc['date'])

itc_macro_cols = [
    'date', 
    'US Dollar', 'Pound Sterling', 'Euro', 'Japanese Yen',
    'Central Government Dated Securities', 
    'State Government Dated Securities', 
    '91-Day', '182-Day', '364-Day', 'RBI'
]
itc_subset = itc[itc_macro_cols].copy()

merged_df = pd.merge(macro, itc_subset, on='date', how='left')

merged_df_output_path = os.path.join(OUTPUT_DIR, 'macro_indicators.csv')
merged_df.to_csv(merged_df_output_path)
print(f"Macro indicators saved to {merged_df_output_path}")

Macro indicators saved to ../data/scrapped\macro_indicators.csv


In [8]:
macro = pd.read_csv('../data/scrapped/macro_indicators.csv')
df = macro.drop(columns=['Central_10Y_Yield', '91D_TBill_Proxy', 'RBI_Repo_Rate_Proxy'])

mapping = {
    'USD_INR': 'US Dollar',
    'GBP_INR': 'Pound Sterling',
    'EUR_INR': 'Euro',
    'JPY_INR': 'Japanese Yen'
}

for src, target in mapping.items():
    if src in df.columns:
        if target not in df.columns:
            df[target] = df[src]
        else:
            df[target] = df[target].fillna(df[src])

target_order = [
    'date', 
    'US Dollar', 
    'Pound Sterling', 
    'Euro', 
    'Japanese Yen',
    'Central Government Dated Securities', 
    'State Government Dated Securities', 
    '91-Day', 
    '182-Day', 
    '364-Day', 
    'RBI', 
    'Brent_Crude', 
    'Inflation'
]

clean_df = df[[col for col in target_order if col in df.columns]].copy()

clean_df.to_csv(os.path.join(OUTPUT_DIR, 'macro_indicators.csv'))

print("Cleaning complete. Saved to 'macro_indicators.csv'")

Cleaning complete. Saved to 'macro_indicators.csv'


In [9]:
clean_df.head()

Unnamed: 0,date,US Dollar,Pound Sterling,Euro,Japanese Yen,Central Government Dated Securities,State Government Dated Securities,91-Day,182-Day,364-Day,RBI,Brent_Crude,Inflation
0,2020-01-01,71.275803,94.487213,79.932999,0.655997,,,,,,,,126.2353
1,2020-01-02,71.025002,94.132767,79.93,0.652031,,,,,,,66.25,126.2353
2,2020-01-03,71.415001,93.837059,79.622002,0.656595,,,,,,,68.599998,126.2353
3,2020-01-04,,,,,,,,,,,,126.2353
4,2020-01-05,,,,,,,,,,,,126.2353


### 4. Alternative Data (Sentiment): Scrape or API-fetch financial news headlines. 
- Use a pre-trained transformer model to generate daily sentiment polarity scores.
- Sources: Google News API + FinBERT
