## 6-Stock Indian Equity Universe
1. RELIANCE
2. HDFCBANK
3. INFY
4. TAMO
5. BHARTIARTL
6. HUL

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import os
import time

In [2]:
STOCKS = ['RELIANCE', 'HDFCBANK', 'INFY', 'M&M', 'BHARTIARTL', 'HINDUNILVR']
YF_TICKERS = {
    'RELIANCE': 'RELIANCE.NS',
    'HDFCBANK': 'HDFCBANK.NS',
    'INFY': 'INFY.NS',
    'M&M': 'M&M.NS',
    'BHARTIARTL': 'BHARTIARTL.NS',
    'HINDUNILVR': 'HINDUNILVR.NS'
}

In [3]:
start_date = '2020-01-01'
end_date = '2025-12-31'
train_end_date = '2025-09-30'
test_start_date = '2025-10-01'

In [4]:
OUTPUT_DIR = '../data/scrapped'
os.makedirs(OUTPUT_DIR, exist_ok=True)

### 1. Market Data (OHLCV): Fetch daily adjusted prices and volume for the period Jan 1, 2020, to Dec 31, 2025
- Keep Oct 2025 -Dec 2025 data separate for final forward testing.
- Source: Yahoo Finance

In [5]:
# Download data
print("Downloading data...")
stocks = []
for stock in STOCKS:
    stocks.append(YF_TICKERS[stock])
data = yf.download(stocks, start=start_date, end=end_date, group_by='ticker', auto_adjust=True, threads=False, progress=False)

output_dir = '../data/scrapped'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'market_data_raw.csv')
data.to_csv(output_file)
print(f"Data saved to {output_file}")

train_data = data.loc[:train_end_date]
test_data = data.loc[test_start_date:]

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Downloading data...
Data saved to ../data/scrapped\market_data_raw.csv
Train data shape: (1425, 30)
Test data shape: (61, 30)


In [6]:
data.describe()

Ticker,BHARTIARTL.NS,BHARTIARTL.NS,BHARTIARTL.NS,BHARTIARTL.NS,BHARTIARTL.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,RELIANCE.NS,...,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS,M&M.NS,M&M.NS,M&M.NS,M&M.NS,M&M.NS
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
count,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,...,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0,1486.0
mean,980.434095,990.885126,969.455857,980.128534,10416750.0,1147.842011,1158.962974,1136.265099,1147.187894,18129020.0,...,1344.146775,1357.11658,1330.938196,1344.051621,7554138.0,1563.437919,1583.29495,1543.279011,1563.094858,3747611.0
std,503.657731,507.487505,500.426616,504.04674,12985730.0,240.088988,240.314001,240.114534,240.345687,15165830.0,...,324.362287,326.139732,321.878708,324.143802,4968788.0,995.112179,1006.221336,984.216676,994.960321,2695251.0
min,381.532741,388.200195,345.802415,381.532715,0.0,409.864893,424.985445,391.724735,395.482513,0.0,...,437.581863,479.471143,437.581863,452.361298,0.0,250.20938,268.827826,233.110789,255.528885,0.0
25%,560.403161,568.267864,550.036775,557.71698,4394466.0,997.775441,1009.926259,985.453477,994.736282,9584944.0,...,1204.212089,1214.378709,1195.999976,1206.897614,4769261.0,761.681896,771.688715,749.051951,760.00145,2136603.0
50%,764.403849,773.31279,754.881342,764.158417,6592912.0,1161.403925,1169.942527,1150.965004,1161.183594,13322840.0,...,1396.883689,1406.068341,1382.294498,1395.131714,6452694.0,1234.557393,1245.960254,1220.060208,1232.194031,2961688.0
75%,1419.90083,1427.827377,1402.333755,1415.660156,11866340.0,1318.70824,1323.090739,1300.431217,1316.421875,20373650.0,...,1544.929338,1561.862905,1528.797782,1546.171112,8786073.0,2643.003468,2691.104527,2613.044032,2647.194763,4333060.0
max,2162.100098,2174.5,2149.899902,2162.699951,198547000.0,1592.662006,1596.980166,1573.85128,1589.138184,142683400.0,...,1938.093361,1948.777171,1920.756459,1942.221191,90432110.0,3790.0,3795.0,3740.0,3757.300049,32986480.0


### 2. Fundamental Data: Extract quarterly metrics (P/E, Debt/Equity, ROE, EPS), etc. 
- Use a suitable method to align quarterly data with daily market timestamps.
- Source: MoneyControl


In [7]:
def get_yahoo_data(ticker_symbol):
    print(f"Fetching Yahoo data for {ticker_symbol}...")
    try:
        ticker = yf.Ticker(ticker_symbol)
        
        # Quarterly Financials (Income Statement)
        fin = ticker.quarterly_financials.T
        bs = ticker.quarterly_balance_sheet.T
        cf = ticker.quarterly_cashflow.T
        
        # Combine
        combined = fin.join(bs, how='outer', lsuffix='_fin', rsuffix='_bs')
        combined = combined.join(cf, how='outer', rsuffix='_cf')
        
        # Convert index to datetime
        combined.index = pd.to_datetime(combined.index).tz_localize(None)
        
        # Get historical price data for valuation ratios
        hist = ticker.history(period="5y")
        hist.index = pd.to_datetime(hist.index).tz_localize(None)
        
        return combined, hist, ticker.info
    except Exception as e:
        print(f"Error fetching Yahoo data for {ticker_symbol}: {e}")
        return pd.DataFrame(), pd.DataFrame(), {}

In [8]:
def calculate_metrics(stock, yf_combined, yf_hist, yf_info):
    rows = []
    
    # helper
    def get_val(series, keys, default=np.nan):
        for k in keys:
            if k in series and not pd.isna(series[k]):
                try:
                    return float(series[k])
                except:
                    pass
        return default

    dates = sorted(yf_combined.index.tolist())
    for date in dates:
        row_data = {'Ticker': stock, 'Date': date}
        yf_row = yf_combined.loc[date]
        
        # Extract Fundamentals
        eps = get_val(yf_row, ['Basic EPS', 'Diluted EPS'])
        net_income = get_val(yf_row, ['Net Income', 'Net Income Common Stockholders'])
        revenue = get_val(yf_row, ['Total Revenue', 'Operating Revenue'])
        total_debt = get_val(yf_row, ['Total Debt', 'Long Term Debt And Capital Lease Obligation']) 
        equity = get_val(yf_row, ['Stockholders Equity', 'Total Equity Gross Minority Interest'])
        assets = get_val(yf_row, ['Total Assets'])
        ebit = get_val(yf_row, ['EBIT', 'Operating Income'])
        
        # EBITDA
        dep_amort = get_val(yf_row, ['Depreciation And Amortization', 'Reconciled Depreciation'])
        if pd.isna(dep_amort): dep_amort = 0
        ebitda = (ebit if not pd.isna(ebit) else 0) + dep_amort
        if ebitda == 0: ebitda = get_val(yf_row, ['EBITDA', 'Normalized EBITDA'])

        fcf = get_val(yf_row, ['Free Cash Flow'])
        inventory = get_val(yf_row, ['Inventory'])
        
        current_assets = get_val(yf_row, ['Current Assets', 'Total Current Assets'])
        current_liabilities = get_val(yf_row, ['Current Liabilities', 'Total Current Liabilities'])
        
        # Price matching
        try:
            # Find price on nearest trading day to quarter end
            idx = yf_hist.index.get_indexer([date], method='nearest')[0]
            # Check difference in days
            match_date = yf_hist.index[idx]
            diff = abs((match_date - date).days)
            if diff < 10:
                close_price = yf_hist.iloc[idx]['Close']
            else:
                close_price = np.nan
        except:
            close_price = np.nan
            
        shares_val = yf_info.get('sharesOutstanding', np.nan)
        shares_outstanding = float(shares_val) if shares_val and not pd.isna(shares_val) else np.nan
        
        metrics = {}
        
        # P/E
        if not pd.isna(close_price) and not pd.isna(eps) and eps != 0:
            metrics['P/E Ratio'] = close_price / (eps * 4) # Annualized EPS estimate
        
        # Debt/Equity
        if not pd.isna(total_debt) and not pd.isna(equity) and equity != 0:
            metrics['Debt-to-Equity'] = total_debt / equity
            
        # ROE
        if not pd.isna(net_income) and not pd.isna(equity) and equity != 0:
            metrics['ROE'] = (net_income / equity) * 4 # Annualized
            
        # ROA
        if not pd.isna(net_income) and not pd.isna(assets) and assets != 0:
            metrics['Return on Assets'] = (net_income / assets) * 4
            
        # Net Profit Margin
        if not pd.isna(net_income) and not pd.isna(revenue) and revenue != 0:
            metrics['Net Profit Margin'] = net_income / revenue
            
        # Operating Margin
        if not pd.isna(ebit) and not pd.isna(revenue) and revenue != 0:
            metrics['Operating Margin'] = ebit / revenue
            
        # Price to Book
        if not pd.isna(close_price) and not pd.isna(equity) and not pd.isna(shares_outstanding) and shares_outstanding != 0:
             book_value_per_share = equity / shares_outstanding
             metrics['Price-to-Book'] = close_price / book_value_per_share
             
        # Current Ratio
        if not pd.isna(current_assets) and not pd.isna(current_liabilities) and current_liabilities != 0:
            metrics['Current Ratio'] = current_assets / current_liabilities
            
        # Asset Turnover
        if not pd.isna(revenue) and not pd.isna(assets) and assets != 0:
             metrics['Asset Turnover'] = (revenue * 4) / assets
             
        # Inventory Turnover
        if not pd.isna(revenue) and not pd.isna(inventory) and inventory != 0:
             metrics['Inventory Turnover'] = (revenue * 4) / inventory
             
        # FCF Yield
        if not pd.isna(fcf) and not pd.isna(close_price) and not pd.isna(shares_outstanding):
             market_cap = close_price * shares_outstanding
             metrics['FCF Yield'] = (fcf * 4) / market_cap if market_cap != 0 else np.nan

        # Raw Data for Growth Calc
        metrics['Revenue'] = revenue
        metrics['EPS'] = eps
        metrics['Net Profit'] = net_income
        
        row_data.update(metrics)
        rows.append(row_data)
        
    return pd.DataFrame(rows)



In [9]:
final_dfs = []
    
for stock in STOCKS:
    yf_sym = YF_TICKERS.get(stock)
    yf_df, yf_hist, yf_info = get_yahoo_data(yf_sym)
    
    if yf_df.empty:
        print(f"Skipping {stock} - No data.")
        continue
        
    df = calculate_metrics(stock, yf_combined=yf_df, yf_hist=yf_hist, yf_info=yf_info)
    
    # Calculate Growth Metrics (Quarter over Quarter or Year over Year)
    # Let's do pct_change() which is sequential (QoQ).
    if not df.empty and 'Date' in df.columns:
        df = df.sort_values('Date')
        df['Revenue Growth'] = df['Revenue'].pct_change(fill_method=None, periods=1)
        df['Earnings Growth'] = df['EPS'].pct_change(fill_method=None, periods=1)
        
    final_dfs.append(df)
    
if final_dfs:
    all_metrics = pd.concat(final_dfs, ignore_index=True)
    # Filter relevant columns
    cols = ['Ticker', 'Date', 'EPS', 'P/E Ratio', 'Debt-to-Equity', 'Revenue Growth', 'ROE', 
            'Return on Assets', 'Net Profit Margin', 'Operating Margin', 'Price-to-Book', 
            'Current Ratio', 'Asset Turnover', 'Inventory Turnover', 'Earnings Growth', 'Net Profit', 'Revenue']
    
    # Keep only cols that exist
    cols = [c for c in cols if c in all_metrics.columns]
    all_metrics = all_metrics[cols]
    
    output_path = os.path.join(OUTPUT_DIR, 'fundamental_data.csv')
    all_metrics.to_csv(output_path, index=False)
    print(f"Successfully saved data to {output_path}")

Fetching Yahoo data for RELIANCE.NS...
Fetching Yahoo data for HDFCBANK.NS...
Fetching Yahoo data for INFY.NS...
Fetching Yahoo data for M&M.NS...
Fetching Yahoo data for BHARTIARTL.NS...
Fetching Yahoo data for HINDUNILVR.NS...
Successfully saved data to ../data/scrapped\fundamental_data.csv


In [10]:
all_metrics.head()

Unnamed: 0,Ticker,Date,EPS,P/E Ratio,Debt-to-Equity,Revenue Growth,ROE,Return on Assets,Net Profit Margin,Operating Margin,Price-to-Book,Current Ratio,Asset Turnover,Inventory Turnover,Earnings Growth,Net Profit,Revenue
0,RELIANCE,2024-06-30,11.19,34.59981,,,,,,,,,,,,,
1,RELIANCE,2024-09-30,12.24,30.038831,0.436297,,0.080849,0.0365,0.071536,0.134122,2.428721,1.080522,0.510235,5.754209,0.093834,165630000000.0,2315350000000.0
2,RELIANCE,2024-12-31,13.7,22.091512,,0.0365,,,0.077255,0.1451,,,,,0.119281,185400000000.0,2399860000000.0
3,RELIANCE,2025-03-31,,,0.438301,0.08918,0.092064,0.039807,0.074246,0.12667,2.002294,1.100351,0.536147,7.158275,,194070000000.0,2613880000000.0
4,RELIANCE,2025-06-30,19.95,18.729707,,-0.06793,,,0.110798,0.181347,,,,,,269940000000.0,2436320000000.0


### 3. Macro Indicators: Inflation, Integrate daily/monthly USD-INR rates, India 10Y Bond Yields, and Crude Oil prices, etc.
- Sources: RBI data, Yahoo Finance


### 4. Alternative Data (Sentiment): Scrape or API-fetch financial news headlines. 
- Use a pre-trained transformer model to generate daily sentiment polarity scores.
- Sources: Google News API + FinBERT
