In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import os
from datetime import date, timedelta

In [2]:
# Stock tickers to analyse. Each base symbol gets the ".NS" suffix appended,
# producing the strings that are later handed to yfinance.
# NOTE(review): the name suggests 50 constituents, but 44 are listed here -
# confirm whether the remaining ones were left out on purpose.
nifty50 = [
    f"{base_symbol}.NS"
    for base_symbol in (
        "RELIANCE", "TCS", "INFY", "HDFCBANK", "ICICIBANK", "SBIN", "AXISBANK", "LT",
        "HINDUNILVR", "ITC", "KOTAKBANK", "BAJFINANCE", "BAJAJFINSV", "BHARTIARTL", "CIPLA",
        "COALINDIA", "DRREDDY", "EICHERMOT", "GRASIM", "HCLTECH", "HDFCLIFE", "HEROMOTOCO",
        "HINDALCO", "INDUSINDBK", "JSWSTEEL", "M&M", "MARUTI", "NESTLEIND", "NTPC", "ONGC",
        "POWERGRID", "SBILIFE", "SHRIRAMFIN", "SUNPHARMA", "TATACONSUM", "TATAMOTORS",
        "TATASTEEL", "TECHM", "TITAN", "ULTRACEMCO", "WIPRO", "ADANIENT", "ADANIPORTS", "JIOFIN",
    )
]

# Candidate benchmark indices: display name -> ticker symbol. Only the ticker
# values are used further down in this notebook; insertion order is kept.
benchmark_indices = dict(
    [
        ("NIFTY 50", "^NSEI"),
        ("NIFTY BANK", "^NSEBANK"),
        ("NIFTY IT", "^CNXIT"),
        ("NIFTY FMCG", "^CNXFMCG"),
        ("NIFTY AUTO", "^CNXAUTO"),
        ("NIFTY PHARMA", "^CNXPHARMA"),
    ]
)

In [5]:
# Every ticker to download in one request: the stocks first, then the index tickers.
symbols = [*nifty50, *benchmark_indices.values()]

In [7]:
# Look-back window: the 365 calendar days leading up to today.
end_date = date.today()
start_date = end_date - timedelta(days=365)

# Fetch all symbols in a single call.
# NOTE(review): auto_adjust=True presumably yields split/dividend-adjusted
# prices - confirm against the yfinance documentation.
raw_download = yf.download(
    symbols,
    start=start_date,
    end=end_date,
    auto_adjust=True,
    progress=True,
)

# Keep only the "Close" field and discard dates on which every symbol is missing.
data = raw_download["Close"].dropna(how="all")


[*********************100%***********************]  50 of 50 completed


In [11]:
# A price column with no observations at all would make the row-wise dropna()
# calls below discard every single date (they drop a row as soon as any column
# is NaN), leaving empty return frames and no regressions. Set such columns
# aside first and report them instead of failing silently.
empty_columns = data.columns[data.isna().all().to_numpy()]
if len(empty_columns) > 0:
    print(f"⚠️ No price data for: {', '.join(map(str, empty_columns))} - excluded from the analysis")
prices = data.drop(columns=empty_columns)

# Split the remaining columns into the stock universe and the benchmark indices.
available_stocks = prices.columns.intersection(nifty50)
available_indices = prices.columns.intersection(list(benchmark_indices.values()))

# Simple daily returns. fill_method=None keeps gaps as NaN rather than
# forward-filling prices; dropna() then keeps only the dates on which every
# column of the frame has a return.
# NOTE(review): a column with a short history still shortens the window for
# all other columns - confirm that a common sample is the intended design.
stock_returns = prices[available_stocks].pct_change(fill_method=None).dropna()
index_returns = prices[available_indices].pct_change(fill_method=None).dropna()

# Align both frames on the dates they share.
common_dates = stock_returns.index.intersection(index_returns.index)
stock_returns = stock_returns.loc[common_dates]
index_returns = index_returns.loc[common_dates]

In [14]:
# Folder that receives the regression scatter plots; nothing happens if it already exists.
os.makedirs(name="plots", exist_ok=True)

In [22]:
# Regress every stock's daily returns on every index's daily returns
# (stock = alpha + beta * index), store the fit statistics of each pair in
# `results`, and save one scatter plot per pair under plots/.

# An index is only eligible as a stock's "best" benchmark when the fitted beta
# and the correlation fall inside these bounds.
MAX_ABS_BETA = 3
MIN_CORRELATION = 0.3
# With fewer observations the fit is either undefined or trivially perfect.
MIN_OBSERVATIONS = 3

results = []
for stock in stock_returns.columns:
    best_r2 = -1  # sentinel below any real R2; never printed
    best_index = None
    for index_name in index_returns.columns:
        # Column 0 = stock return, column 1 = index return, restricted to the
        # dates on which both are present.
        pair = pd.concat([stock_returns[stock], index_returns[index_name]], axis=1).dropna()
        if len(pair) < MIN_OBSERVATIONS:
            continue

        X = pair.iloc[:, 1].values.reshape(-1, 1)
        y = pair.iloc[:, 0].values.reshape(-1, 1)

        try:
            model = LinearRegression()
            model.fit(X, y)
            alpha = model.intercept_[0]
            beta = model.coef_[0][0]
            r2 = model.score(X, y)
            corr = np.corrcoef(X.ravel(), y.ravel())[0, 1]
        except Exception as e:
            print(f" Regression failed for {stock} vs {index_name}: {e}")
            continue

        results.append({
            "Stock": stock,
            "Index": index_name,
            "Alpha": alpha,
            "Beta": beta,
            "R2": r2,
            "Correlation": corr
        })

        # A NaN correlation fails the comparison and therefore never qualifies.
        if (
            r2 > best_r2 and
            abs(beta) <= MAX_ABS_BETA and
            corr >= MIN_CORRELATION
        ):
            best_r2 = r2
            best_index = index_name

        # Save plot. Handled separately from the fit so that a plotting problem
        # is not reported as a regression failure, and the figure is always
        # released even when drawing or saving raises.
        fig = plt.figure(figsize=(6, 4))
        try:
            sns.regplot(x=X.ravel(), y=y.ravel(), line_kws={'color': 'red'})
            plt.title(f"{stock} vs {index_name}")
            plt.xlabel("Index Return")
            plt.ylabel("Stock Return")
            plt.tight_layout()
            plt.savefig(f"plots/{stock.replace('.NS','')}_{index_name}.png")
        except Exception as e:
            print(f" Plot failed for {stock} vs {index_name}: {e}")
        finally:
            plt.close(fig)

    if best_index is None:
        # Previously this case printed "None (R2 = -1.0000)", exposing the sentinel.
        print(f"⚠️ No index met the selection criteria for {stock}")
    else:
        print(f"✅ Best index for {stock}: {best_index} (R2 = {best_r2:.4f})")


✅ Best index for ADANIENT.NS: ^NSEI (R2 = 0.2688)
✅ Best index for ADANIPORTS.NS: ^NSEI (R2 = 0.3800)
✅ Best index for AXISBANK.NS: ^NSEBANK (R2 = 0.4894)
✅ Best index for BAJAJFINSV.NS: ^NSEI (R2 = 0.3793)
✅ Best index for BAJFINANCE.NS: ^NSEI (R2 = 0.3033)
✅ Best index for BHARTIARTL.NS: ^NSEI (R2 = 0.3437)
✅ Best index for CIPLA.NS: ^CNXPHARMA (R2 = 0.5047)
✅ Best index for COALINDIA.NS: ^NSEI (R2 = 0.3551)
✅ Best index for DRREDDY.NS: ^CNXPHARMA (R2 = 0.4014)
✅ Best index for EICHERMOT.NS: ^CNXAUTO (R2 = 0.4064)
✅ Best index for GRASIM.NS: ^NSEI (R2 = 0.4002)
✅ Best index for HCLTECH.NS: ^CNXIT (R2 = 0.6580)
✅ Best index for HDFCBANK.NS: ^NSEBANK (R2 = 0.6176)
✅ Best index for HDFCLIFE.NS: ^NSEI (R2 = 0.2079)
✅ Best index for HEROMOTOCO.NS: ^CNXAUTO (R2 = 0.4132)
✅ Best index for HINDALCO.NS: ^NSEI (R2 = 0.3514)
✅ Best index for HINDUNILVR.NS: ^CNXFMCG (R2 = 0.5886)
✅ Best index for ICICIBANK.NS: ^NSEBANK (R2 = 0.5171)
✅ Best index for INDUSINDBK.NS: ^NSEBANK (R2 = 0.1013)
✅ Best i

In [24]:
# Persist the regression results: every stock/index pair, the highest-R2 index
# per stock, and the mean statistics per index.
df_results = pd.DataFrame(results)
if not df_results.empty:
    df_results.to_csv("regression_results_yfinance.csv", index=False)

    # One row per stock: the pair with the highest R2 (NaN R2 sorts last).
    # drop_duplicates keeps that row intact, whereas groupby(...).first() takes
    # the first non-null value of each column separately and could combine
    # values from different regressions when the top row contains a NaN.
    # NOTE(review): the per-stock "best index" printed by the regression loop
    # also requires |beta| <= 3 and correlation >= 0.3; this summary ranks on
    # R2 alone - confirm which rule is intended.
    summary = (
        df_results
        .sort_values(["Stock", "R2"], ascending=[True, False])
        .drop_duplicates(subset="Stock", keep="first")
        .reset_index(drop=True)
    )
    summary.to_csv("best_index_per_stock_yfinance.csv", index=False)
    print("\n✅ Summary saved to 'best_index_per_stock_yfinance.csv'")

    # 📊 Add index-level summary: mean of each statistic over all stocks,
    # ordered by mean R2 (highest first).
    index_summary = df_results.groupby("Index")[["Alpha", "Beta", "R2", "Correlation"]].mean().reset_index()
    index_summary = index_summary.sort_values("R2", ascending=False)
    index_summary.to_csv("summary_by_index.csv", index=False)
    print("📊 Index-level summary saved to 'summary_by_index.csv'")
    display(index_summary)

else:
    print("⚠️ No regression results available. Check if data downloaded properly.")



✅ Summary saved to 'best_index_per_stock_yfinance.csv'
📊 Index-level summary saved to 'summary_by_index.csv'


Unnamed: 0,Index,Alpha,Beta,R2,Correlation
5,^NSEI,-9.1e-05,1.023316,0.317274,0.554395
0,^CNXAUTO,0.000289,0.629377,0.248921,0.486135
4,^NSEBANK,-0.00046,0.781795,0.232154,0.460171
2,^CNXIT,9e-05,0.419235,0.163621,0.359152
3,^CNXPHARMA,-0.000274,0.544267,0.14855,0.359119
1,^CNXFMCG,0.000133,0.602223,0.135391,0.340324
