<a href="https://colab.research.google.com/github/Niteshjai/Project/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import statsmodels.api as sm
from scipy.stats import zscore

In [121]:
def collect_data(tickers,market_data,stock_data, period):
  all_data = {}

  market_returns = market_data['^GSPC'].pct_change().dropna()
  beta_dt = pd.DataFrame(columns=tickers)
  size_dt= pd.DataFrame(columns=tickers)
  momentum_dt = pd.DataFrame(columns=tickers)
  pe_ratio_dt= pd.DataFrame(columns=tickers)
  volatility_dt= pd.DataFrame(columns=tickers)

  for ticker in tickers:
      # Handle potential KeyError if a ticker's data is missing
      if ticker in stock_data.columns:
        stock_returns = stock_data[ticker].pct_change().dropna()

        # Align returns Series by date
        returns= pd.DataFrame({
            'Stocks': stock_returns,
            'market': market_returns
        }).dropna()


        # beta=cov(stock,market)/var(market)
        try:
          rolling_window = 60  # You can change to 90, 126 etc.
          beta = returns['Stocks'].rolling(rolling_window).cov(returns['market']) / \
                returns['market'].rolling(rolling_window).var()
        except:
          beta = pd.Series(index=returns.index, data=np.nan)

        # Size=log(market cap)=log(outstanding share * price)
        try:
          info = yf.Ticker(ticker).info
          outstanding_share = info['sharesOutstanding']
          price = stock_data[ticker]
          if outstanding_share is not None and price is not None:
            size = np.log(outstanding_share * price)
          else:
            size = np.nan
        except:
          size = pd.Series(index=returns.index,data=np.nan)

        # --- Momentum (Price 1M ago to 12M ago) ---
        try:
            momentum = (stock_data[ticker].shift(21) / stock_data[ticker].shift(252)) - 1
        except:
            momentum = np.nan

        # P/E ratio
        try:
          info=yf.Ticker(ticker).info
          pe_ratio=info['trailingPE']
        except:
          pe_ratio=np.nan


        # --- Volatility (21-day rolling std of returns) ---
        try:
            volatility = returns['Stocks'].rolling(21).std()
        except:
            volatility = np.nan

        all_data[ticker] =pd.DataFrame({
                    'returns': returns['Stocks'],
                    'beta': beta,
                    'size': size,
                    'momentum': momentum,
                    'pe_ratio': pe_ratio,
                    'volatility': volatility
        })

        beta_dt[ticker]=(beta)
        size_dt[ticker]=(size)
        momentum_dt[ticker]=(momentum)
        pe_ratio_dt[ticker]=(pe_ratio)
        volatility_dt[ticker]=(volatility)

        all_data[ticker].dropna(inplace=True)

      else:
         all_data[ticker] = {
              'returns': np.nan,
              'beta': np.nan,
              'size': np.nan,
              'momentum': np.nan,
              'pe_ratio': np.nan,
              'volatility': np.nan
          }
  # Apply z-score normalization across all tickers (i.e., column-wise)
  beta_dt = beta_dt.T.apply(zscore, axis=1).T.dropna()
  size_dt = size_dt.T.apply(zscore, axis=1).T.dropna()
  momentum_dt = momentum_dt.T.apply(zscore, axis=1).T.dropna()
  pe_ratio_dt = pe_ratio_dt.T.apply(zscore, axis=1).T.dropna()
  volatility_dt = volatility_dt.T.apply(zscore, axis=1).T.dropna()

  # Now update all_data[ticker] DataFrames with normalized values
  for ticker in tickers:
    if ticker in all_data and isinstance(all_data[ticker], pd.DataFrame):
        if ticker in beta_dt.columns:
            all_data[ticker]['beta'] = beta_dt[ticker].reindex(all_data[ticker].index)
        if ticker in size_dt.columns:
            all_data[ticker]['size'] = size_dt[ticker].reindex(all_data[ticker].index)
        if ticker in momentum_dt.columns:
            all_data[ticker]['momentum'] = momentum_dt[ticker].reindex(all_data[ticker].index)
        if ticker in pe_ratio_dt.columns:
            all_data[ticker]['pe_ratio'] = pe_ratio_dt[ticker].reindex(all_data[ticker].index)
        if ticker in volatility_dt.columns:
            all_data[ticker]['volatility'] = volatility_dt[ticker].reindex(all_data[ticker].index)

  return all_data,beta_dt

In [70]:
tickers = [
    "AAPL", "MSFT", "GOOGL", "GOOG", "AMZN", "META", "NVDA", "TSLA", "BRK-B", "UNH",
    "JNJ", "V", "XOM", "JPM", "PG", "LLY", "MA", "HD", "CVX", "MRK",
    "PEP", "ABBV", "AVGO", "COST", "KO", "WMT", "MCD", "BAC", "ADBE", "CSCO",
    "PFE", "CRM", "ACN", "INTC", "TMO", "VZ", "ABT", "NFLX", "NKE", "DHR",
    "ORCL", "LIN", "TXN", "NEE", "AMGN", "UPS", "MS", "QCOM", "PM", "BMY",
    "IBM", "AMAT", "SBUX", "RTX", "CAT", "MDT", "HON", "GE", "GS", "LOW",
    "CVS", "INTU", "UNP", "PLD", "DE", "NOW", "SPGI", "ISRG", "MDLZ", "ADP",
    "LRCX", "BKNG", "SYK", "BLK", "CI", "T", "ZTS", "SCHW", "EL", "GILD",
    "MU", "ADI", "MO", "MMC", "FI", "PNC", "BDX", "ICE", "SO", "EW",
    "USB", "C", "APD", "CL", "ITW", "ETN", "FDX", "ADSK", "CSX", "AON"
]
period='5Y'
market='^GSPC'
market_data = yf.download(market, period=period, interval='1d')['Close']
stock_data = yf.download(tickers, period=period, interval='1d')['Close']



  market_data = yf.download(market, period=period, interval='1d')['Close']
[*********************100%***********************]  1 of 1 completed
  stock_data = yf.download(tickers, period=period, interval='1d')['Close']
[*********************100%***********************]  100 of 100 completed


In [None]:
all_data,beta_dt=collect_data(tickers,market_data,stock_data, period)

In [None]:

beta_dt.T.head()

In [111]:
print(all_data['AAPL'])

             returns      beta       size  momentum   pe_ratio  volatility
Date                                                                      
2021-07-01  0.092318  1.416426  28.327363  0.383128  32.417446    0.009780
2021-07-02  1.054130  1.422136  28.346771  0.366317  32.417446    0.009529
2021-07-06  0.783419  1.380355  28.361382  0.356033  32.417446    0.009278
2021-07-07  0.963043  1.386748  28.379178  0.360362  32.417446    0.009544
2021-07-08 -0.543747  1.362225  28.369935  0.338270  32.417446    0.010151
...              ...       ...        ...       ...        ...         ...
2025-06-25  0.315785  1.499056  28.733114 -0.054493  32.417446    0.011699
2025-06-26 -0.187428  1.493308  28.730332 -0.033601  32.417446    0.010382
2025-06-27 -0.011179  1.492464  28.730730 -0.036891  32.417446    0.010381
2025-06-30  1.095369  1.495726  28.750866 -0.057983  32.417446    0.011246
2025-07-01  0.737516  1.449739  28.764661 -0.057500  32.417446    0.011562

[1004 rows x 6 columns]


In [114]:
from sklearn.covariance import LedoitWolf

class FactorModel:
  def __init__(R,X,self):
    self.R=R
    self.X=X

    self.f = np.linalg.inv(self.X.T @ self.X) @ self.X.T @ self.R

  def compute_factor_covariance(self):
    T = self.f.shape[1]
    return (self.f@ self.f.T) / (T - 1)

  def compute_idiosyncratic_variance(self):
    eps = self.R - self.X @ self.f  # shape (N x T)
    eps_cov_diag = (eps@ eps.T) / (self.R.shape[1] - 1)  # N-vector
    return np.diag(np.diag(eps_cov_diag))

  def compute_total_risk_model(self, Sigma_f, D):
    return self.X @ Sigma_f @ self.X.T + D


  def shrink_covariance(self):
      lw = LedoitWolf()
      return lw.fit(self.f).covariance_  # shape (K x K)

R=stock_data.pct_change().T.dropna(inplace=True)
X=pd.DataFrame(columns=['beta','size','momentum','pe_ratio','volatility'])
for ticker in tickers:
    try:
        row = {
            'beta': all_data[ticker]['beta'],
            'size': all_data[ticker]['size'],
            'momentum': all_data[ticker]['momentum'],
            'pe_ratio': all_data[ticker]['pe_ratio'],
            'volatility': all_data[ticker]['volatility']
        }
        X.loc[ticker] = row  # Add a row with index = ticker
    except:
        continue  # Skip tickers with missing or invalid data

X.dropna(inplace=True)


factor=FactorModel(R,X)
shrink=False
Sigma_f = factor.shrink_covariance() if shrink else factor.compute_factor_covariance()
D = factor.compute_idiosyncratic_variance()
Sigma = factor.compute_total_risk_model(Sigma_f, D)


AttributeError: 'NoneType' object has no attribute 'T'

In [51]:
R.shape

(100, 251)