In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.api as sm
import os


In [3]:
# Load the portfolio data
# if just running this notebook, use the following line to load the data
# sectors_path = os.path.join('..','Data', 'stockssp500sectors.csv')
input_path = os.path.join('Data', 'stockssp500sectors.csv')
input_df = pd.read_csv(input_path)

In [None]:
sp500_df = yf.download('^GSPC', start='2010-01-01')
monthly_sp500 = sp500_df.resample('M').mean()
monthly_sp500['Return'] = (monthly_sp500['Close'].shift(-21) - monthly_sp500['Close']) / monthly_sp500['Close']

results_df = pd.DataFrame(columns=['Sector', 'Intercept', 'Coefficient', 'R^2', 'P-Value'])
sectors_list= input_df['Sector'].unique()

for sector_symbol in sectors_list:
    print(f'Processing sector: {sector_symbol}')
    
    sector_df = yf.download(sector_symbol, start='2010-01-01')
    monthly_sector = sector_df.resample('M').mean()
    monthly_sector['Return'] = (monthly_sector['Close'].shift(-21) - monthly_sector['Close']) / monthly_sector['Close']

    monthly_sp500.dropna(inplace=True)
    monthly_sector.dropna(inplace=True)

    returns_df = pd.DataFrame({
        'SP500_Return': monthly_sp500['Return'],
        'Sector_Return': monthly_sector['Return']
    }).dropna()

    X = returns_df['SP500_Return'].values
    y = returns_df['Sector_Return'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_sm = sm.add_constant(X_train)  
    model = sm.OLS(y_train, X_train_sm).fit()

    y_pred = model.predict(sm.add_constant(X_test))

    r2 = model.rsquared

    # save the results to a dictionary
    result = {
        'Sector': sector_symbol,
        'Intercept': model.params[0],
        'Coefficient': model.params[1],
        'R^2': r2,
        'P-Value': model.pvalues[1]  # we only take the p-value of the coefficient, not the intercept
    }
    
    # save the results to a DataFrame
    results_df = pd.concat([results_df, pd.DataFrame([result])], ignore_index=True)

In [None]:
output_path = os.path.join('results', 'Sector_LinearRegression.csv')

results_df.to_csv(output_path, index=False)

print(f"saved to {output_path}")

In [6]:
returns_df

Unnamed: 0_level_0,SP500_Return,Sector_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-31,0.074436,0.108539
2010-02-28,0.126019,0.179620
2010-03-31,0.079228,0.180267
2010-04-30,0.086244,0.155758
2010-05-31,0.202145,0.188811
...,...,...
2022-08-31,0.258903,-0.064711
2022-09-30,0.406340,-0.050097
2022-10-31,0.486293,0.083703
2022-11-30,0.398400,0.091147
