In [66]:
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def williams_r(data, lookback=14):
    highh = data['High'].rolling(lookback).max()
    lowl = data['Low'].rolling(lookback).min()
    wr = -100 * ((highh - data['Close']) / (highh - lowl))
    return wr

#stock picks from Xialin's model
csv_path = '/Users/allison/Desktop/pick.csv'
#fedfunds
ffunds_path = '/Users/allison/Desktop/FEDFUNDS.csv'
#s&p 500 sectors
sector_path = '/Users/allison/Desktop/stockssp500sectors.csv'

stocks_df = pd.read_csv(csv_path)
ffunds = pd.read_csv(ffunds_path)
ffunds['DATE'] = pd.to_datetime(ffunds['DATE'])
ffunds.set_index('DATE', inplace=True)
sectors_df = pd.read_csv(sector_path)

print("Sector CSV columns:", sectors_df.columns)
print(sectors_df.head())

stock_to_sector = dict(zip(sectors_df['Stock'], sectors_df['Sector']))

results = []

for index, row in stocks_df.iterrows():
    company = row[0]
  
    if company not in stock_to_sector:
        print(f"No sector found for {company}")
        continue
    
    sector = stock_to_sector[company]

    print(f"Company: {company}, Sector: {sector}")

    company_data = yf.download(company, start='2010-01-01', end='2024-01-01', progress=False)
    sector_data = yf.download(sector, start='2010-01-01', end='2024-01-01', progress=False)

    print(f"Company data head: {company_data.head()}")
    print(f"Sector ({sector}) data head: {sector_data.head()}")

    if company_data.empty or sector_data.empty:
        print(f"Data for {company} or {sector} is empty")
        continue

    monthly_company = company_data.resample('M').mean()
    monthly_sector = sector_data.resample('M').mean()

    if 'Close' not in monthly_company.columns or 'Close' not in monthly_sector.columns:
        print(f"Close column missing in data for {company} or {sector}")
        continue

    monthly_company['Return'] = (monthly_company['Close'].shift(-21) - monthly_company['Close']) / monthly_company['Close']
    monthly_sector['Return'] = (monthly_sector['Close'].shift(-21) - monthly_sector['Close']) / monthly_sector['Close']

    company_data['Williams_%R'] = williams_r(company_data)
    monthly_company['Williams_%R'] = company_data['Williams_%R'].resample('M').last()

    data = pd.DataFrame(index=monthly_company.index)
    data['Company_Return'] = monthly_company['Return']
    data['Sector_Return'] = monthly_sector['Return']
    data['Williams_%R'] = monthly_company['Williams_%R']

    monthly_ffunds = ffunds.resample('M').mean()
    data = data.join(monthly_ffunds['FEDFUNDS'])

    data['Target'] = (data['Company_Return'] > data['Sector_Return']).astype(int)

    data.dropna(inplace=True)

    X = data[['Company_Return', 'Sector_Return', 'FEDFUNDS', 'Williams_%R']]
    y = data['Target']

    random_state = np.random.randint(0, 42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Company: {company}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

    new_data = X_test.iloc[-1:].copy()
    prediction = model.predict(new_data)
    prediction_text = f"{company} will outperform {sector}" if prediction[0] == 1 else f"{company} will underperform {sector_etf}"
    print("Prediction: ", prediction_text)
    
    results.append([company, sector, accuracy, prediction_text])
    
    print("\n" + "="*50 + "\n")

results_df = pd.DataFrame(results, columns=['Company', 'Sector', 'Accuracy', 'Prediction'])
print(results_df)

Sector CSV columns: Index(['Stock', 'Sector', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
   Stock     Sector  Unnamed: 2  Unnamed: 3  Unnamed: 4
0  GOOGL  ^SP500-50         NaN         NaN         NaN
1   GOOG  ^SP500-50         NaN         NaN         NaN
2      T  ^SP500-50         NaN         NaN         NaN
3   CHTR  ^SP500-50         NaN         NaN         NaN
4  CMCSA  ^SP500-50         NaN         NaN         NaN
Company: BRK-B, Sector: ^SP500-40
Company data head:                  Open       High        Low      Close  Adj Close   Volume
Date                                                                      
2010-01-04  66.000000  66.500000  65.919998  66.220001  66.220001  1575000
2010-01-05  66.389999  66.589996  66.150002  66.540001  66.540001  1310000
2010-01-06  66.500000  66.500000  66.199997  66.199997  66.199997  1760000
2010-01-07  66.199997  66.510002  66.139999  66.459999  66.459999  1505000
2010-01-08  66.480003  66.480003  66.300003  66.440002  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Company data head:                  Open   High        Low      Close  Adj Close    Volume
Date                                                                   
2010-01-04  27.020000  27.48  26.820000  27.320000  18.665340  39335700
2010-01-05  27.270000  28.24  27.240000  28.070000  19.177755  55416000
2010-01-06  28.030001  28.33  27.790001  28.110001  19.205078  33237000
2010-01-07  28.120001  29.43  27.920000  29.129999  19.901958  61649000
2010-01-08  28.900000  29.35  28.600000  28.860001  19.717493  35508700
Sector (^SP500-40) data head:                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2010-01-04  193.750000  198.169998  193.750000  197.789993  197.789993   
2010-01-05  197.789993  201.369995  197.759995  201.149994  201.149994   
2010-01-06  201.130005  202.600006  200.050003  202.000000  202.000000   
2010-01-07  201.970001  207.270004  201.610001  206.210007  206.210007   
201

In [67]:
outperform_results = results_df[results_df['Prediction'].str.contains("outperform")]

print(outperform_results)

   Company       Sector  Accuracy                        Prediction
0    BRK-B    ^SP500-40  0.800000   BRK-B will outperform ^SP500-40
1     EBAY  ^SP500-2550  0.900000  EBAY will outperform ^SP500-2550
2      LIN    ^SP500-15  0.933333     LIN will outperform ^SP500-15
4      ADI    ^SP500-45  0.900000     ADI will outperform ^SP500-45
5      AMD    ^SP500-45  1.000000     AMD will outperform ^SP500-45
6      AMT    ^SP500-60  0.862069     AMT will outperform ^SP500-60
7      AXP    ^SP500-40  0.933333     AXP will outperform ^SP500-40
9      BAC    ^SP500-40  0.966667     BAC will outperform ^SP500-40
10     BDX    ^SP500-35  0.933333     BDX will outperform ^SP500-35
14   CMCSA    ^SP500-50  0.933333   CMCSA will outperform ^SP500-50
15     CMG  ^SP500-2550  1.000000   CMG will outperform ^SP500-2550
18     CSX    ^SP500-20  0.933333     CSX will outperform ^SP500-20
19     CVX        ^GSPE  0.866667         CVX will outperform ^GSPE
20      DG  ^SP500-3010  0.900000    DG will out

In [68]:
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.linear_model import LinearRegression

def download_and_calculate_returns(symbols, start_date, end_date):
    returns = {}
    for symbol in symbols:
        data = yf.download(symbol, start=start_date, end=end_date, progress=False)['Adj Close']
        returns[symbol] = data.pct_change().dropna()
    return returns

# Load sector data
sector_data = pd.read_csv('/Users/allison/Desktop/sectorsymbollist.csv')
sector_symbols = sector_data['Sector'].tolist()

# Define the market symbol
market_symbol = '^GSPC'

# Define the date range
start_date = '2010-01-01'
end_date = '2024-01-01'

# Download market data
market_data = yf.download(market_symbol, start=start_date, end=end_date, progress=False)['Adj Close']
market_returns = market_data.pct_change().dropna()

# Download sector data and calculate returns
sector_returns = download_and_calculate_returns(sector_symbols, start_date, end_date)

# Prepare results storage
results = []

# Perform linear regression for each sector
for symbol in sector_symbols:
    sector_return = sector_returns[symbol].align(market_returns, join='inner')[0]
    aligned_market_return = market_returns.align(sector_return, join='inner')[0]

    # Reshape data for linear regression
    X = aligned_market_return.values.reshape(-1, 1)
    y = sector_return.values

    # Fit linear regression model
    model = LinearRegression().fit(X, y)
    
    # Store results
    results.append({
        'Sector': symbol,
        'Alpha (Intercept)': model.intercept_,
        'Beta (Coefficient)': model.coef_[0],
        'R-squared': model.score(X, y)
    })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(results_df)

         Sector  Alpha (Intercept)  Beta (Coefficient)  R-squared
0     ^SP500-50          -0.000105            0.845500   0.580923
1   ^SP500-2550           0.000262            1.009454   0.684567
2   ^SP500-3010           0.000134            0.644942   0.475344
3         ^GSPE          -0.000253            1.089208   0.474118
4     ^SP500-40          -0.000096            1.132644   0.763885
5     ^SP500-35           0.000103            0.790329   0.717870
6     ^SP500-20          -0.000009            1.017446   0.822150
7     ^SP500-15          -0.000122            1.044193   0.752600
8     ^SP500-60          -0.000040            0.917043   0.583074
9     ^SP500-45           0.000189            1.143733   0.843653
10    ^SP500-55          -0.000025            0.624470   0.374993


In [70]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

def download_and_calculate_returns(symbols, start_date, end_date):
    returns = {}
    for symbol in symbols:
        data = yf.download(symbol, start=start_date, end=end_date, progress=False)['Adj Close']
        returns[symbol] = data.pct_change().dropna()
    return returns

sector_data = pd.read_csv('/Users/allison/Desktop/sectorsymbollist.csv')
sector_symbols = sector_data['Sector'].tolist()

market_symbol = '^GSPC'

start_date = '2010-01-01'
end_date = '2024-01-01'

market_data = yf.download(market_symbol, start=start_date, end=end_date, progress=False)['Adj Close']
market_returns = market_data.pct_change().dropna()

sector_returns = download_and_calculate_returns(sector_symbols, start_date, end_date)

results = []

for symbol in sector_symbols:
    sector_return = sector_returns[symbol].align(market_returns, join='inner')[0]
    aligned_market_return = market_returns.align(sector_return, join='inner')[0]

    X = aligned_market_return.values.reshape(-1, 1)
    y = sector_return.values

    X_sm = sm.add_constant(X)
    model = sm.OLS(y, X_sm).fit()
    
    results.append({
        'Sector': symbol,
        'Alpha (Intercept)': model.params[0],
        'Beta (Coefficient)': model.params[1],
        'R-squared': model.rsquared,
        'P-value': model.pvalues[1]
    })

results_df = pd.DataFrame(results)
print(results_df)

         Sector  Alpha (Intercept)  Beta (Coefficient)  R-squared  P-value
0     ^SP500-50          -0.000105            0.845500   0.580923      0.0
1   ^SP500-2550           0.000262            1.009454   0.684567      0.0
2   ^SP500-3010           0.000134            0.644942   0.475344      0.0
3         ^GSPE          -0.000253            1.089208   0.474118      0.0
4     ^SP500-40          -0.000096            1.132644   0.763885      0.0
5     ^SP500-35           0.000103            0.790329   0.717870      0.0
6     ^SP500-20          -0.000009            1.017446   0.822150      0.0
7     ^SP500-15          -0.000122            1.044193   0.752600      0.0
8     ^SP500-60          -0.000040            0.917043   0.583074      0.0
9     ^SP500-45           0.000189            1.143733   0.843653      0.0
10    ^SP500-55          -0.000025            0.624470   0.374993      0.0
