In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

In [4]:
#stock picks from Xialin's model
csv_path = os.path.join('results', 'ML_result.csv')
#fedfunds
ffunds_path = os.path.join('Data', 'FEDFUNDS.csv')
#s&p 500 sectors
sector_path = os.path.join('Data', 'stockssp500sectors.csv')

sector_data_path = os.path.join('results', 'Sector_LinearRegression.csv')

In [None]:
def williams_r(data, lookback=14):
    highh = data['High'].rolling(lookback).max()
    lowl = data['Low'].rolling(lookback).min()
    wr = -100 * ((highh - data['Close']) / (highh - lowl))
    return wr

stocks_df = pd.read_csv(csv_path)
ffunds = pd.read_csv(ffunds_path)
ffunds['DATE'] = pd.to_datetime(ffunds['DATE'])
ffunds.set_index('DATE', inplace=True)
sectors_df = pd.read_csv(sector_path)

print("Sector CSV columns:", sectors_df.columns)
print(sectors_df.head())

stock_to_sector = dict(zip(sectors_df['Stock'], sectors_df['Sector']))

results = []

for index, row in stocks_df.iterrows():
    company = row[0]
  
    if company not in stock_to_sector:
        print(f"No sector found for {company}")
        continue
    
    sector = stock_to_sector[company]

    print(f"Company: {company}, Sector: {sector}")

    company_data = yf.download(company, start='2010-01-01', end='2024-01-01', progress=False)
    sector_data = yf.download(sector, start='2010-01-01', end='2024-01-01', progress=False)

    print(f"Company data head: {company_data.head()}")
    print(f"Sector ({sector}) data head: {sector_data.head()}")

    if company_data.empty or sector_data.empty:
        print(f"Data for {company} or {sector} is empty")
        continue

    monthly_company = company_data.resample('M').mean()
    monthly_sector = sector_data.resample('M').mean()

    if 'Close' not in monthly_company.columns or 'Close' not in monthly_sector.columns:
        print(f"Close column missing in data for {company} or {sector}")
        continue

    monthly_company['Return'] = (monthly_company['Close'].shift(-21) - monthly_company['Close']) / monthly_company['Close']
    monthly_sector['Return'] = (monthly_sector['Close'].shift(-21) - monthly_sector['Close']) / monthly_sector['Close']

    company_data['Williams_%R'] = williams_r(company_data)
    monthly_company['Williams_%R'] = company_data['Williams_%R'].resample('M').last()

    data = pd.DataFrame(index=monthly_company.index)
    data['Company_Return'] = monthly_company['Return']
    data['Sector_Return'] = monthly_sector['Return']
    data['Williams_%R'] = monthly_company['Williams_%R']

    monthly_ffunds = ffunds.resample('M').mean()
    data = data.join(monthly_ffunds['FEDFUNDS'])

    data['Target'] = (data['Company_Return'] > data['Sector_Return']).astype(int)

    data.dropna(inplace=True)

    X = data[['Company_Return', 'Sector_Return', 'FEDFUNDS', 'Williams_%R']]
    y = data['Target']

    random_state = np.random.randint(0, 42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Company: {company}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

    new_data = X_test.iloc[-1:].copy()
    prediction = model.predict(new_data)
    prediction_text = f"{company} will outperform {sector}" if prediction[0] == 1 else f"{company} will underperform {sector}"
    print("Prediction: ", prediction_text)
    
    # 获取公司的beta值
    company_info = yf.Ticker(company)
    beta = company_info.info.get('beta', None)
    
    results.append([company, sector, accuracy, prediction_text, beta])
    
    print("\n" + "="*50 + "\n")

results_df = pd.DataFrame(results, columns=['Company', 'Sector', 'Accuracy', 'Prediction', 'Beta'])
print(results_df)

In [67]:
outperform_results = results_df[results_df['Prediction'].str.contains("outperform")]

print(outperform_results)

   Company       Sector  Accuracy                        Prediction
0    BRK-B    ^SP500-40  0.800000   BRK-B will outperform ^SP500-40
1     EBAY  ^SP500-2550  0.900000  EBAY will outperform ^SP500-2550
2      LIN    ^SP500-15  0.933333     LIN will outperform ^SP500-15
4      ADI    ^SP500-45  0.900000     ADI will outperform ^SP500-45
5      AMD    ^SP500-45  1.000000     AMD will outperform ^SP500-45
6      AMT    ^SP500-60  0.862069     AMT will outperform ^SP500-60
7      AXP    ^SP500-40  0.933333     AXP will outperform ^SP500-40
9      BAC    ^SP500-40  0.966667     BAC will outperform ^SP500-40
10     BDX    ^SP500-35  0.933333     BDX will outperform ^SP500-35
14   CMCSA    ^SP500-50  0.933333   CMCSA will outperform ^SP500-50
15     CMG  ^SP500-2550  1.000000   CMG will outperform ^SP500-2550
18     CSX    ^SP500-20  0.933333     CSX will outperform ^SP500-20
19     CVX        ^GSPE  0.866667         CVX will outperform ^GSPE
20      DG  ^SP500-3010  0.900000    DG will out

In [None]:

df = outperform_results[['Company', 'Beta', 'Sector']]
sectors_data = pd.read_csv(sector_data_path)
df = df.merge(sectors_data[['Sector', 'Coefficient']], left_on='Sector', right_on='Sector', how='left')

df = df.rename(columns={'Company': 'Stock'})

In [None]:
output_path = os.path.join('results', 'portfolio_pick.csv')

results_df.to_csv(output_path, index=False)

print(f"saved to {output_path}")