In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import yfinance as yf
from pandas_datareader import data as pdr
import ta

# import talib
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import traceback

# from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from concurrent.futures import ThreadPoolExecutor
import requests

# from yahoo_fin import stock_info
from bs4 import BeautifulSoup

# Retreive data

In [22]:
def save_sp500_tickers():
    """
    Scrapes S&P500 tickers from Wikipedia.

    Returns:
        pandas.Series: A series of S&P500 tickers.
    """
    df_tables = pd.read_html(
        "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    tickers = df_tables[0]["Symbol"]
    return tickers


save_sp500_tickers()

0       MMM
1       AOS
2       ABT
3      ABBV
4       ACN
       ... 
498     YUM
499    ZBRA
500     ZBH
501    ZION
502     ZTS
Name: Symbol, Length: 503, dtype: object

In [20]:
def retrieve_data(ticker):
    """
    Retrieves historical data for a given ticker.

    Args:
        ticker (str): The stock ticker.

    Returns:
        pandas.DataFrame: A dataframe with historical data for the given ticker.
    """
    start = datetime(2010, 1, 1)
    end = datetime.now() - timedelta(days=1)

    try:
        yf.pdr_override()
        df = pdr.get_data_yahoo(ticker, start, end)
        df = df[["Open", "Close", "High", "Low", "Volume"]]
        df.columns = [
            f"{ticker}_Open",
            f"{ticker}_Close",
            f"{ticker}_High",
            f"{ticker}_Low",
            f"{ticker}_Volume",
        ]

        return df

    except Exception as e:
        print(f"Could not retrieve data for {ticker}: {e}")
        return pd.DataFrame()

In [23]:
def retrieve_save_combined_data():
    """
    Retrieves historical data for all S&P500 tickers and saves the data to a CSV file.

    Returns:
        None
    """
    tickers = save_sp500_tickers()
    data = []
    with ThreadPoolExecutor(max_workers=1) as executor:
        for result in executor.map(retrieve_data, tickers):
            print(f"Retrieved data for ticker with length {len(result)}")
            data.append(result)

    combined_df = pd.concat(data, axis=1, join="outer")

    print(f"Final combined DataFrame shape: {combined_df.shape}")

    return True

# Testing grounds

In [None]:
df = pd.read_csv("fresh_data.csv", parse_dates=True, index_col=0)
ticker = "AAPL"
lag = 3

rolling_std_shifted = df[f"{ticker}_Close"].rolling(window=20).mean()

rolling_std_shifted.mean()

other_tickers = [
            col[:-6]
            for col in df.columns
            if col.endswith("_Close") and col != f"{ticker}_Close"
        ]
other_ma_lag = pd.concat(
    [
        df[f"{other_ticker}_Close"]
        .pct_change()
        .rolling(window=lag)
        .mean()
        .shift(periods=1)
        for other_ticker in other_tickers
    ],
    axis=1,
)

df[f"Market_MA_{lag}"] = other_ma_lag.mean(axis=1)

other_ma_lag


# Create X and Y for ML

In [10]:
def calculate_metrics(
    ticker, label_period=9, label_threshold=0.02, feature_lags=[3, 6, 9]
):
    """
    Calculate a variety of metrics for a given stock ticker, including moving averages, rate of change (ROC),
    on-balance volume (OBV), and the relative strength index (RSI). Also calculates lagged versions of these metrics
    based on the `lags` input.

    Args:
        ticker (str): The stock ticker to calculate metrics for.
        period (int): The period to use for calculating metrics.
        threshold (float): The threshold to use for calculating metrics.
        lags (list of int): The lag periods to use for calculating lagged metrics.
        refresh (bool, optional): Whether to refresh the data. Defaults to False.

    Returns:
        pandas.DataFrame: A DataFrame with the calculated metrics and labels.
    """

    df = pd.read_csv("fresh_data.csv", parse_dates=True, index_col=0)

    # Calculate the percentage change over the given period
    df[f"{ticker}_pct_change"] = (
        df[f"{ticker}_Close"]
        .pct_change(periods=label_period)
        .shift(periods=-label_period)
    )

    # Create a new column for the labels and initially set all labels to 0
    df[f"{ticker}_label"] = 0

    # Assign labels based on the threshold
    df.loc[df[f"{ticker}_pct_change"] > label_threshold, f"{ticker}_label"] = 1
    df.loc[df[f"{ticker}_pct_change"] < -label_threshold, f"{ticker}_label"] = -1

    # Drop the percentage change column as it's no longer needed
    df = df.dropna(subset=[f"{ticker}_pct_change"])

    metrics_df = pd.DataFrame()

    # Calculate base metrics
    rsi = ta.momentum.RSIIndicator(df[f"{ticker}_Close"], window=14).rsi()
    metrics_df[f"{ticker}_RSI"] = rsi.shift(periods=1)

    rolling_std_shifted = (
        df[f"{ticker}_Close"].rolling(window=20).std().shift(periods=1)
    )
    metrics_df[f"{ticker}_Bollinger_Up"] = (
        df[f"{ticker}_Close"].rolling(window=20).mean().shift(periods=1)
        + 2 * rolling_std_shifted
    )
    metrics_df[f"{ticker}_Bollinger_Down"] = (
        df[f"{ticker}_Close"].rolling(window=20).mean().shift(periods=1)
        - 2 * rolling_std_shifted
    )

    adx = talib.ADX(
        df[f"{ticker}_High"], df[f"{ticker}_Low"], df[f"{ticker}_Close"], timeperiod=14
    )
    metrics_df[f"{ticker}_ADX"] = adx.shift(periods=1)

    macd_line, signal_line, _ = talib.MACD(
        df[f"{ticker}_Close"], fastperiod=12, slowperiod=26, signalperiod=9
    )
    metrics_df[f"{ticker}_MACD"] = (macd_line - signal_line).shift(periods=1)

    obv = ta.volume.OnBalanceVolumeIndicator(
        df[f"{ticker}_Close"], df[f"{ticker}_Volume"]
    ).on_balance_volume()
    metrics_df[f"{ticker}_OBV"] = obv.shift(periods=1)

    # Calculate lagged metrics
    for lag in feature_lags:
        # Use shift to create lagged features, to avoid looking ahead in time
        metrics_df[f"{ticker}_Delta_ADX_{lag}"] = adx.diff(lag).shift(periods=1)

        metrics_df[f"{ticker}_MA_{lag}"] = (
            df[f"{ticker}_Close"].rolling(window=lag).mean().shift(periods=1)
        )
        metrics_df[f"{ticker}_ROC_{lag}"] = talib.ROC(
            df[f"{ticker}_Close"], timeperiod=lag
        ).shift(periods=1)
        metrics_df[f"{ticker}_OBV_ROC_{lag}"] = obv.pct_change(periods=lag).shift(
            periods=1
        )
        metrics_df[f"{ticker}_Delta_RSI_{lag}"] = rsi.diff(lag).shift(periods=1)

        other_tickers = [
            col[:-6]
            for col in df.columns
            if col.endswith("_Close") and col != f"{ticker}_Close"
        ]
        other_ma_lag = pd.concat(
            [
                df[f"{other_ticker}_Close"]
                .pct_change()
                .rolling(window=lag)
                .mean()
                .shift(periods=1)
                for other_ticker in other_tickers
            ],
            axis=1,
        )
        metrics_df[f"Market_MA_{lag}"] = other_ma_lag.mean(axis=1)

    # Handle missing values
    metrics_df = metrics_df.replace([np.inf, -np.inf], np.nan)
    metrics_df.dropna(inplace=True)

    final_df = metrics_df.join(df[f"{ticker}_label"])

    # # Validate data types
    assert set(final_df.dtypes) <= {
        np.dtype("float64"),
        np.dtype("int64"),
    }, "Unexpected data types in DataFrame"

    return final_df


calculate_metrics("AAPL")

FileNotFoundError: [Errno 2] No such file or directory: 'fresh_data.csv'

In [None]:
def X_and_Y_for_ML(ticker, label_period=9, label_threshold=0.02, feature_lags=[3, 6, 9], days_back = 30, refresh=False):
    
    if refresh:
        retrieve_save_combined_data()
    
    df = calculate_metrics(
        ticker, label_period=label_period, label_threshold=label_threshold, feature_lags=feature_lags)

    X = df.drop(columns=[f"{ticker}_label"])
    Y = df[f"{ticker}_label"].copy()

    # Use all data except last day for training, and last day for testing
    X_train, X_test, Y_train, Y_test = X[:-days_back], X[-days_back:], Y[:-days_back], Y[-days_back:]

    # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

    # X_train, X_test, Y_train, Y_test = X_train.values, X_test.values, Y_train.values, Y_test.values

    return X_train, X_test, Y_train, Y_test




# Model Selection

# Best Random Forest

In [None]:
X_train, X_test, Y_train, Y_test = X_and_Y_for_ML("AAPL", label_period=9, label_threshold=0.022, days_back=5)
X_train

In [63]:
random_forest_best_params = {
    'bootstrap': False,
    'max_depth': 30,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300
}

random_forest = Pipeline([
    ("model", RandomForestClassifier(**random_forest_best_params, random_state=1))
])
random_forest.fit(X_train, Y_train)

predictions = random_forest.predict(X_test)

accuracy = cross_val_score(random_forest, X_train, Y_train, n_jobs=-1, cv=5)

accuracy, accuracy.mean()

(array([0.41702586, 0.23706897, 0.22437972, 0.28263215, 0.4368932 ]),
 0.31959997954097386)

# Best Boosting

In [65]:
boosting_best_params = {'learning_rate': 0.2,
  'max_depth': 7,
  'n_estimators': 300}

boosting_model = Pipeline([
        ("model", GradientBoostingClassifier(**boosting_best_params, random_state=1))])


accuracy = cross_val_score(boosting_model, X_train, Y_train, n_jobs=-1, cv=5)

accuracy, accuracy.mean()

(array([0.4137931 , 0.24892241, 0.23840345, 0.29989213, 0.39805825]),
 0.3198138693598185)

# Best SVC

In [66]:
SVC_best_params = {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

SVC_model = Pipeline([
        ("trans", StandardScaler()),
        ("model", SVC(**SVC_best_params, random_state=1, probability=True))
    ])

accuracy = cross_val_score(SVC_model, X_train, Y_train, n_jobs=-1, cv=5)
accuracy, accuracy.mean()

(array([0.37823276, 0.31681034, 0.31499461, 0.30636462, 0.43257821]),
 0.34979610720529697)

# Best K nearest

In [67]:
K_best_params = {'algorithm': 'auto',
  'leaf_size': 20,
  'n_neighbors': 3,
  'weights': 'distance'}

K_model = Pipeline([
        ("trans", StandardScaler()),
        ("model", KNeighborsClassifier(**K_best_params))
    ])

accuracy = cross_val_score(K_model, X_train, Y_train, n_jobs=-1, cv=5)
accuracy, accuracy.mean()

(array([0.39978448, 0.30818966, 0.31067961, 0.29341963, 0.43473571]),
 0.349361817877469)

In [None]:
# models = {
#     # Random Forest..was here 

# #     "Gradient_boosting": Pipeline([
# #         ("model", GradientBoostingClassifier(),
# #   )]),
#     # "SVClass": Pipeline([
#     #     ("trans", StandardScaler()),
#     #     ("model", SVC())
#     # ]),
# #     "Neighbors" : Pipeline([
# #         ("trans", StandardScaler()),
# #         ("model", KNeighborsClassifier())
# #     ])
# }


In [None]:

# create an empty DataFrame to store the results
scores_df = pd.DataFrame()

# loop through the models dictionary
for model_name, model in models.items():
    # calculate the cross validation scores for the current model
    scores = cross_val_score(model, X_train, Y_train, scoring="accuracy", cv=5, n_jobs=-1)
    
    # create a temporary DataFrame to hold the scores
    temp_df = pd.DataFrame({model_name: scores})

    # add the results to the main DataFrame
    scores_df = pd.concat([scores_df, temp_df], axis=1)

# calculate the mean score for each fold
scores_df = scores_df.T  # Transpose the DataFrame so that the models are the index
scores_df.columns = [f'Fold_{i+1}_Score' for i in range(scores_df.shape[1])]  # Rename columns to Fold_1_Score, Fold_2_Score, etc.
scores_df['Mean_Score'] = scores_df.mean(axis=1)  # Calculate the mean score for each model


In [None]:
# display the DataFrame
scores_df.sort_values(by="Mean_Score", ascending=False)

# Tuning

In [None]:

# param_grid_SVClass = {
#     'model__C': [1, 10, 100],  # regularization parameter
#     'model__gamma': [1, 0.1],  # kernel coefficient
#     'model__kernel': ['rbf', 'poly', 'sigmoid']  # type of hyperplane used to separate the data
# }

# param_grid_Gradient_boosting = {
#       # loss function to be optimized
#     'model__learning_rate': [0.2, 0.1, 0.01],  # learning rate shrinks the contribution of each tree
#     'model__n_estimators': [100, 200, 300],  # the number of boosting stages to perform
#     'model__max_depth': [None, 3, 7],  # maximum depth of the individual regression estimators
# }


# param_grid_Neighbors = {
#     'model__n_neighbors': [3, 7, 15],
#     'model__weights': ['uniform', 'distance'],
#     'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'model__leaf_size': [20, 30, 40],
# }


# param_grid_Random_forest = {
#     'model__n_estimators': [100, 200, 300],
#     'model__max_depth': [None, 10, 30],
#     'model__min_samples_split': [2, 5, 10],
#     'model__min_samples_leaf': [1, 2, 4],
#     'model__bootstrap': [True, False],
# }





In [None]:
best_params = {}

for model_name, model in models.items():
    param_grid_name = f"param_grid_{model_name}"  
    param_grid = globals()[param_grid_name]  # Access the parameter grid using the variable name
    
    grid_search = GridSearchCV(model, param_grid=param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, Y_train)  # Make sure you have defined X_train and Y_train
    
    best_params[model_name] = grid_search.best_params_

for model_name, params in best_params.items():
    print(f"Best parameters for {model_name}:", params)

# Bildong final model

In [None]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[
        ('SVC', SVC_model),
        ('K_near', K_model),
        ('Random_forest', random_forest),
        ('boosting_model', boosting_model)
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5, n_jobs=-1
)

In [62]:
scores = cross_val_score(stacking_clf, X_train, Y_train, scoring='accuracy', verbose=2, n_jobs=-1, cv = 5)
print(scores)
print(scores.mean())

[CV] END .................................................... total time= 3.8min
[CV] END .................................................... total time= 3.8min
[CV] END .................................................... total time= 3.8min
[CV] END .................................................... total time= 3.8min
[CV] END .................................................... total time= 3.8min
[0.34375    0.41163793 0.399137   0.39482201 0.35167206]
0.38020379979912955


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.8min finished


In [1]:
stacking_clf.fit(X_train, Y_train)

predictions_final = stacking_clf.predict(X_test)

predictions_final

NameError: name 'stacking_clf' is not defined