In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Plan: 
# Use 10 tickers minute data (S&P 500 stocks chosen arbitrarily)
# Use 5 technical indicators (SMA, EMA, MACD, RSI, Stochastic Oscillator)
# Feed data into ARIMA model, being sure to use only first 2000 minutes of data
# Use ARIMA model to predict final 400 minutes of data
# buy best predicted stock, short worst predicted stock

Downloading Data

In [3]:
# 10 tickers randomly selected
tickers = ['AAPL', 'MSFT', 'TSLA', 'GME', 'NVDA', 'KO', 'AMZN', 'META', 'DIS', 'GOOG']

data = yf.download(tickers, period='7d', interval='1m', group_by='ticker')



[*********************100%%**********************]  10 of 10 completed


Unnamed: 0_level_0,MSFT,MSFT,MSFT,MSFT,MSFT,MSFT,TSLA,TSLA,TSLA,TSLA,...,META,META,META,META,GME,GME,GME,GME,GME,GME
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-11-30 09:30:00-05:00,378.489990,379.230011,377.654999,379.225006,379.225006,705278.0,245.139999,245.220001,243.160004,243.449997,...,331.239990,331.480011,331.480011,1503235,16.290001,16.3300,16.0700,16.110001,16.110001,867996
2023-11-30 09:31:00-05:00,379.234009,380.089996,379.190002,379.899994,379.899994,133943.0,243.398605,243.679993,242.270004,242.548599,...,331.480011,332.890015,332.890015,99114,16.110001,16.1700,15.8600,15.935800,15.935800,153389
2023-11-30 09:32:00-05:00,379.910004,380.059998,379.519989,379.679993,379.679993,104588.0,242.520004,242.949997,241.759995,242.174896,...,332.774994,332.839996,332.839996,81865,15.930000,15.9401,15.6200,15.706600,15.706600,216174
2023-11-30 09:33:00-05:00,379.670013,379.899994,379.299988,379.519989,379.519989,151819.0,242.199799,242.580002,241.660004,241.699997,...,332.779999,333.070007,333.070007,51902,15.699400,15.8400,15.6300,15.830000,15.830000,142621
2023-11-30 09:34:00-05:00,379.540009,379.559998,378.858704,378.970001,378.970001,109010.0,241.695099,242.600006,241.690002,242.535400,...,332.480011,332.533295,332.533295,34254,15.820000,15.8464,15.7400,15.794900,15.794900,101610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-08 12:35:00-05:00,372.589996,372.645203,372.500000,372.500793,372.500793,19417.0,241.059998,241.110001,240.746506,240.869904,...,329.950104,330.029999,330.029999,14451,15.865000,15.8700,15.8500,15.855000,15.855000,17995
2023-12-08 12:36:00-05:00,372.579987,372.589996,372.299988,372.309998,372.309998,25292.0,240.877197,240.937500,240.690002,240.710007,...,329.940002,329.947205,329.947205,9662,15.855100,15.9100,15.8551,15.900200,15.900200,8853
2023-12-08 12:37:00-05:00,372.320007,372.489899,372.320007,372.420013,372.420013,18069.0,240.755005,241.049896,240.733597,241.048996,...,329.886993,329.970001,329.970001,8119,15.905000,15.9400,15.9050,15.940000,15.940000,3879
2023-12-08 12:38:00-05:00,372.464996,372.809998,372.440002,372.809998,372.809998,21723.0,241.039993,241.169998,241.009995,241.110001,...,329.989990,330.200012,330.200012,16060,15.940000,15.9400,15.9200,15.925000,15.925000,7034


Technical indicator functions

In [4]:
def RSI(df, ticker):
    df = df.copy()  # create a copy of the dataframe to avoid SettingWithCopyWarning
    df.loc[:, 'change'] = df['Adj Close'].diff()
    df.loc[:, 'gain'] = np.where(df['change']>=0, df['change'], 0)
    df.loc[:, 'loss'] = np.where(df['change']<0, abs(df['change']), 0)
    df.loc[:, 'avg_gain'] = df['gain'].ewm(com=13, min_periods=14).mean()
    df.loc[:, 'avg_loss'] = df['loss'].ewm(com=13, min_periods=14).mean()
    df.loc[:, 'rs'] = df['avg_gain']/df['avg_loss']
    df.loc[:, 'rsi'] = 100 - (100/(1+df['rs']))
    return df

def SMA(df, ticker):
    df = df.copy()
    df.loc[:, 'SMA'] = df['Adj Close'].rolling(20).mean()
    return df

def StochasticOscillator(df, ticker):
    df = df.copy()
    df.loc[:, '14-high'] = df['High'].rolling(14).max()
    df.loc[:, '14-low'] = df['Low'].rolling(14).min()
    df.loc[:, '%K'] = (df['Adj Close'] - df['14-low'])*100/(df['14-high'] - df['14-low'])
    df.loc[:, '%D'] = df['%K'].rolling(3).mean()
    return df

def MACD(df, ticker):
    df = df.copy()
    df.loc[:, '12-ema'] = df['Adj Close'].ewm(span=12, adjust=False).mean()
    df.loc[:, '26-ema'] = df['Adj Close'].ewm(span=26, adjust=False).mean()
    df.loc[:, 'MACD'] = df['12-ema'] - df['26-ema']
    df.loc[:, 'signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    return df

def BolliBands(df, ticker):
    df = df.copy()
    df.loc[:, 'SMA'] = df['Adj Close'].rolling(20).mean()
    df.loc[:, 'std'] = df['Adj Close'].rolling(20).std()
    df.loc[:, 'upper'] = df['SMA'] + 2*df['std']
    df.loc[:, 'lower'] = df['SMA'] - 2*df['std']
    return df

Applying functions 

In [5]:
rsi_data = {}

for ticker in tickers:
    rsi = RSI(data[ticker], ticker)
    rsi.columns = [f"{ticker}_{col}" for col in rsi.columns]
    rsi_data[ticker] = rsi
combined_rsi = pd.concat(rsi_data.values(), axis=1)
combined_rsi = combined_rsi.filter(regex='_rsi$')

SMA_data = {}

for ticker in tickers:
    sma = SMA(data[ticker], ticker)
    sma.columns = [f"{ticker}_{col}" for col in sma.columns]
    SMA_data[ticker] = sma
combined_sma = pd.concat(SMA_data.values(), axis=1)
combined_sma = combined_sma.filter(regex='_SMA$')

stochastic_data = {}

for ticker in tickers:
    stochastic = StochasticOscillator(data[ticker], ticker)
    stochastic.columns = [f"{ticker}_{col}" for col in stochastic.columns]
    stochastic_data[ticker] = stochastic
combined_stochastic = pd.concat(stochastic_data.values(), axis=1)
combined_stochastic = combined_stochastic.filter(regex='%K$')

MACD_data = {}

for ticker in tickers:
    macd_df = MACD(data[ticker], ticker)
    macd_df.columns = [f"{ticker}_{col}" for col in macd_df.columns]
    MACD_data[ticker] = macd_df

combined_MACD = pd.concat(MACD_data.values(), axis=1)
combined_MACD = combined_MACD.filter(regex='MACD$')

BolliBands_data = {}

for ticker in tickers:
    bolli_df = BolliBands(data[ticker], ticker)
    bolli_df.columns = [f"{ticker}_{col}" for col in bolli_df.columns]
    BolliBands_data[ticker] = bolli_df
    
combined_BolliBands = pd.concat(BolliBands_data.values(), axis=1)
combined_BolliBands = combined_BolliBands.filter(regex='upper$')

# combine all indicators into one dataframe on Datetime
combined = pd.concat([combined_rsi, combined_sma, combined_stochastic, combined_MACD, combined_BolliBands], axis=1)
combined = combined.reset_index()

# add the adj close and volume columns of each ticker to the combined dataframe with the prefix Ticker_
for ticker in tickers:
    combined = pd.merge(combined, data[ticker][['Adj Close', 'Volume']].reset_index().rename(columns={'Adj Close': f'{ticker}_Adj Close', 'Volume': f'{ticker}_Volume'}), on='Datetime', how='left')
combined = combined.set_index('Datetime')
combined = combined.dropna()

Using ARIMA model for prediction

In [6]:
def predict_last_minute(ticker):
    train_data = combined[f'{ticker}_Adj Close'].iloc[:2000]
    test_data = combined[f'{ticker}_Adj Close'].iloc[2000:]
    model = ARIMA(train_data, order=(10, 2, 5))
    model_fit = model.fit()
    prediction = model_fit.forecast(steps=len(test_data))

    return prediction

# make predictions for each ticker
all_predictions = {}
for ticker in tickers:
    features = combined.drop(columns=[f'{ticker}_Adj Close', f'{ticker}_Volume'])
    target = combined[f'{ticker}_Adj Close']
    train_features = features.iloc[:2000]
    train_target = target.iloc[:2000]
    test_features = features.iloc[2000:]
    test_target = target.iloc[2000:]
    model = ARIMA(train_target, order=(10, 2, 5))
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=len(test_features))
    all_predictions[ticker] = predictions


# Calculate the percentage change from the first prediction to the last for each ticker
percentage_changes = {}
for ticker in tickers:
    try:
        predictions = all_predictions[ticker]
        if len(predictions) > 0:
            first_prediction = predictions.iloc[0]  
            last_prediction = predictions.iloc[-1]

            percentage_change = ((last_prediction - first_prediction) / first_prediction) * 100
            percentage_changes[ticker] = percentage_change
        else:
            print(f'No predictions for {ticker}')
    except Exception as e:
        print(f'Error calculating percentage change for {ticker}: {e}')

highest_change_ticker = max(percentage_changes, key=percentage_changes.get)
lowest_change_ticker = min(percentage_changes, key=percentage_changes.get)
print(f'Ticker with highest predicted change: {highest_change_ticker} ({percentage_changes[highest_change_ticker]}%)')
print(f'Ticker with lowest predicted change: {lowest_change_ticker} ({percentage_changes[lowest_change_ticker]}%)')

Ticker with highest predicted change: GOOG (3.3007626791481477%)
Ticker with lowest predicted change: MSFT (-0.5499362528624305%)


Results

In [7]:
# buy the stock with the highest predicted change and short the stock with the lowest predicted change
# hold for 6 hours until close

buy_start = combined[f'{highest_change_ticker}_Adj Close'].iloc[2000]
buy_end = combined[f'{highest_change_ticker}_Adj Close'].iloc[-1]
short_start = combined[f'{lowest_change_ticker}_Adj Close'].iloc[2000]
short_end = combined[f'{lowest_change_ticker}_Adj Close'].iloc[-1]
buy_profit = ((buy_end - buy_start) / buy_start) * 100
short_profit = ((short_start - short_end) / short_start) * 100

print(f'Profit from buying {highest_change_ticker}: {buy_profit:.2f}%')
print(f'Profit from shorting {lowest_change_ticker}: {short_profit:.2f}%')


# performance metrics

# sharpe
buy_returns = combined[f'{highest_change_ticker}_Adj Close'].pct_change().iloc[2000:]
short_returns = combined[f'{lowest_change_ticker}_Adj Close'].pct_change().iloc[2000:]
buy_sharpe = buy_returns.mean() / buy_returns.std()
short_sharpe = short_returns.mean() / short_returns.std()
print(f'Sharpe ratio for buying {highest_change_ticker}: {buy_sharpe:.2f}')
print(f'Sharpe ratio for shorting {lowest_change_ticker}: {short_sharpe:.2f}')

# portfolio turnover ratio
total_purchases = buy_returns[buy_returns > 0].sum()
total_sales = short_returns[short_returns < 0].sum()
initial_portfolio_value = buy_start + short_start
final_portfolio_value = buy_end + short_end
average_portfolio_assets = (initial_portfolio_value + final_portfolio_value) / 2
portfolio_turnover_ratio = (total_purchases + abs(total_sales)) / average_portfolio_assets * 100
print(f'Portfolio turnover ratio: {portfolio_turnover_ratio:.2f}')


# Conclusion: the strategy was not profitable in this very short time period
# maybe more data is needed to train the model, or this model is not suitable for this problem
# the net sharpe ratio for buying the stock with the highest predicted change resulted in a negative sharpe ratio
# but the sharpe ratio for shorting the stock with the lowest predicted change resulted in a positive sharpe ratio
# this means there might be some value in this strategy, but more data is needed to confirm this
# but the sharpe values are so low that there likely is no value in this strategy
# the portfolio turnover is obviously very low because ther is only one buy and one sell
# the portfolio turnover ratio is not a good indicator for this strategy


Profit from buying GOOG: -1.39%
Profit from shorting MSFT: -1.08%
Sharpe ratio for buying GOOG: -0.04
Sharpe ratio for shorting MSFT: 0.05
Portfolio turnover ratio: 0.04
