# RSI prediction for next price

#### Imports

In [1]:
import sys
import math
import warnings

import psycopg2
import wrds
import gzip

import seaborn as sns
import os
import quandl
import json
import zipfile
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import functools
import requests
import io

import urllib.request
from urllib.error import HTTPError
from html_table_parser.parser import HTMLTableParser
from bs4 import BeautifulSoup
import re

import plotnine as p9
from plotnine import ggplot, scale_x_date, guides, guide_legend, geom_bar, scale_y_continuous, \
    scale_color_identity, geom_line, geom_point, labs, theme_minimal, theme, element_blank, element_text, \
        geom_ribbon, geom_hline, aes, scale_size_manual, scale_color_manual, ggtitle

from datetime import datetime
import datetime

import pandas as pd
import pandas_market_calendars as mcal
from pandas.plotting import autocorrelation_plot
import numpy as np
from numpy import cumsum, log, polyfit, sqrt, std, subtract
import scipy as sp
from scipy.stats import norm
import scipy.stats as stats

from statsmodels.tsa.stattools import coint
from statsmodels.graphics.tsaplots import plot_acf
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

from collections import deque
from bisect import insort, bisect_left
from itertools import islice


### Retrieve as Variable

In [61]:
csv_file_path = 'spy_tickerdata.csv'
predictdata = pd.read_csv(csv_file_path)

In [62]:
predictdata['date'] = pd.to_datetime(predictdata['date'])
predictdata = predictdata.sort_values(by='date')

In [63]:
display(predictdata)

Unnamed: 0,ticker,date,open,high,low,close,volume,dividend,split,adj_open,adj_high,adj_low,adj_close,adj_volume
338,SPY,1993-01-29,43.9687,43.9687,43.7500,43.9375,1003200.0,0.0,1.0,24.839471,24.839471,24.715919,24.821845,1003200.0
339,SPY,1993-02-01,43.9687,44.2500,43.9687,44.2500,480500.0,0.0,1.0,24.839471,24.998387,24.839471,24.998387,480500.0
340,SPY,1993-02-02,44.2187,44.3750,44.1250,44.3437,201300.0,0.0,1.0,24.980704,25.069004,24.927770,25.051321,201300.0
341,SPY,1993-02-03,44.4062,44.8437,44.3750,44.8125,529400.0,0.0,1.0,25.086630,25.333789,25.069004,25.316163,529400.0
342,SPY,1993-02-04,44.9687,45.0937,44.4687,45.0000,531500.0,0.0,1.0,25.404406,25.475023,25.121938,25.422088,531500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,SPY,2024-02-12,501.1700,503.5000,500.2400,500.9800,55089631.0,0.0,1.0,501.170000,503.500000,500.240000,500.980000,55089631.0
300,SPY,2024-02-13,494.5300,495.8500,490.7150,494.0800,110198097.0,0.0,1.0,494.530000,495.850000,490.715000,494.080000,110198097.0
299,SPY,2024-02-14,496.7900,499.0700,494.4000,498.5700,68046116.0,0.0,1.0,496.790000,499.070000,494.400000,498.570000,68046116.0
298,SPY,2024-02-15,499.2900,502.2000,498.7950,502.0100,60114914.0,0.0,1.0,499.290000,502.200000,498.795000,502.010000,60114914.0


We'll take a few simple approaches to determine a flag to use put or call options:
* RSI
* Regression on return

In [64]:
df = predictdata
df['price_change'] = df['close'].diff()
df['gain'] = np.where(df['price_change'] > 0, df['price_change'], 0)
df['loss'] = np.where(df['price_change'] < 0, -df['price_change'], 0)
window_length = 10
df['avg_gain'] = df['gain'].rolling(window=window_length, min_periods=1).mean()
df['avg_loss'] = df['loss'].rolling(window=window_length, min_periods=1).mean()
df['RS'] = df['avg_gain'] / df['avg_loss']
df['RSI'] = 100 - (100 / (1 + df['RS']))

In [68]:
display(df)

Unnamed: 0,ticker,date,open,high,low,close,volume,dividend,split,adj_open,...,adj_low,adj_close,adj_volume,price_change,gain,loss,avg_gain,avg_loss,RS,RSI
338,SPY,1993-01-29,43.9687,43.9687,43.7500,43.9375,1003200.0,0.0,1.0,24.839471,...,24.715919,24.821845,1003200.0,,0.0000,0.00,0.00000,0.000,,
339,SPY,1993-02-01,43.9687,44.2500,43.9687,44.2500,480500.0,0.0,1.0,24.839471,...,24.839471,24.998387,480500.0,0.3125,0.3125,0.00,0.15625,0.000,inf,100.000000
340,SPY,1993-02-02,44.2187,44.3750,44.1250,44.3437,201300.0,0.0,1.0,24.980704,...,24.927770,25.051321,201300.0,0.0937,0.0937,0.00,0.13540,0.000,inf,100.000000
341,SPY,1993-02-03,44.4062,44.8437,44.3750,44.8125,529400.0,0.0,1.0,25.086630,...,25.069004,25.316163,529400.0,0.4688,0.4688,0.00,0.21875,0.000,inf,100.000000
342,SPY,1993-02-04,44.9687,45.0937,44.4687,45.0000,531500.0,0.0,1.0,25.404406,...,25.121938,25.422088,531500.0,0.1875,0.1875,0.00,0.21250,0.000,inf,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,SPY,2024-02-12,501.1700,503.5000,500.2400,500.9800,55089631.0,0.0,1.0,501.170000,...,500.240000,500.980000,55089631.0,-0.2200,0.0000,0.22,2.01200,1.041,1.932757,65.902391
300,SPY,2024-02-13,494.5300,495.8500,490.7150,494.0800,110198097.0,0.0,1.0,494.530000,...,490.715000,494.080000,110198097.0,-6.9000,0.0000,6.90,2.01200,1.693,1.188423,54.304993
299,SPY,2024-02-14,496.7900,499.0700,494.4000,498.5700,68046116.0,0.0,1.0,496.790000,...,494.400000,498.570000,68046116.0,4.4900,4.4900,0.00,2.46100,0.892,2.758969,73.396958
298,SPY,2024-02-15,499.2900,502.2000,498.7950,502.0100,60114914.0,0.0,1.0,499.290000,...,498.795000,502.010000,60114914.0,3.4400,3.4400,0.00,2.17300,0.892,2.436099,70.897227


In [75]:
df['date'] = pd.to_datetime(df['date'])
df_filtered = df[(df['date'] >= '2018-01-01') & (df['date'] <= '2023-02-28')]

df_filtered['future_close'] = df_filtered['close'].shift(-20)
df_filtered['actual_future_direction'] = np.where(df_filtered['future_close'] > df_filtered['close'], 1, -1)

window = 5
df_filtered['rsi_slope'] = df_filtered['RSI'].diff(periods=window)

df_filtered['rsi_trend_direction'] = np.where(df_filtered['rsi_slope'] > 0, 1, -1)
df_filtered['correct_prediction'] = df_filtered['rsi_trend_direction'] == df_filtered['actual_future_direction']

correct_proportion = df_filtered['correct_prediction'].mean()
print(f"Proportion of correct RSI trend-based predictions: {correct_proportion:.2f}")

Proportion of correct RSI trend-based predictions: 0.52


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

And regression for 30 days is clearly not very different

In [55]:
from sklearn.metrics import mean_squared_error

df['date'] = pd.to_datetime(df['date'])
df['returns'] = df['close'].pct_change()
df['future_returns_20d'] = df['returns'].shift(-20)
for lag in range(1, 6): 
    df[f'returns_lag_{lag * 20}'] = df['returns'].shift(lag * 20)
df_clean = df.dropna()
train_df = df_clean[(df_clean['date'] >= '2010-01-01') & (df_clean['date'] <= '2018-12-31')]

X_train = train_df[[f'returns_lag_{lag * 20}' for lag in range(1, 6)]]
y_train = train_df['future_returns_20d']
X_train = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train).fit()

test_df = df_clean[(df_clean['date'] >= '2019-01-01') & (df_clean['date'] <= '2023-02-28')].reset_index(drop=True)

X_test = test_df[[f'returns_lag_{lag * 20}' for lag in range(1, 6)]]
X_test = sm.add_constant(X_test)

y_pred = model.predict(X_test)
predicted_direction = np.where(y_pred > 0, 1, -1)

test_df['predicted_direction'] = predicted_direction

test_df['actual_future_direction'] = np.where(test_df['future_returns_20d'] > 0, 1, -1)

correct_predictions = (test_df['actual_future_direction'] == test_df['predicted_direction']).mean()

print(f"Proportion of correct direction predictions: {correct_predictions:.2f}")
mse = mean_squared_error(test_df['future_returns_20d'], y_pred)
print(f"Mean Squared Error: {mse}")
print(model.summary())

Proportion of correct direction predictions: 0.51
Mean Squared Error: 0.00019940735121004645
                            OLS Regression Results                            
Dep. Variable:     future_returns_20d   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9668
Date:                Tue, 20 Feb 2024   Prob (F-statistic):              0.437
Time:                        14:48:37   Log-Likelihood:                 7345.7
No. Observations:                2264   AIC:                        -1.468e+04
Df Residuals:                    2258   BIC:                        -1.465e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

We can test some random technical composite signals, but I expect them all to be trash

In [76]:
df['ma10'] = df['close'].rolling(window=10).mean()
df['ma20'] = df['close'].rolling(window=20).mean()

df['macd'] = df['close'].ewm(span=12, adjust=False).mean() - df['close'].ewm(span=26, adjust=False).mean()
df['signal_line'] = df['macd'].ewm(span=9, adjust=False).mean()

df['ma_signal'] = np.where(df['ma10'] > df['ma20'], 1, -1)  # 1 for Bullish, -1 for Bearish
df['macd_signal'] = np.where(df['macd'] > df['signal_line'], 1, -1)  # 1 for Bullish, -1 for Bearish

df['composite_signal'] = (df['ma_signal'] + df['macd_signal'] + np.where(df['RSI'] > 50, 1, -1)) / 3
df['directional_indicator'] = np.where(df['composite_signal'] > 0, 1, -1)

df['future_close'] = df['close'].shift(-20)
df['actual_future_direction'] = np.where(df['future_close'] > df['close'], 1, -1)

df_filtered = df[(df['date'] >= '2018-01-01') & (df['date'] <= '2023-02-28')]

df_filtered['correct_prediction'] = df_filtered['directional_indicator'] == df_filtered['actual_future_direction']
correct_proportion = df_filtered['correct_prediction'].mean()
print(f"Proportion of correct signals: {correct_proportion:.2f}")


Proportion of correct signals: 0.53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unsurprisingly they all rather suck - we'll likely just look at simulating puts and calls separately.