In [21]:
import os
import glob
import numpy as np
import pandas as pd
import requests
import time
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error



# for visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import plotly.graph_objects as go
import seaborn as sns


In [22]:
# Location of the test-set CSV. Set the SPX_TEST_CSV environment variable to
# point at a copy elsewhere; when it is unset the original local path is used.
spx_test = os.environ.get("SPX_TEST_CSV", r"C:\Users\Nagham\Investor\Data\Test\spx_d_test2.csv")




def read_csv_file(file_path: str, delimiter: str = '\t') -> "pd.DataFrame | None":
    r"""
    Load a delimited text file into a pandas DataFrame.

    Parameters:
        file_path (str): Path of the file to read.
        delimiter (str): Field separator used in the file. Default is '\t' (tab).

    Returns:
        pandas.DataFrame | None: The parsed table, or None when reading fails.
        Failures are printed rather than raised, so callers must check for
        None before using the result.
    """
    try:
        return pd.read_csv(file_path, delimiter=delimiter)
    except Exception as e:
        # Best-effort contract: report the problem and signal it with None.
        print(f"Error reading TXT file: {e}")
        return None
    
    
# Extract the test data; the file is comma-separated, so the tab default is overridden.
df_spx_t = read_csv_file(spx_test, ",")

# read_csv_file signals a failed read by returning None. Stop here with a clear
# message instead of failing on the attribute access below.
if df_spx_t is None:
    raise RuntimeError(f"Could not load test data from {spx_test!r}")

df_spx_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    333 non-null    object 
 1   Open    333 non-null    float64
 2   High    333 non-null    float64
 3   Low     333 non-null    float64
 4   Close   333 non-null    float64
 5   Volume  333 non-null    int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 15.7+ KB


In [23]:
# Normalise the header names: trim any leading/trailing '<' or '>' characters
# and convert the remainder to upper case.
new_column_names = dict(
    zip(df_spx_t.columns, (name.strip('<>').upper() for name in df_spx_t.columns))
)
df_spx_t = df_spx_t.rename(columns=new_column_names)

In [24]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df_t = df_spx_t.copy(deep=True)

In [25]:
# Discrete (simple) return: change in CLOSE relative to the previous row's CLOSE.
# The first row has no predecessor, so its return is NaN.
previous_close = df_t['CLOSE'].shift(1)
df_t['discrete_return'] = df_t['CLOSE'].sub(previous_close).div(previous_close)
# Alternative left unused by the author: log returns on OPEN,
# np.log(OPEN / OPEN.shift(1)), noted as a way to avoid look-ahead bias.

print(df_t.describe())
df_t.head()

              OPEN         HIGH          LOW        CLOSE        VOLUME  \
count   333.000000   333.000000   333.000000   333.000000  3.330000e+02   
mean   4390.580030  4412.673063  4369.318048  4393.036426  2.418380e+09   
std     368.720803   365.995689   372.849981   369.990757  5.166757e+08   
min    3805.450000  3838.240000  3764.490000  3783.220000  8.917203e+08   
25%    4105.350000  4132.960000  4086.940000  4109.310000  2.165980e+09   
50%    4360.490000  4383.330000  4341.340000  4369.010000  2.325178e+09   
75%    4565.750000  4587.640000  4552.800000  4567.800000  2.573674e+09   
max    5256.200000  5264.850000  5245.820000  5254.350000  5.743241e+09   

       discrete_return  
count       332.000000  
mean          0.000793  
std           0.008273  
min          -0.024922  
25%          -0.004071  
50%           0.000704  
75%           0.006274  
max           0.022841  


Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOLUME,discrete_return
0,2022-12-01,4087.14,4100.51,4050.87,4076.57,2316143021,
1,2022-12-02,4040.17,4080.48,4026.63,4071.7,2073993715,-0.001195
2,2022-12-05,4052.02,4052.45,3984.49,3998.84,2283039476,-0.017894
3,2022-12-06,3996.63,4001.51,3918.39,3941.26,2363290230,-0.014399
4,2022-12-07,3933.28,3957.57,3922.68,3933.92,2261360347,-0.001862


In [26]:
#Weighted Moving Average
def calculate_wma(data, window):
    """
    Linearly weighted moving average over a rolling window.

    Inside each window the oldest value is weighted 1 and the newest is
    weighted ``window``; the weighted sum is divided by the total weight.

    Parameters:
        data (pandas.Series): Values to average.
        window (int): Number of observations in each window.

    Returns:
        pandas.Series: Weighted averages on the same index as ``data``. The
        first ``window - 1`` entries are NaN because their window is incomplete.
    """
    weights = np.arange(1, window + 1)
    # The normaliser is identical for every window, so compute it once instead
    # of on every rolling step.
    total_weight = weights.sum()
    wma = data.rolling(window=window).apply(
        lambda prices: np.dot(prices, weights) / total_weight, raw=True
    )
    return wma

In [27]:
# Look-back length (in rows) for the weighted moving average
window_size = 15

# Weighted moving average of the closing price
df_t['WMA'] = calculate_wma(df_t['CLOSE'], window=window_size)

# Signal: distance of the close above (+) or below (-) its weighted average
df_t['WMA_signal'] = df_t['CLOSE'].sub(df_t['WMA'])

In [28]:
# Relative Strength Index (RSI)
# NOTE(review): the averages below are plain rolling means of gains and losses.
# The textbook RSI uses Wilder's smoothing instead - confirm which is intended.
# NOTE(review): min_periods=1 produces values from partially filled windows in
# the first rows.
window_size = 14

# Row-over-row change in the closing price (the first row is NaN)
Price_Change = df_t['CLOSE'].diff()

# Split the changes into gains and losses. NaN compares False in both tests,
# so the first row contributes 0 to each.
Gain = np.where(Price_Change > 0, Price_Change, 0)
Loss = np.where(Price_Change < 0, abs(Price_Change), 0)

# Average gain and loss over the look-back window. np.where returns bare arrays,
# so the Series are rebuilt on the price index; this keeps the assignment into
# df_t aligned even if df_t does not carry a default 0..n-1 index.
Avg_Gain = pd.Series(Gain, index=Price_Change.index).rolling(window=window_size, min_periods=1).mean()
Avg_Loss = pd.Series(Loss, index=Price_Change.index).rolling(window=window_size, min_periods=1).mean()

# Relative Strength (RS): ratio of average gain to average loss
RS = Avg_Gain / Avg_Loss

# RSI on a 0-100 scale
df_t['RSI'] = 100 - (100 / (1 + RS))

In [29]:
# Look-back length (in rows) for Williams %R
window_size = 14

# Rolling extremes: highest HIGH and lowest LOW inside the window
Highest_High = df_t['HIGH'].rolling(window_size).max()
Lowest_Low = df_t['LOW'].rolling(window_size).min()

# Williams %R: position of the close within the window's high-low range,
# expressed on a 0 to -100 scale
df_t['WPR'] = Highest_High.sub(df_t['CLOSE']).div(Highest_High.sub(Lowest_Low)).mul(-100)


In [30]:
def calculate_bollinger_bands(df, window=20, num_std_dev=2, column='Typical Price'):
    """
    Compute the upper and lower Bollinger Bands of one DataFrame column.

    Parameters:
        df (pandas.DataFrame): Frame containing ``column``.
        window (int): Rolling window length. Default 20 (the typical choice).
        num_std_dev (float): Band half-width in rolling standard deviations.
            Default 2 (the typical choice).
        column (str): Column the bands are computed on. Default 'Typical Price'.

    Returns:
        tuple[pandas.Series, pandas.Series]: ``(upper_band, lower_band)``, i.e.
        rolling mean plus / minus ``num_std_dev`` rolling standard deviations.
    """
    # Build the rolling window once and reuse it for both statistics
    rolling_window = df[column].rolling(window=window)
    rolling_mean = rolling_window.mean()
    rolling_std = rolling_window.std()

    # Distance of each band from the rolling mean
    band_offset = rolling_std * num_std_dev
    upper_band = rolling_mean + band_offset
    lower_band = rolling_mean - band_offset

    return upper_band, lower_band

In [31]:
# Typical price: the mean of each row's LOW, HIGH and CLOSE
df_t['Typical Price'] = df_t['LOW'].add(df_t['HIGH']).add(df_t['CLOSE']).div(3.0)

# Bollinger Bands with the helper's default settings (it reads 'Typical Price')
upper_band, lower_band = calculate_bollinger_bands(df=df_t)

# Store the band width (upper minus lower) as the feature
df_t['Bollinger Diff'] = upper_band.sub(lower_band)

In [32]:
# Moving Average Convergence Divergence (MACD)
# Spans of the fast EMA, the slow EMA and the signal-line EMA
short_period, long_period, signal_line_span = 12, 26, 9


def _ema(series, span):
    """Exponential moving average of ``series`` with ``adjust=False``."""
    return series.ewm(span=span, adjust=False).mean()


# Fast and slow EMAs of the closing price
short_ema = _ema(df_t['CLOSE'], short_period)
long_ema = _ema(df_t['CLOSE'], long_period)

# MACD line: fast EMA minus slow EMA
macd_line = short_ema - long_ema

# Signal line: EMA of the MACD line (9 periods is the typical choice)
signal_line = _ema(macd_line, signal_line_span)

# Feature: MACD line minus its signal line
df_t['macd_signal'] = macd_line - signal_line

In [33]:
# Transformation function
# Technical-analysis indicators are rescaled before being fed to the models,
# using a min-max normalisation variant whose output lies in [-1, 1].
# It was chosen for two reasons: it is intuitive, because the machine learning
# models produce an output variable that also ranges from -1 to 1, and it makes
# the input features more comparable.
# X'(t) = (X(t) - min(x)) / (max(x) - min(x)) * 2 - 1

def feature_transform(x):
    """
    Rescale ``x`` linearly so its minimum maps to -1 and its maximum to 1.

    Parameters:
        x (pandas.Series or numpy.ndarray): Numeric values to rescale.

    Returns:
        Same type as ``x``: the rescaled values. If every value in ``x`` is
        identical (zero range), 0.0 is returned for each element instead of
        the NaN/inf a division by zero would produce.
    """
    max_x = np.max(x)
    min_x = np.min(x)
    value_range = max_x - min_x

    # A constant feature has no spread to rescale; return 0.0 for each element
    # (NaN entries stay NaN). The guard only applies to scalar ranges, so any
    # non-scalar reduction keeps the plain formula below.
    if np.ndim(value_range) == 0 and value_range == 0:
        return (x - min_x) * 0.0

    x_transformed = (x - min_x) / value_range * 2 - 1

    return x_transformed

In [34]:
# Discard the first 20 rows - presumably the warm-up period in which the
# rolling indicators (windows of up to 20 rows) are not fully populated.
df_t = df_t.drop(index=df_t.index[:20])

In [35]:
# Rescale every column from position 7 onward, column by column, with
# feature_transform. Columns before that position are left as they are.
# NOTE(review): the min/max come from this frame itself - confirm that matches
# how the training features were scaled.
first_scaled_position = 7
scaled_columns = df_t.iloc[:, first_scaled_position:].apply(feature_transform)
df_t.iloc[:, first_scaled_position:] = scaled_columns


In [36]:
# Remove the columns at positions 1-5 in place (the raw OPEN, HIGH, LOW, CLOSE
# and VOLUME columns in this frame's layout); only DATE and the features remain.
df_t.drop(columns=df_t.columns[1:6], inplace=True)

In [37]:
# 'Typical Price' was only an input to the Bollinger Bands; remove it in place.
df_t.drop(columns=['Typical Price'], inplace=True)

In [38]:
# The raw 'WMA' level is dropped in place; 'WMA_signal' stays as the feature.
df_t.drop(columns=['WMA'], inplace=True)

In [39]:
# Display the finished frame: DATE, the unscaled discrete_return, and the
# rescaled indicator columns.
df_t

Unnamed: 0,DATE,discrete_return,WMA_signal,RSI,WPR,Bollinger Diff,macd_signal
20,2022-12-30,-0.002541,-0.119744,-0.378295,-0.559631,0.016738,-0.256911
21,2023-01-03,-0.004001,-0.185316,-0.584572,-0.651418,-0.102502,-0.192345
22,2023-01-04,0.007539,0.051827,-0.585474,-0.392865,-0.182391,-0.059119
23,2023-01-05,-0.011646,-0.239404,-0.620912,-0.555608,-0.187572,-0.070091
24,2023-01-06,0.022841,0.379125,-0.148486,0.845135,-0.210933,0.151149
...,...,...,...,...,...,...,...
328,2024-03-25,-0.003055,0.209070,0.356758,0.495149,-0.533385,-0.074884
329,2024-03-26,-0.002800,0.058359,0.220912,0.322311,-0.548352,-0.163643
330,2024-03-27,0.008631,0.326966,0.199499,0.853602,-0.560265,-0.127898
331,2024-03-28,0.001117,0.301122,0.390977,0.881245,-0.541937,-0.111416


In [40]:
def get_test_data():
    """Return the module-level ``df_t`` DataFrame built by the cells above."""
    return df_t