Import the required libraries

In [1]:
from jupyter_core.paths import jupyter_path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import os
import glob
import matplotlib.pyplot as plt
from datetime import date
from datetime import datetime
from datetime import timedelta

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import lightgbm as lgb # Our ML library

In [3]:
#Cross validation libraries
from sklearn.model_selection import TimeSeriesSplit

In [4]:
import joblib

Read all CSVs with stocks data and append to one big file

In [5]:
#os.chdir("/Users/olegkazanskyi/Documents/GitHub/Trading/CSVs")
os.chdir("C:/Users/oleg.kazanskyi/Personal-oleg.kazanskyi/Trading Python/SP500_CSVs_01032023")
filepaths = [f for f in os.listdir("./") if f.endswith('.csv')]
df = pd.DataFrame()
for i in filepaths:
    iterate_df = pd.DataFrame()
    iterate_df = pd.read_csv(i, encoding= 'unicode_escape')
    iterate_df["stock"] = i[:-4]
    df = pd.concat([df,iterate_df])
#df = pd.concat(map(pd.read_csv, filepaths))

df = df[df.close.notna()]
os.chdir("C:/Users/oleg.kazanskyi/Personal-oleg.kazanskyi/Trading Python/ML_part/EOD")
df.to_csv("THE_FINAL_DATASET_2023.csv")

record the dataframe to speed up the future reading process

os.chdir("C:/Users/oleg.kazanskyi/Personal-oleg.kazanskyi/Trading Python/")
df.to_csv("THE_FINAL_DATASET_2023.csv")

In [6]:
df['YoY_DY'] = 100*df['DY'].pct_change(periods = -365)

In [7]:
df.shape

(905697, 61)

In [8]:
df.duplicated().sum()

0

Set up Date column as an index columns

In [9]:
df["date"] = pd.to_datetime(df["date"])

Let's check how much empty values we have by column

In [10]:
zeroes = df.isnull().sum()
print(zeroes[zeroes>0])
del zeroes

days_after_earnings_report    240298
LTDE                            4553
DE                              1208
DPR                           102881
Acc_Rec_Pay_Ration              1888
DY                            102881
PEG_Forward                     3140
PEG_Backwards                    119
EPS_surprise                  240298
EPS_YoY_Growth                240298
EPS_QoQ_frcst_diff            240298
EPS_1Y_exp_Change             254091
YoY_ROE                         7919
YoY_LTDE                        7919
YoY_DE                          7919
YoY_CR                          7919
YoY_GM                          7919
YoY_ROA                         7919
YoY_DPR                        99368
YoY_AR_Ration                   9354
YoY_ES                          7919
YoY_Piotroski                   7919
YoY_PE                          7919
YoY_PB                          7919
YoY_PEGF                        7919
YoY_PEGB                        7919
YoY_DY                           365
f

### Dealing with "EPS" column

There are many null values in the "current_ratio" column. Let's see the stocks where it happens and then decide what to do

In [11]:
df[df.EPS_surprise.isnull()].stock.unique()

array(['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACGL', 'ACN',
       'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL',
       'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE',
       'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT', 'AMZN', 'ANET',
       'ANSS', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APTV', 'ARE', 'ATO',
       'ATVI', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO', 'BA', 'BAC',
       'BALL', 'BAX', 'BBWI', 'BBY', 'BDX', 'BEN', 'BF-B', 'BIIB', 'BIO',
       'BK', 'BKNG', 'BKR', 'BLK', 'BMY', 'BR', 'BRK-B', 'BRO', 'BSX',
       'BWA', 'BXP', 'C', 'CAG', 'CAH', 'CARR', 'CAT', 'CB', 'CBOE',
       'CBRE', 'CCI', 'CCL', 'CDAY', 'CDNS', 'CDW', 'CE', 'CF', 'CFG',
       'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA',
       'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COF', 'COO', 'COP',
       'COST', 'CPB', 'CPRT', 'CPT', 'CRL', 'CRM', 'CSCO', 'CSGP', 'CSX',
       'CTAS', 'CTLT', 'CTRA', 'CTSH', 'CTVA', 'CVS', 'CVX', 'CZ

After checking the original files, it's apparently the null values comes from the older data.

Before 2017 our API did not provide us info about earinings. 

We can drop the reocrds with 'nan' values

we will remove only 10k records 

In [12]:
df = df[df.EPS_surprise.notnull()]

Let's check the dataset again

In [13]:
def check_null_cols(df):
    zeroes = df.isnull().sum()
    print(zeroes[zeroes>0])
    
check_null_cols(df)   

LTDE                        4553
DE                          1208
DPR                        61224
Acc_Rec_Pay_Ration          1839
DY                         61224
PEG_Forward                 3140
PEG_Backwards                119
EPS_1Y_exp_Change          13793
YoY_ROE                     7919
YoY_LTDE                    7919
YoY_DE                      7919
YoY_CR                      7919
YoY_GM                      7919
YoY_ROA                     7919
YoY_DPR                    61096
YoY_AR_Ration               9305
YoY_ES                      7919
YoY_Piotroski               7919
YoY_PE                      7919
YoY_PB                      7919
YoY_PEGF                    7919
YoY_PEGB                    7919
YoY_DY                       365
future_15dprice_change        47
future_30dprice_change       227
future_60dprice_change      1239
future_90dprice_change      9610
future_120dprice_change    19580
future_150dprice_change    29568
VIX_DoD                      495
VIX_WoW   

### Dealing with the null values in the "dividends" column

About 30% of data is affected by null values in the dividends columns.
Let's check how many companies are in the list

In [14]:
print("Number of companies with zero values in dividends",len(df[df['DY'].isnull()].stock.unique()))

Number of companies with zero values in dividends 49


This companies do not pay dividends, we can replace payments to "0"

In [15]:
df['DY'].fillna(0, inplace=True)
df['YoY_DPR'].fillna(0, inplace=True)
df['DPR'].fillna(0, inplace=True)
df['YoY_DY'].fillna(0, inplace=True)                   

### Dealing with YoY

The YoY variables with null values are caused by YoY calculation of the rows without historical data.
Let's drop these

In [16]:
df = df[df.YoY_CR.notnull()]

Let's check the dataset again

In [17]:
check_null_cols(df)   

LTDE                        4553
DE                          1208
Acc_Rec_Pay_Ration          1839
PEG_Forward                 2175
PEG_Backwards                119
EPS_1Y_exp_Change          12828
YoY_AR_Ration               1386
future_15dprice_change        47
future_30dprice_change       227
future_60dprice_change      1239
future_90dprice_change      9610
future_120dprice_change    19580
future_150dprice_change    29568
VIX_DoD                      457
VIX_WoW                     2285
VIX_MoM                    10054
Fed_Balance_MoM             1428
Fed_Balance_YoY             1428
dtype: int64


In [18]:
df.shape

(657480, 61)

### Dealing with debt ratio

Let's check the companies that contain null values

In [19]:
df[df['DE'].isnull()].stock.unique()

array(['FFIV', 'FTNT'], dtype=object)

It seems these companies had zero debt for some quarters.

Let's replace with "0"

In [20]:
df['DE'].fillna(0, inplace=True)
df['LTDE'].fillna(0, inplace=True)

In [21]:
check_null_cols(df)   

Acc_Rec_Pay_Ration          1839
PEG_Forward                 2175
PEG_Backwards                119
EPS_1Y_exp_Change          12828
YoY_AR_Ration               1386
future_15dprice_change        47
future_30dprice_change       227
future_60dprice_change      1239
future_90dprice_change      9610
future_120dprice_change    19580
future_150dprice_change    29568
VIX_DoD                      457
VIX_WoW                     2285
VIX_MoM                    10054
Fed_Balance_MoM             1428
Fed_Balance_YoY             1428
dtype: int64


### Dealing with VIX

Let's remove null values for VIX as these are related to the historical calculations

In [22]:
df = df[df.VIX_MoM.notnull()]

In [23]:
check_null_cols(df) 

Acc_Rec_Pay_Ration          1795
PEG_Forward                 1953
PEG_Backwards                 61
EPS_1Y_exp_Change          12606
YoY_AR_Ration               1364
future_15dprice_change        47
future_30dprice_change       227
future_60dprice_change      1239
future_90dprice_change      9610
future_120dprice_change    19580
future_150dprice_change    29568
Fed_Balance_MoM             1406
Fed_Balance_YoY             1406
dtype: int64


### Dealing with Accounts Payable

In [24]:
df[df['Acc_Rec_Pay_Ration'].isnull()].stock.unique()

array(['CINF', 'MAA'], dtype=object)

It seems these companies had zero Accounts receivables for some quarters

In [25]:
df['Acc_Rec_Pay_Ration'].fillna(0, inplace=True)

### Dealing with PEG

In [26]:
df[df['PEG_Forward'].isnull()].stock.unique()

array(['LHX', 'PLD', 'SEDG'], dtype=object)

We can drop the rows as the effect would be unsignificant

In [27]:
df = df[df.PEG_Forward.notnull()]

In [28]:
df[df['PEG_Backwards'].isnull()].stock.unique()

array(['PKI', 'SYF'], dtype=object)

In [29]:
df = df[df.PEG_Backwards.notnull()]

In [30]:
df = df[df.EPS_1Y_exp_Change.notnull()]

In [31]:
check_null_cols(df) 

YoY_AR_Ration               1346
future_15dprice_change        11
future_30dprice_change        22
future_60dprice_change        44
future_90dprice_change        66
future_120dprice_change     9584
future_150dprice_change    19572
Fed_Balance_MoM             1406
Fed_Balance_YoY             1406
dtype: int64


### Dealing with Industry

In [32]:
df[df['sector'].isnull()].stock.unique()

array([], dtype=object)

Let's fill the sector and industry values for the GEN company

In [33]:
df.loc[df['stock'] == 'GEN','sector'] = 'Information Technology'
df.loc[df['stock'] == 'GEN','industry'] = 'Software & Services'

In [34]:
check_null_cols(df) 

YoY_AR_Ration               1346
future_15dprice_change        11
future_30dprice_change        22
future_60dprice_change        44
future_90dprice_change        66
future_120dprice_change     9584
future_150dprice_change    19572
Fed_Balance_MoM             1406
Fed_Balance_YoY             1406
dtype: int64


In [35]:
df = df[df.YoY_AR_Ration.notnull()]

### Working with data types

Now we can keep only the necessery columns

In [36]:
categoric_columns = df.select_dtypes(include='object').columns
for col in categoric_columns:
    if col == "stock":
        continue
    print(f"column {col}, data: \n {df[col].unique()}")

column days_after_earnings_report, data: 
 [45.0 44.0 43.0 42.0 41.0 38.0 37.0 36.0 35.0 34.0 31.0 30.0 29.0 28.0
 27.0 24.0 23.0 22.0 21.0 17.0 16.0 15.0 14.0 13.0 10.0 9.0 8.0 7.0 6.0
 3.0 2.0 1.0 0.0 83.0 80.0 79.0 78.0 77.0 76.0 73.0 72.0 71.0 70.0 69.0
 66.0 65.0 64.0 63.0 62.0 59.0 58.0 57.0 56.0 55.0 52.0 51.0 50.0 49.0
 48.0 20.0 90.0 87.0 86.0 85.0 84.0 88.0 81.0 74.0 67.0 60.0 53.0 46.0
 39.0 25.0 18.0 11.0 4.0 94.0 93.0 92.0 91.0 97.0 82.0 75.0 68.0 61.0 54.0
 47.0 40.0 33.0 32.0 26.0 19.0 12.0 5.0 100.0 99.0 98.0 96.0 89.0 102.0
 101.0 95.0 105.0 104.0 103.0 107.0 106.0 109.0 108.0 118.0 117.0 113.0
 112.0 111.0 110.0 125.0 124.0 123.0 120.0 119.0 116.0 115.0 114.0 121.0
 122.0 126.0 127.0 128.0 129.0 132.0 133.0 134.0 135.0 136.0 139.0 140.0
 141.0 142.0 143.0 146.0 147.0 148.0 149.0 150.0 153.0 154.0 155.0 156.0
 157.0 160.0 161.0 162.0 163.0 164.0 167.0 130.0 131.0 137.0 138.0 144.0]
column open, data: 
 [123.02 122.81 122.77 ... 91.635 84.3335 87.445]
column close, data

In [37]:
#Number of stocks per industry
df[['sector','industry','stock']].groupby('industry').stock.nunique()

industry
Automobiles & Components                           5
Banks                                             18
Capital Goods                                     48
Commercial & Professional Services                 9
Consumer Durables & Apparel                       13
Consumer Services                                 15
Diversified Financials                            26
Energy                                            23
Food & Staples Retailing                           4
Food, Beverage & Tobacco                          22
Health Care Equipment & Services                  36
Household & Personal Products                      6
Insurance                                         23
Materials                                         29
Media & Entertainment                             20
Pharmaceuticals, Biotechnology & Life Sciences    27
Real Estate                                       29
Retailing                                         22
Semiconductors & Semiconductor Equipm

In [38]:
#Number of stocks per sector
df[['sector','industry','stock']].groupby('sector').stock.nunique()

sector
Communication Services    24
Consumer Discretionary    55
Consumer Staples          32
Energy                    23
Financials                67
Health Care               63
Industrials               71
Information Technology    72
Materials                 29
Real Estate               29
Utilities                 29
Name: stock, dtype: int64

We can drop "Stocks" and "Industry" columns as there are too many unique values that block us from generalizing the data. 

In [39]:
#df.drop(["industry", "stock"], axis = 1, inplace = True)
#df.drop(["industry"], axis = 1, inplace = True)

Let's also check if there are any infinite numbers that can cause same trublesas nan

In [40]:
df.replace([np.inf, -np.inf], np.nan, inplace = True)

check_null_cols(df) 

EPS_YoY_Growth                36
EPS_QoQ_frcst_diff           101
EPS_1Y_exp_Change            206
future_15dprice_change        11
future_30dprice_change        22
future_60dprice_change        44
future_90dprice_change        66
future_120dprice_change     9563
future_150dprice_change    19529
Fed_Balance_MoM             1406
Fed_Balance_YoY             1406
dtype: int64


In [41]:
df = df[df.EPS_1Y_exp_Change.notnull() & df.EPS_YoY_Growth.notnull() & df.EPS_QoQ_frcst_diff.notnull()]

In [42]:
check_null_cols(df)

future_15dprice_change        11
future_30dprice_change        22
future_60dprice_change        44
future_90dprice_change        66
future_120dprice_change     9563
future_150dprice_change    19529
Fed_Balance_MoM             1406
Fed_Balance_YoY             1406
dtype: int64


In [49]:
df = df[df['Fed_Balance_YoY'].notnull()]

### Data Cleaning is Over!

## Trimming the data to avoid overfitting

 Our dataset have the range of dates that are very close to each other. 

The changes in trading value are rarely very different between today and yesterday. 

There are exemptions but mostly these are connected with big surprises, and in any case we would notice the change even if we track every other day or every 4th day.

It means that removing part of the dataset should help us in building more generalized model since we will be looking for trend but not for matching values.


Let's do this, let's remove 4/5 of the dataset.

That means we will keep only one day of data per week. It will help us to generalize the dataset

In [50]:
print("Old dataframe shape:", df.shape)
df_compact = df.iloc[::5, :]
print("New dataframe shape:", df_compact.shape)

Old dataframe shape: (631727, 61)
New dataframe shape: (126346, 61)


The final dataset is 131K rows long with 53 variables and 6 targets(price after 15 days, 30 days, 60 days, 90 days, 120 days and 150 days)

We can remove the bigger dataset, but let's save it to the file before doing so.

In [51]:
os.chdir("C:/Users/oleg.kazanskyi/Personal-oleg.kazanskyi/Trading Python/ML_Part/EOD")
df.to_csv("full_cleaned_dataframe_2023.csv", index = False, header = True)
del df

Lets' save the shorter version of the dataframe as well so we can get it faster when required

In [52]:
df_compact.to_csv("shorter_cleaned_dataframe_2023.csv", index = False, header = True)