# Get stock market data & clean data

In [1]:
%matplotlib inline
import yfinance as yf
import quantstats as qs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# extend pandas functionality with metrics, etc.
qs.extend_pandas()

##

# Data collection

In [2]:
tickerSymbol = ["AMZN"]

for stock in tickerSymbol:
    
    stock_info = yf.Ticker(stock)
    
    df_stock = stock_info.history(period="1D", 
                                  start="2016-01-01")
    
    df_close = df_stock[["Close"]]
        
    
# Rename col header to ticker symbol
df_close = df_close.set_axis(tickerSymbol, axis=1)

display(df_close)

# Save Close prices to csv
df_close.to_csv("ClosePrices.csv")

Unnamed: 0_level_0,AMZN
Date,Unnamed: 1_level_1
2015-12-31,675.890015
2016-01-04,636.989990
2016-01-05,633.789978
2016-01-06,632.650024
2016-01-07,607.940002
...,...
2021-08-23,3265.870117
2021-08-24,3305.780029
2021-08-25,3299.179932
2021-08-26,3316.000000


##

# Data Cleaning #1

In [4]:
# Drop duplicates
df_close.drop_duplicates(inplace=False)

# Drop NA and infinite values
df_close = df_close.replace(-np.inf, np.nan).dropna()
df_close.head()

Unnamed: 0_level_0,AMZN
Date,Unnamed: 1_level_1
2015-12-31,675.890015
2016-01-04,636.98999
2016-01-05,633.789978
2016-01-06,632.650024
2016-01-07,607.940002


In [5]:
# Count nulls/checking final df
df_close.isnull().sum()

AMZN    0
dtype: int64

In [6]:
# Define time period to predict
pred_period = 1

# Compute the pct_change for 1 min 
df_returns = df_close.pct_change(pred_period)

df_returns.head()

Unnamed: 0_level_0,AMZN
Date,Unnamed: 1_level_1
2015-12-31,
2016-01-04,-0.057554
2016-01-05,-0.005024
2016-01-06,-0.001799
2016-01-07,-0.039058


In [7]:
# Shift the returns to convert them to forward returns i.e,. today's return = return from 2-days ahead
df_returns_forward = df_returns.shift(-1)

# Preview the DataFrame
df_returns_forward.head()

Unnamed: 0_level_0,AMZN
Date,Unnamed: 1_level_1
2015-12-31,-0.057554
2016-01-04,-0.005024
2016-01-05,-0.001799
2016-01-06,-0.039058
2016-01-07,-0.001464


In [8]:
# Place stock data on top of each other along the length of the df
df_returns = pd.DataFrame(df_returns.unstack())

# Rename the column to returns
name = f'{pred_period}_Day_returns'

df_returns.rename(columns={0: name},
                  inplace = True)

# Reset the index 
df_returns.reset_index(inplace=True)

In [9]:
df_returns.head()

Unnamed: 0,level_0,Date,1_Day_returns
0,AMZN,2015-12-31,
1,AMZN,2016-01-04,-0.057554
2,AMZN,2016-01-05,-0.005024
3,AMZN,2016-01-06,-0.001799
4,AMZN,2016-01-07,-0.039058


##

# Shift data to create future close prices

In [10]:
# Create 5 and 10-day periods to predict
periods_to_pred = [5, 10]

for i in periods_to_pred:   
    
    # Calc percentage change for each time period
    returns_temp = df_close.pct_change(i)
    
    # Stack returns  
    returns_temp = pd.DataFrame(returns_temp.unstack())
    
    # Rename column 
    name = f'{i}_Day_returns'
    returns_temp.rename(columns={0:name},
                        inplace=True)
    
    # Reset index 
    returns_temp.reset_index(inplace=True)
    
    # Merge df 
    df_returns = pd.merge(df_returns,
                          returns_temp,
                          left_on=['level_0', 'Date'],
                          right_on=['level_0', 'Date'],
                          how='left', 
                          suffixes=('_original', 'right'))

In [11]:
# Check data
df_returns.tail(20)

Unnamed: 0,level_0,Date,1_Day_returns,5_Day_returns,10_Day_returns
1405,AMZN,2021-08-02,0.001169,-0.099556,-0.061447
1406,AMZN,2021-08-03,0.010434,-0.071738,-0.057917
1407,AMZN,2021-08-04,-0.003422,-0.075916,-0.064287
1408,AMZN,2021-08-05,0.00634,-0.062204,-0.072028
1409,AMZN,2021-08-06,-0.009197,0.005214,-0.085242
1410,AMZN,2021-08-09,-0.000918,0.003119,-0.096748
1411,AMZN,2021-08-10,-0.006341,-0.013534,-0.084301
1412,AMZN,2021-08-11,-0.008604,-0.018663,-0.093163
1413,AMZN,2021-08-12,0.00346,-0.021472,-0.082341
1414,AMZN,2021-08-13,-0.002885,-0.015238,-0.010103


In [12]:
# Prepare df for ML classification models
ml_df_returns = df_returns.copy()

ml_df_returns["1_Day_binary"] = np.where(ml_df_returns["1_Day_returns"] >= 0, 1, 0)
ml_df_returns["5_Day_binary"] = np.where(ml_df_returns["5_Day_returns"] >= 0, 1, 0)
ml_df_returns["10_Day_binary"] = np.where(ml_df_returns["10_Day_returns"] >= 0, 1, 0)

ml_df_returns.head(10)

Unnamed: 0,level_0,Date,1_Day_returns,5_Day_returns,10_Day_returns,1_Day_binary,5_Day_binary,10_Day_binary
0,AMZN,2015-12-31,,,,0,0,0
1,AMZN,2016-01-04,-0.057554,,,0,0,0
2,AMZN,2016-01-05,-0.005024,,,0,0,0
3,AMZN,2016-01-06,-0.001799,,,0,0,0
4,AMZN,2016-01-07,-0.039058,,,0,0,0
5,AMZN,2016-01-08,-0.001464,-0.101851,,0,0,0
6,AMZN,2016-01-11,0.01761,-0.03022,,1,0,0
7,AMZN,2016-01-12,0.000243,-0.025087,,1,0,0
8,AMZN,2016-01-13,-0.058392,-0.08036,,0,0,0
9,AMZN,2016-01-14,0.019233,-0.024575,,1,0,0


##

# Data Cleaning #2

In [13]:
# Drop na
df_returns.dropna(axis=0, 
                  how="any",
                  inplace=True)

# Create a multi index based on stock name and time
df_returns.set_index(['level_0', 'Date'],
                  inplace=True)

df_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1_Day_returns,5_Day_returns,10_Day_returns
level_0,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AMZN,2016-01-15,-0.038482,-0.060736,-0.156401
AMZN,2016-01-19,0.007541,-0.070029,-0.098133
AMZN,2016-01-20,-0.004717,-0.074641,-0.097856
AMZN,2016-01-21,0.005684,-0.01167,-0.091093
AMZN,2016-01-22,0.037147,0.0057,-0.019015


In [14]:
#Drop duplicates
df_returns.drop_duplicates(inplace=False)

#Drop Null and inf
df_returns=df_returns.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [15]:
# Drop na
ml_df_returns.dropna(axis=0, 
                     how="any",
                     inplace=True)

# Create a multi index based on stock name and time
ml_df_returns.set_index(['level_0', 'Date'],
                        inplace=True)

ml_df_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1_Day_returns,5_Day_returns,10_Day_returns,1_Day_binary,5_Day_binary,10_Day_binary
level_0,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AMZN,2016-01-15,-0.038482,-0.060736,-0.156401,0,0,0
AMZN,2016-01-19,0.007541,-0.070029,-0.098133,1,0,0
AMZN,2016-01-20,-0.004717,-0.074641,-0.097856,0,0,0
AMZN,2016-01-21,0.005684,-0.01167,-0.091093,1,0,0
AMZN,2016-01-22,0.037147,0.0057,-0.019015,1,1,0


In [16]:
#Drop duplicates
ml_df_returns.drop_duplicates(inplace=False)

#Drop Null and inf
ml_df_returns=ml_df_returns.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

##

# Final check

In [17]:
# Count nulls/Checking final df
df_returns.isnull().sum()

1_Day_returns     0
5_Day_returns     0
10_Day_returns    0
dtype: int64

In [18]:
df_returns.describe()

Unnamed: 0,1_Day_returns,5_Day_returns,10_Day_returns
count,1415.0,1415.0,1415.0
mean,0.001398,0.006828,0.013332
std,0.018649,0.040037,0.055353
min,-0.079221,-0.155961,-0.20702
25%,-0.007111,-0.014052,-0.015335
50%,0.001484,0.007075,0.014002
75%,0.010424,0.029086,0.044572
max,0.132164,0.178752,0.262353


In [19]:
ml_df_returns.isnull().sum()

1_Day_returns     0
5_Day_returns     0
10_Day_returns    0
1_Day_binary      0
5_Day_binary      0
10_Day_binary     0
dtype: int64

In [20]:
ml_df_returns.describe()

Unnamed: 0,1_Day_returns,5_Day_returns,10_Day_returns,1_Day_binary,5_Day_binary,10_Day_binary
count,1415.0,1415.0,1415.0,1415.0,1415.0,1415.0
mean,0.001398,0.006828,0.013332,0.555477,0.585866,0.628269
std,0.018649,0.040037,0.055353,0.497088,0.492746,0.483438
min,-0.079221,-0.155961,-0.20702,0.0,0.0,0.0
25%,-0.007111,-0.014052,-0.015335,0.0,0.0,0.0
50%,0.001484,0.007075,0.014002,1.0,1.0,1.0
75%,0.010424,0.029086,0.044572,1.0,1.0,1.0
max,0.132164,0.178752,0.262353,1.0,1.0,1.0


In [21]:
# Save the dataframe as a csv file
df_returns.to_csv("Returns_ForTimeSeries.csv")
ml_df_returns.to_csv("Returns_ForML_Classification.csv")