In [1]:
# **************************
# Author: Sebastien Vezina
# Date: 05-Nov-2020
# **************************

import os
import requests
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from functools import partial

In [2]:
csvpath = Path('../../../02-Data/Dirty Data/Market Data/CAD_5yr_drt.csv')

In [3]:
cad_data = pd.read_csv(csvpath, infer_datetime_format=True)

In [4]:
cad_data.head()

Unnamed: 0,date,1. open,2. high,3. low,4. close
0,2014-11-24,0.8893,0.8907,0.8835,0.8859
1,2014-11-25,0.8856,0.8901,0.8832,0.888
2,2014-11-26,0.8881,0.8904,0.8847,0.8887
3,2014-11-27,0.8883,0.8899,0.8802,0.8816
4,2014-11-28,0.8812,0.882,0.8735,0.8755


In [5]:
# Keep only Date and Close columns
cad_data.drop(['1. open', '2. high', '3. low'], axis=1, inplace=True)

In [6]:
cad_data.head()

Unnamed: 0,date,4. close
0,2014-11-24,0.8859
1,2014-11-25,0.888
2,2014-11-26,0.8887
3,2014-11-27,0.8816
4,2014-11-28,0.8755


In [7]:
# Rename columns
cad_data.columns = ['Date', 'Close']

cad_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557 entries, 0 to 1556
Data columns (total 2 columns):
Date     1557 non-null object
Close    1557 non-null float64
dtypes: float64(1), object(1)
memory usage: 24.4+ KB


In [8]:
# Select only from 2015-10-01 to 2020-11-01
start_date = pd.Timestamp("10/31/2015")
end_date = pd.Timestamp("11/01/2020")

#Set Date column to 'Timestamp' format
cad_data['Date'] = pd.to_datetime(cad_data['Date'])

#Slice Dataframe
cad_data = cad_data[(cad_data['Date'] > (start_date)) & (cad_data['Date'] < (end_date))]

# Set index back
cad_data.set_index("Date", inplace = True)
# Sort Index
cad_data = cad_data.sort_index(ascending=True)

cad_data

In [11]:
cad_data.to_csv('../../../02-Data/Clean Data/Market Data/CAD_5yr_cln.csv')

In [12]:
#-------------------------------------------------------------------------------------------------------

In [13]:
# Clean TSX60 Dataframe
# TSX from 2015-10-01 to 2020-11-01
SP_TSX500_file = Path("../../../02-Data/Dirty Data/Market Data/SPTSX60 Historical.csv")
SP_TSX500_data = pd.read_csv(SP_TSX500_file, index_col="Date")
SP_TSX500_data

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Oct 30, 2020",928.90,930.88,932.10,919.76,112.89M,-0.63%
"Oct 29, 2020",934.83,934.78,939.48,926.36,101.86M,0.42%
"Oct 28, 2020",930.90,944.45,944.45,930.23,147.62M,-2.50%
"Oct 27, 2020",954.81,959.93,960.71,953.49,107.18M,-0.53%
"Oct 26, 2020",959.91,966.43,967.16,954.08,115.59M,-1.30%
"Oct 23, 2020",972.54,972.23,973.84,967.95,100.57M,0.21%
"Oct 22, 2020",970.52,966.48,971.68,962.61,107.29M,0.40%
"Oct 21, 2020",966.63,968.80,972.48,965.42,98.33M,-0.50%
"Oct 20, 2020",971.53,977.19,977.96,969.81,119.45M,-0.07%
"Oct 19, 2020",972.22,985.23,985.23,971.43,119.06M,-1.06%


In [14]:
# Remove useles column
SP_TSX500_data.reset_index(inplace=True)

# Convert Date column to datetime type
SP_TSX500_data['Date'] = pd.to_datetime(SP_TSX500_data['Date'])

# Change column order and keep only relevant ones
SP_TSX500_data_sliced = SP_TSX500_data[['Date', 'Price']].set_index('Date')

SP_TSX500_data_sliced.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1255 entries, 2020-10-30 to 2015-11-02
Data columns (total 1 columns):
Price    1255 non-null object
dtypes: object(1)
memory usage: 19.6+ KB


In [15]:
# Save to CSV file
SP_TSX500_data_sliced.to_csv('../../../02-Data/Clean Data/Market Data/SPTSX60_5yr_cln.csv')