# Install this (important)

In [1]:
# Install the latest version of author's repo neural ode implementation
!git clone https://github.com/rtqichen/torchdiffeq.git
!cd torchdiffeq && pip install -e .
!ls torchdiffeq/torchdiffeq

Cloning into 'torchdiffeq'...
remote: Enumerating objects: 1132, done.[K
remote: Counting objects: 100% (428/428), done.[K
remote: Compressing objects: 100% (188/188), done.[K
remote: Total 1132 (delta 253), reused 401 (delta 240), pack-reused 704[K
Receiving objects: 100% (1132/1132), 8.29 MiB | 30.32 MiB/s, done.
Resolving deltas: 100% (679/679), done.
Obtaining file:///content/torchdiffeq
Installing collected packages: torchdiffeq
  Running setup.py develop for torchdiffeq
Successfully installed torchdiffeq-0.2.2
_impl  __init__.py


In [3]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.1.63.tar.gz (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 7.6 MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.63-py2.py3-none-any.whl size=23918 sha256=40607f535fcf5d584598b3c9ca1fcdf402c9edfc7282a57d69dcd09c3c2f6910
  Stored in directory: /root/.cache/pip/wheels/fe/87/8b/7ec24486e001d3926537f5f7801f57a74d181be25b11157983
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully installed lxml-4.6.3 yfinance-0.1.63


# K Means

## Rough of K Means

1. Get Open and Close Price of asset (o, c) for each trading day.
2. Transform it into sequences.
    - $d_{i} : {o_{1}c_{1} ... o_{5}c_{5}}$
    - where $d_{i}$ is a sequence of o and c for the week `i`.
3. Transform $d_{i}$ to sequences of lag * len($d_{i}$) length.
4. Normalize these sequences to a range (0, 1).
5. Clustering algorithm. 

In [13]:
# 1. Get Open and Close Price of asset (o, c) for each trading day.
# libraries
from pandas_datareader import data as pdr
import yfinance as yf
import os

print(f"Get Open and Close Price of Assets")
def download_raw_stock_data(filepath, tickers, start, end, period = '1d'):
    """
    Download Stock tickers
    :Parameters:
        filepath: str
            path to store the raw data
        tickers : str, list
            List of tickers to download
        period: str
            the frequency at which to gather the data; common options would include ‘1d’ (daily), ‘1mo’ (monthly), ‘1y’ (yearly)
        start: str
            the date to start gathering the data. For example ‘2010–1–1’
        end: str
            the date to end gathering the data. For example ‘2020–1–25’
    
    """
    #define the ticker symbol
    tickerSymbol = tickers

    #get data on this ticker
    tickerData = yf.Ticker(tickerSymbol)

    #get the historical prices for this ticker
    tickerDf = tickerData.history(period=period, start=start, end=end)
    tickerDf.to_csv(filepath)

dict_tickers = {
    'Apple': 'AAPL',
    'Microsoft': 'MSFT',
    'Google': 'GOOG',
    'Bitcoin': 'BTC-USD',
    'Facebook': 'FB',
    'Walmart': 'WMT',
    'Amazon': 'AMZN',
    'CVS': 'CVS',
    'Berkshire': 'BRK-B',
    'ExxonMobil': 'XOM',
    'AtandT': 'T',
    'Costco': 'COST',
    'Walgreens': 'WBA',
    'Kroger': 'KR',
    'JPMorgan': 'JPM',
    'Verizon': 'VZ',
    'FordMotor': 'F',
    'GeneralMotors': 'GM',
    'Dell': 'DELL',
    'BankOfAmerica': 'BAC',
    'Target': 'TGT',
    'GeneralElectric': 'GE',
    'JohnsonandJohnson': 'JNJ',
    'Nvidia': 'NVDA',
    'Intel': 'INTC',
}

path = f"raw-stock-data/data-1970-2021"
if not os.path.exists(path):
    # https://appdividend.com/2021/07/03/how-to-create-directory-if-not-exist-in-python/
    # Create a new directory
    os.makedirs(path)
    print(f"{path} directory is created")
period = '1d'
start='1970-1-1'
end='2021-8-31'
for tickerName, ticker in dict_tickers.items():
    tickerName = tickerName
    ticker = ticker
    filepath = f"{path}/{tickerName}.csv"
    download_raw_stock_data(filepath, ticker, start, end, period)


print('\n')

print(f"The size of each asset")
import pandas as pd
for tickerName in dict_tickers.keys():
    df = pd.read_csv(f"{path}/{tickerName}.csv")
    print(f"{tickerName} size: {len(df)}")

Get Open and Close Price of Assets


The size of each asset
Apple size: 10266
Microsoft size: 8940
Google size: 4288
Bitcoin size: 2537
Facebook size: 2336
Walmart size: 12340
Amazon size: 6114
CVS size: 12237
Berkshire size: 6371
ExxonMobil size: 13030
AtandT size: 9522
Costco size: 8859
Walgreens size: 10454
Kroger size: 13030
JPMorgan size: 10454
Verizon size: 9522
FordMotor size: 12420
GeneralMotors size: 2713
Dell size: 1268
BankOfAmerica size: 12199
Target size: 12238
GeneralElectric size: 13031
JohnsonandJohnson size: 13032
Nvidia size: 5690
Intel size: 10453


In [23]:
len(dict_tickers.keys())

25

In [None]:
# 2. Get weekly data.
# 3. Transform $d_{i}$ to sequences of lag * len($d_{i}$) length.

def stockDataTransformer(filepath):
    df = pd.read_csv(filepath)
    df.set_index('Date', inplace=True)
    df1 = df[['Open', 'Close']].copy()
    data = df1.values
    n_samples = data.shape[0]//10*10
    reshape_number = n_samples*data.shape[1]//10
    data1 = data[:n_samples].reshape((reshape_number, 10))
    return data1

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


In [18]:
from pandas import concat
week_sequence = {}
lag = 5
for tickerName in dict_tickers.keys():
    filepath = f"{path}/{tickerName}.csv"
    # Get the data in the required format
    data = stockDataTransformer(filepath)
    print(f"{tickerName} data.shape {data.shape}")
    data_orig = series_to_supervised(data, lag).values
    print(f'{tickerName} Data Original after series to supervised on data')
    print(data_orig.shape)
    week_sequence[tickerName] = data_orig

Apple data.shape (2052, 10)
Apple Data Original after series to supervised on data
(2047, 60)
Microsoft data.shape (1788, 10)
Microsoft Data Original after series to supervised on data
(1783, 60)
Google data.shape (856, 10)
Google Data Original after series to supervised on data
(851, 60)
Bitcoin data.shape (506, 10)
Bitcoin Data Original after series to supervised on data
(501, 60)
Facebook data.shape (466, 10)
Facebook Data Original after series to supervised on data
(461, 60)
Walmart data.shape (2468, 10)
Walmart Data Original after series to supervised on data
(2463, 60)
Amazon data.shape (1222, 10)
Amazon Data Original after series to supervised on data
(1217, 60)
CVS data.shape (2446, 10)
CVS Data Original after series to supervised on data
(2441, 60)
Berkshire data.shape (1274, 10)
Berkshire Data Original after series to supervised on data
(1269, 60)
ExxonMobil data.shape (2606, 10)
ExxonMobil Data Original after series to supervised on data
(2601, 60)
AtandT data.shape (1904, 1

In [20]:
data = week_sequence['Apple']
# 4. Bundle all sequences together
for tickerName in week_sequence.keys():
    if tickerName != 'Apple':
        data1 = week_sequence[tickerName]
        data = np.concatenate((data, data1))
        print(f"data.shape {data.shape}")
 

data.shape (3830, 60)
data.shape (4681, 60)
data.shape (5182, 60)
data.shape (5643, 60)
data.shape (8106, 60)
data.shape (9323, 60)
data.shape (11764, 60)
data.shape (13033, 60)
data.shape (15634, 60)
data.shape (17533, 60)
data.shape (19298, 60)
data.shape (21383, 60)
data.shape (23984, 60)
data.shape (26069, 60)
data.shape (27968, 60)
data.shape (30447, 60)
data.shape (30984, 60)
data.shape (31231, 60)
data.shape (33664, 60)
data.shape (36105, 60)
data.shape (38706, 60)
data.shape (41307, 60)
data.shape (42434, 60)
data.shape (44519, 60)


In [24]:
data_df = pd.DataFrame(data)
data_df.to_csv(f"all_assets_sequences.csv")

In [22]:
data.shape

(44519, 60)

In [19]:
# import numpy as np
# a1 = np.array([[1, 2, 3], [4, 5, 6]])
# a2 = np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]])
# a3 = np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]])
# np.concatenate((a1, a2, a3))

In [25]:
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
0,0.1006,0.1006,0.095789,0.095352,0.08879,0.088353,0.09054,0.09054,0.093165,0.093165,0.098851,0.098851,0.103662,0.103662,0.108036,0.108036,0.113722,0.113722,0.124219,0.124219,0.125969,0.125969,0.123345,0.122907,0.119846,0.119408,0.12072,0.12072,0.118534,0.118096,0.113284,0.112847,0.108473,0.108036,0.106287,0.105849,0.111535,0.111535,0.111535,0.11066,0.107161,0.106724,0.107161,0.107161,0.109348,0.109348,0.108911,0.108473,0.115034,0.115034,0.111972,0.111535,0.113722,0.113722,0.115034,0.115034,0.115034,0.114597,0.113284,0.112847
1,0.098851,0.098851,0.103662,0.103662,0.108036,0.108036,0.113722,0.113722,0.124219,0.124219,0.125969,0.125969,0.123345,0.122907,0.119846,0.119408,0.12072,0.12072,0.118534,0.118096,0.113284,0.112847,0.108473,0.108036,0.106287,0.105849,0.111535,0.111535,0.111535,0.11066,0.107161,0.106724,0.107161,0.107161,0.109348,0.109348,0.108911,0.108473,0.115034,0.115034,0.111972,0.111535,0.113722,0.113722,0.115034,0.115034,0.115034,0.114597,0.113284,0.112847,0.112847,0.111972,0.108911,0.108473,0.104974,0.104537,0.099725,0.098851,0.093602,0.093165
2,0.125969,0.125969,0.123345,0.122907,0.119846,0.119408,0.12072,0.12072,0.118534,0.118096,0.113284,0.112847,0.108473,0.108036,0.106287,0.105849,0.111535,0.111535,0.111535,0.11066,0.107161,0.106724,0.107161,0.107161,0.109348,0.109348,0.108911,0.108473,0.115034,0.115034,0.111972,0.111535,0.113722,0.113722,0.115034,0.115034,0.115034,0.114597,0.113284,0.112847,0.112847,0.111972,0.108911,0.108473,0.104974,0.104537,0.099725,0.098851,0.093602,0.093165,0.096664,0.096664,0.100163,0.100163,0.100163,0.100163,0.1006,0.1006,0.096226,0.095352
3,0.113284,0.112847,0.108473,0.108036,0.106287,0.105849,0.111535,0.111535,0.111535,0.11066,0.107161,0.106724,0.107161,0.107161,0.109348,0.109348,0.108911,0.108473,0.115034,0.115034,0.111972,0.111535,0.113722,0.113722,0.115034,0.115034,0.115034,0.114597,0.113284,0.112847,0.112847,0.111972,0.108911,0.108473,0.104974,0.104537,0.099725,0.098851,0.093602,0.093165,0.096664,0.096664,0.100163,0.100163,0.100163,0.100163,0.1006,0.1006,0.096226,0.095352,0.095352,0.095352,0.092728,0.09229,0.091853,0.091415,0.090103,0.089228,0.091415,0.091415
4,0.107161,0.106724,0.107161,0.107161,0.109348,0.109348,0.108911,0.108473,0.115034,0.115034,0.111972,0.111535,0.113722,0.113722,0.115034,0.115034,0.115034,0.114597,0.113284,0.112847,0.112847,0.111972,0.108911,0.108473,0.104974,0.104537,0.099725,0.098851,0.093602,0.093165,0.096664,0.096664,0.100163,0.100163,0.100163,0.100163,0.1006,0.1006,0.096226,0.095352,0.095352,0.095352,0.092728,0.09229,0.091853,0.091415,0.090103,0.089228,0.091415,0.091415,0.095352,0.095352,0.090103,0.089665,0.085292,0.084854,0.086166,0.086166,0.083979,0.083105


In [28]:
# 4. Normalize these sequences to a range (0, 1).
from sklearn import preprocessing 
# https://www.journaldev.com/45109/normalize-data-in-python
# Normalizes the sample
data_normalized = preprocessing.normalize(data_df)
data_normalized_df = pd.DataFrame(data_normalized)
data_normalized_df.to_csv(f"all_assets_sequences_lag{lag+1}.csv")

In [None]:
# 5. Clustering algorithm. 