# Preparing

## Install library

In [1]:
!nvidia-smi

Wed Mar 16 01:06:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 471.41       Driver Version: 471.41       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   61C    P0    32W /  N/A |    329MiB /  6144MiB |      7%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# !pip install --quiet pytorch-lightning==1.2.5
# !pip install --quiet tqdm==4.59.0

## Import libraries

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import rc, rcParams
import math

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import autograd, nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# HAPPY_COLORS_PALLETE = ["#01BEFE", "#FFDD00", "#FF7D00", "FF006D", "ADFF02", "8F00FF"]
# sns.set_palette(sns.color_palette(HAPPY_COLORS_PALLETE))

rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

In [5]:
pl.seed_everything(42)

Global seed set to 42


42

# Data

In [6]:
watch = pd.read_csv("Datasets\Price_chart_Tang\Rolex 126334\Combine_all.csv")
watch.head()

Unnamed: 0,Date,Price
0,2017-06-09,9575.0
1,2017-06-10,9575.0
2,2017-06-14,9575.0
3,2017-06-23,9575.0
4,2017-06-24,4791.0


In [7]:
gold = pd.read_csv("Datasets\Gold_price.csv")
gold['Gold price (USD)'] = gold['Gold price (USD)'].str.replace(',','')
# gold.head()

In [8]:
watch['Date'] = pd.to_datetime(watch['Date'])
watch['Price'] = pd.to_numeric(watch['Price'])
gold['Date'] = pd.to_datetime(gold['Date'])
gold['Gold price (USD)'] = pd.to_numeric(gold['Gold price (USD)'])
print(type(gold.iloc[0,1]))

<class 'numpy.float64'>


In [9]:
watch = watch.resample('W', on='Date', convention='s').mean()
watch['Date'] = watch.index
watch.reset_index(drop=True, inplace=True)

print(watch.shape)
watch.head()

(249, 2)


Unnamed: 0,Price,Date
0,9575.0,2017-06-11
1,9575.0,2017-06-18
2,7183.0,2017-06-25
3,6896.0,2017-07-02
4,6896.0,2017-07-09


In [10]:
watch["Prev_Price"] = watch.shift(1)["Price"]
watch.head()

Unnamed: 0,Price,Date,Prev_Price
0,9575.0,2017-06-11,
1,9575.0,2017-06-18,9575.0
2,7183.0,2017-06-25,9575.0
3,6896.0,2017-07-02,7183.0
4,6896.0,2017-07-09,6896.0


In [11]:
# tqdm.pandas()
watch["Price_Change"] = watch.progress_apply(
    lambda row: 0 if np.isnan(row["Prev_Price"]) else row["Price"] - row["Prev_Price"],
    axis = 1
)
watch.head()

  0%|          | 0/249 [00:00<?, ?it/s]

Unnamed: 0,Price,Date,Prev_Price,Price_Change
0,9575.0,2017-06-11,,0.0
1,9575.0,2017-06-18,9575.0,0.0
2,7183.0,2017-06-25,9575.0,-2392.0
3,6896.0,2017-07-02,7183.0,-287.0
4,6896.0,2017-07-09,6896.0,0.0


In [12]:
rows = []

for _, row in tqdm(watch.iterrows(), total=watch.shape[0]):
    row_data = dict(
        day_of_week = row.Date.dayofweek,
        day_of_month = row.Date.day,
        week_of_year = row.Date.week,
        month = row.Date.month,
        price_change = row.Price_Change,
        price = row.Price
    )
    rows.append(row_data)

features_df = pd.DataFrame(rows)

  0%|          | 0/249 [00:00<?, ?it/s]

In [13]:
print(features_df.shape)
features_df.head()

(249, 6)


Unnamed: 0,day_of_week,day_of_month,week_of_year,month,price_change,price
0,6,11,23,6,0.0,9575.0
1,6,18,24,6,0.0,9575.0
2,6,25,25,6,-2392.0,7183.0
3,6,2,26,7,-287.0,6896.0
4,6,9,27,7,0.0,6896.0


In [14]:
train_size = int(len(features_df) * 0.8)
print(train_size)

199


In [15]:
train_df, test_df = features_df[:train_size], features_df[train_size:]
print(train_df.shape, test_df.shape)

(199, 6) (50, 6)


In [16]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [17]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index=train_df.index,
    columns=train_df.columns
)

In [18]:
test_df = pd.DataFrame(
    scaler.transform(test_df),
    index=test_df.index,
    columns=test_df.columns
)

In [19]:
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,price_change,price
199,-1.0,-0.8,-0.538462,-0.454545,0.296609,0.990662
200,-1.0,-0.333333,-0.5,-0.454545,0.251698,0.954704
201,-1.0,0.133333,-0.461538,-0.454545,0.301076,0.948014
202,-1.0,0.6,-0.423077,-0.454545,0.322082,0.953775
203,-1.0,-0.933333,-0.384615,-0.272727,0.298333,0.945459


In [20]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):

    sequences = []
    data_size = len(input_data)

    for i in tqdm(range(data_size - sequence_length)):

        sequence = input_data[i:i+sequence_length]

        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        
        sequences.append((sequence, label))

    return sequences

In [21]:
SEQUENCE_LENGTH = 26

train_sequences = create_sequences(train_df, "price", SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "price", SEQUENCE_LENGTH)

  0%|          | 0/173 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

In [22]:
print(f"Sequence shape: {train_sequences[0][0].shape}")
print(f"Train length: {len(train_sequences)}")
print(f"Test length: {len(test_sequences)}")
print(f"Test length: {train_sequences[0][1]}")
train_sequences[0][0].head()

Sequence shape: (26, 6)
Train length: 173
Test length: 24
Test length: -0.9999999999999998


Unnamed: 0,day_of_week,day_of_month,week_of_year,month,price_change,price
0,-1.0,-0.333333,-0.153846,-0.090909,0.312363,0.139837
1,-1.0,0.133333,-0.115385,-0.090909,0.312363,0.139837
2,-1.0,0.6,-0.076923,-0.090909,-1.0,-0.638049
3,-1.0,-0.933333,-0.038462,0.090909,0.154901,-0.731382
4,-1.0,-0.466667,0.0,0.090909,0.312363,-0.731382


# Classes create

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
print("configured device:", device)

configured device: cpu


In [24]:
class TS_Dataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        
        sequence, label = self.sequences[idx]

        return dict(
            sequence = torch.Tensor(sequence.to_numpy()).to(device),
            label = torch.tensor(label).float().to(device)
        )

In [25]:
class PriceDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size=8):
        super().__init__
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self):
        self.train_dataset = TS_Dataset(self.train_sequences)
        self.test_dataset = TS_Dataset(self.test_sequences)
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=0
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0
        )
        
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0
        )

In [26]:
N_EPOCHS = 8
BATCH_SIZE = 8

data_module = PriceDataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)
data_module.setup()

In [27]:
train_dataset = TS_Dataset(train_sequences)

In [28]:
for item in train_dataset:
    print(item["sequence"].shape)
    print(item["label"].shape)
    print(item["label"])
    break

torch.Size([26, 6])
torch.Size([])
tensor(-1.)


In [29]:
class PricePredictionModel(nn.Module):

    def __init__(self, n_features, n_hidden=128, n_layers=2):
        super().__init__()

        self.n_hidden = n_hidden

        self.lstm = nn.LSTM(
            input_size = n_features,
            hidden_size =n_hidden,
            batch_first = True,
            num_layers = n_layers,
            dropout = 0.2
        )

        self.regressor = nn.Linear(n_hidden, 1)

    def forward(self, x):
        self.lstm.flatten_parameters()

        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]

        return self.regressor(out)

In [30]:
class PricePredictor(pl.LightningModule):

    def __init__(self, n_features: int):
        super().__init__()
        self.model = PricePredictionModel(n_features).to(device)
        self.criterion = nn.MSELoss().to(device)

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))

        return loss, output
    
    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.0001)

In [31]:
model = PricePredictor(n_features=train_df.shape[1])

In [32]:
for item in data_module.train_dataloader():
    print(item["sequence"].shape)
    print(item["label"].shape)
    break

torch.Size([8, 26, 6])
torch.Size([8])


In [33]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6006 (pid 17812), started 0:35:14 ago. (Use '!kill 17812' to kill it.)

In [44]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning_logs", name="watch-price")

early_stopping_callback = EarlyStopping('test_loss')

trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [48]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | PricePredictionModel | 201 K 
1 | criterion | MSELoss              | 0     
---------------------------------------------------
201 K     Trainable params
0         Non-trainable params
201 K     Total params
0.807     Total estimated model params size (MB)
Trainer was signaled to stop but required minimum epochs (1) or minimum steps (None) has not been met. Training will continue...


Epoch 0: 100%|██████████| 22/22 [00:00<00:00, 99.06it/s, loss=nan, v_num=2, test_loss=nan.0] 


In [49]:
trained_model = PricePredictor.load_from_checkpoint(
    "lightning_logs/watch-price/version_2/checkpoints/epoch=0-step=87.ckpt",
    n_features=train_df.shape[1]
)

In [50]:
trained_model.freeze()

In [51]:
test_dataset = TS_Dataset(test_sequences)

predictions = []
labels = []

for item in tqdm(test_dataset):
    sequence = item['sequence']
    label = item['label']

    _, output = trained_model(sequence.unsqueeze(dim=0))
    predictions.append(output.item())
    labels.append(label.item())

  0%|          | 0/24 [00:00<?, ?it/s]

In [39]:
len(predictions), len(test_df)-SEQUENCE_LENGTH

(24, 24)