<a href="https://colab.research.google.com/github/TDStriker/Projects-in-ML/blob/main/ML_Proj_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Potential data

https://www.kaggle.com/datasets/samuelcortinhas/time-series-practice-dataset

For this project I plan to build a model that predicts the quantity of a product sold given a date, store, and product id. Since this problem depends primarily on time information, an RNN is best suited to the task.

Dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import random
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [None]:
# Load the training and held-out CSV files from the working directory.
data, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [None]:
# Preview the raw training frame (pandas elides the middle rows).
print(data)

              Date  store  product  number_sold
0       2010-01-01      0        0          801
1       2010-01-02      0        0          810
2       2010-01-03      0        0          818
3       2010-01-04      0        0          796
4       2010-01-05      0        0          808
...            ...    ...      ...          ...
230085  2018-12-27      6        9          890
230086  2018-12-28      6        9          892
230087  2018-12-29      6        9          895
230088  2018-12-30      6        9          899
230089  2018-12-31      6        9          912

[230090 rows x 4 columns]


In [None]:
# Convert date strings into integer codes.
# A single encoder is fitted on the dates of BOTH frames so that a given
# calendar date maps to the same integer in train and test. Fitting each frame
# separately restarts the test codes at 0, which makes the test dates
# indistinguishable from the earliest training dates.
# LabelEncoder sorts its classes, and ISO "YYYY-MM-DD" strings sort
# chronologically, so the codes increase with time.
le = LabelEncoder()
le.fit(pd.concat([data['Date'], test['Date']], ignore_index=True))
data['Date'] = le.transform(data['Date'])
test['Date'] = le.transform(test['Date'])

In [None]:
# Confirm that 'Date' now holds integer codes instead of date strings.
print(data)

        Date  store  product  number_sold
0          0      0        0          801
1          1      0        0          810
2          2      0        0          818
3          3      0        0          796
4          4      0        0          808
...      ...    ...      ...          ...
230085  3282      6        9          890
230086  3283      6        9          892
230087  3284      6        9          895
230088  3285      6        9          899
230089  3286      6        9          912

[230090 rows x 4 columns]


In [None]:
# Pairwise correlation between all columns (pandas' default method is Pearson).
data.corr()

Unnamed: 0,Date,store,product,number_sold
Date,1.0,8.550476e-16,-5.980923e-16,0.009668
store,8.550476e-16,1.0,6.692764e-15,0.201994
product,-5.980923e-16,6.692764e-15,1.0,0.033124
number_sold,0.009667716,0.2019938,0.03312389,1.0


In [None]:
# Train-dev-test split: train.csv stays the training set, while test.csv is cut
# at its midpoint into a validation part (first rows) and a test part (rest).
half = len(test) // 2
valid, test = test.iloc[:half], test.iloc[half:]

In [None]:
# Feature/label split: "number_sold" is the regression target and every
# remaining column is an input feature.
train_feat, train_label = (
    data.drop(columns=["number_sold"]).to_numpy(),
    data["number_sold"].to_numpy(),
)
valid_feat, valid_label = (
    valid.drop(columns=["number_sold"]).to_numpy(),
    valid["number_sold"].to_numpy(),
)
test_feat, test_label = (
    test.drop(columns=["number_sold"]).to_numpy(),
    test["number_sold"].to_numpy(),
)

Task 2

2-1

The RNN is implemented using PyTorch, whose RNN layers use tanh as the default activation function. It uses the basic RNN cell structure: each cell receives both the current input and the previous hidden state, and uses them to produce an output and the hidden state that is passed on to the next cell.

The final loss output will be used to measure the effectiveness of the model.

In [None]:
# Hyperparameters

# Model shape
in_dim = 3                 # input features per row: Date, store, product
out_dim = 1                # single regression target: number_sold
layer_size = 50            # hidden units in each recurrent layer
num_hidden_layers = 2      # number of stacked recurrent layers

# Optimisation
learning_rate = 1e-3       # step size handed to the optimizer
batch_size = 64            # rows per slice produced by load_data
epochs = 10                # full passes over the training data

In [None]:
class RNNetwork(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear read-out applied to every time step.

    Args:
        in_dim: Number of input features per time step.
        out_dim: Width of the linear output produced for each time step.
        layer_size: Hidden-state width of every recurrent layer.
        num_hidden_layers: Number of stacked recurrent layers.
    """

    def __init__(self,in_dim,out_dim,layer_size,num_hidden_layers=1):
        super().__init__()
        self.in_dim = in_dim
        self.num_hidden_layers = num_hidden_layers
        # Not read anywhere in this class; kept so code that inspects the
        # attribute keeps working.
        self.num_layers = num_hidden_layers*2+1
        self.layer_size = layer_size

        self.rnn = nn.RNN(in_dim, layer_size, num_hidden_layers, batch_first=True)

        self.lin = nn.Linear(layer_size, out_dim)

    def forward(self, x):
        """Run the recurrent stack over ``x`` and map every step to ``out_dim``.

        Args:
            x: Either an unbatched sequence ``(seq_len, in_dim)`` or a batch
                ``(batch, seq_len, in_dim)`` (the layer is ``batch_first``).

        Returns:
            Tensor of shape ``(total_steps, out_dim)`` where all time steps
            (and batch entries, if any) are flattened into the first axis.
        """
        # Leave h_0 unspecified: PyTorch then starts from zeros with the shape,
        # dtype and device that match x. The previous hand-built 2-D state only
        # worked for unbatched CPU input and raised on 3-D batches.
        out, _ = self.rnn(x)

        # Flatten every time step into rows so the linear layer sees 2-D input.
        out = out.reshape(-1, self.layer_size)
        return self.lin(out)

In [None]:
def load_data(feats, labels, size=None):
    """Cut ``feats`` and ``labels`` into consecutive, aligned slices.

    Args:
        feats: Sliceable sequence of features (list, NumPy array, ...).
        labels: Sliceable sequence of targets aligned with ``feats``.
        size: Rows per slice. Defaults to the module-level ``batch_size``.

    Returns:
        A list of ``[feature_slice, label_slice, index]`` triples in order.
        The last slice is shorter when ``len(feats)`` is not a multiple of
        ``size``; an empty input yields an empty list.

    Raises:
        ValueError: If the slice size is smaller than 1.
    """
    if size is None:
        size = batch_size
    if size < 1:
        raise ValueError(f"slice size must be at least 1, got {size}")

    # Stepping by `size` visits every start offset exactly once. The earlier
    # `int(len / size + .999)` ceiling dropped the final slice whenever the
    # remainder was under 0.1% of a slice.
    return [
        [feats[start:start + size], labels[start:start + size], index]
        for index, start in enumerate(range(0, len(feats), size))
    ]

In [None]:
def train_loop(feats,labels, model, loss_fn, optimizer):
    """Run one training epoch over ``feats``/``labels`` in ``batch_size`` slices.

    Args:
        feats: NumPy array of input features, one row per sample.
        labels: NumPy array of targets aligned with ``feats``.
        model: Module called as ``model(X)`` on each float32 slice.
        loss_fn: Callable ``(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Prints the loss of every 100th slice together with the number of rows
    processed so far.
    """
    size = len(feats)

    model.train()
    for (X, y, batch) in load_data(feats,labels):
        # Compute prediction and loss
        X = torch.tensor(X.astype(np.float32))
        y = torch.tensor(y.astype(np.float32))

        pred = model(X)
        # The models in this file return (N, out_dim) while the labels arrive
        # as (N,). Without aligning the shapes, the loss broadcasts the pair to
        # (N, N) and compares every prediction with every target.
        if y.shape != pred.shape and y.numel() == pred.numel():
            y = y.reshape(pred.shape)
        loss = loss_fn(pred, y)

        # Backpropagation (clear old gradients first so none carry over)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            current = batch * batch_size + len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(feats,labels, model, loss_fn):
    """Evaluate ``model`` on ``feats``/``labels`` and print the average loss.

    Args:
        feats: NumPy array of input features, one row per sample.
        labels: NumPy array of targets aligned with ``feats``.
        model: Module called as ``model(X)`` on each float32 slice.
        loss_fn: Callable ``(prediction, target)`` returning a scalar tensor.

    The printed value is the mean of the per-slice losses, so a shorter final
    slice weighs as much as a full one. With no data the average is ``nan``.
    """
    model.eval()
    test_loss = 0.0
    num_batches = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for X, y, _ in load_data(feats,labels):
            X = torch.tensor(X.astype(np.float32))
            y = torch.tensor(y.astype(np.float32))

            pred = model(X)
            # Align (N,) targets with (N, out_dim) predictions; otherwise the
            # loss broadcasts them to (N, N) and reports a meaningless value.
            if y.shape != pred.shape and y.numel() == pred.numel():
                y = y.reshape(pred.shape)
            test_loss += loss_fn(pred, y).item()
            # Count the slices actually seen instead of re-deriving the number.
            num_batches += 1

    # Guard the empty case, which previously divided by zero.
    test_loss = test_loss / num_batches if num_batches else float("nan")
    print(f"Avg loss: {test_loss:>8f} \n")

In [None]:
# Record the best available backend. NOTE: `device` is only computed here;
# nothing in this cell moves the model or the data onto it.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Plain RNN built from the hyperparameters defined earlier.
model = RNNetwork(
    in_dim=in_dim,
    out_dim=out_dim,
    layer_size=layer_size,
    num_hidden_layers=num_hidden_layers,
)

# L1 (absolute-error) loss, optimised with Adam.
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the plain RNN: every epoch makes one pass over the training data and
# then prints the average loss on the validation split.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_feat,train_label, model, loss_fn, optimizer)
    test_loop(valid_feat,valid_label, model, loss_fn)
print("Done!")

In [None]:
# Final evaluation of the plain RNN on the held-out test split.
test_loop(test_feat,test_label, model, loss_fn)

2-2

LSTM

In [None]:
import torch
import torch.nn as nn

class LSTMNetwork(nn.Module):
    """Stacked ``nn.LSTM`` followed by a linear read-out applied to every time step.

    Args:
        in_dim: Number of input features per time step.
        out_dim: Width of the linear output produced for each time step.
        layer_size: Hidden-state width of every LSTM layer.
        num_hidden_layers: Number of stacked LSTM layers.
    """

    def __init__(self, in_dim, out_dim, layer_size, num_hidden_layers=1):
        super().__init__()
        self.in_dim = in_dim
        self.num_hidden_layers = num_hidden_layers
        # Not read anywhere in this class; kept so code that inspects the
        # attribute keeps working.
        self.num_layers = num_hidden_layers * 2 + 1
        self.layer_size = layer_size

        self.lstm = nn.LSTM(in_dim, layer_size, num_hidden_layers, batch_first=True)

        self.lin = nn.Linear(layer_size, out_dim)

    def forward(self, x):
        """Run the LSTM stack over ``x`` and map every step to ``out_dim``.

        Args:
            x: Either an unbatched sequence ``(seq_len, in_dim)`` or a batch
                ``(batch, seq_len, in_dim)`` (the layer is ``batch_first``).

        Returns:
            Tensor of shape ``(total_steps, out_dim)`` where all time steps
            (and batch entries, if any) are flattened into the first axis.
        """
        # Leave (h_0, c_0) unspecified: PyTorch then starts both from zeros
        # with the shape, dtype and device that match x. The previous
        # hand-built 2-D states only worked for unbatched CPU input.
        out, _ = self.lstm(x)

        # Flatten every time step into rows so the linear layer sees 2-D input.
        out = out.reshape(-1, self.layer_size)
        return self.lin(out)


In [None]:
# Build the LSTM variant with the same hyperparameters as the plain RNN.
lstm_model = LSTMNetwork(in_dim,out_dim,layer_size,num_hidden_layers)

# Initialize the loss function and a fresh optimizer bound to the LSTM's
# parameters (both names are rebound from the previous experiment).
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate)

# Each epoch: one pass over the training data, then the validation loss.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_feat,train_label, lstm_model, loss_fn, optimizer)
    test_loop(valid_feat,valid_label, lstm_model, loss_fn)
print("Done!")

In [None]:
# Final evaluation of the LSTM on the held-out test split.
test_loop(test_feat,test_label, lstm_model, loss_fn)

GRU

In [None]:
class GRUNetwork(nn.Module):
    """Stacked ``nn.GRU`` followed by a linear read-out applied to every time step.

    Args:
        in_dim: Number of input features per time step.
        out_dim: Width of the linear output produced for each time step.
        layer_size: Hidden-state width of every GRU layer.
        num_hidden_layers: Number of stacked GRU layers.
    """

    def __init__(self,in_dim,out_dim,layer_size,num_hidden_layers=1):
        super().__init__()
        self.in_dim = in_dim
        self.num_hidden_layers = num_hidden_layers
        # Not read anywhere in this class; kept so code that inspects the
        # attribute keeps working.
        self.num_layers = num_hidden_layers*2+1
        self.layer_size = layer_size

        self.gru = nn.GRU(in_dim, layer_size, num_hidden_layers, batch_first=True)

        self.lin = nn.Linear(layer_size, out_dim)

    def forward(self, x):
        """Run the GRU stack over ``x`` and map every step to ``out_dim``.

        Args:
            x: Either an unbatched sequence ``(seq_len, in_dim)`` or a batch
                ``(batch, seq_len, in_dim)`` (the layer is ``batch_first``).

        Returns:
            Tensor of shape ``(total_steps, out_dim)`` where all time steps
            (and batch entries, if any) are flattened into the first axis.
        """
        # Leave h_0 unspecified: PyTorch then starts from zeros with the shape,
        # dtype and device that match x. The previous hand-built 2-D state only
        # worked for unbatched CPU input and raised on 3-D batches.
        out, _ = self.gru(x)

        # Flatten every time step into rows so the linear layer sees 2-D input.
        out = out.reshape(-1, self.layer_size)
        return self.lin(out)

In [None]:
# Build the GRU variant with the same hyperparameters as the other two models.
gru_model = GRUNetwork(in_dim,out_dim,layer_size,num_hidden_layers)

# Initialize the loss function and a fresh optimizer bound to the GRU's
# parameters (both names are rebound from the previous experiment).
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=learning_rate)

# Each epoch: one pass over the training data, then the validation loss.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_feat,train_label, gru_model, loss_fn, optimizer)
    test_loop(valid_feat,valid_label, gru_model, loss_fn)
print("Done!")

In [None]:
# Final evaluation of the GRU on the held-out test split.
test_loop(test_feat,test_label, gru_model, loss_fn)

All three implementations reach similar final results with the data and model configuration used here. On average, the classic RNN seems to get the best results, which may be due to the simplicity of the dataset. The LSTM runs far quicker than the other two, which is likely because it is better at processing larger sets of data.

2-3: Yes, time series data can be converted into regular feature data by adding to each sample the values of the last x data points as extra input features (lag features), which can then be fed to a regular feed-forward network.

Task 3

For the dissimilarity score I took the Euclidean norm of the difference between the embedding vectors of the two words. Since this norm measures the distance between two vectors, it should work as a metric for determining how distant two words are.

In [None]:
import numpy as np
from gensim.models import KeyedVectors

# Load the pre-trained "word2vec-google-news-300" embeddings through gensim's
# downloader API. `word_vectors` is the lookup object used by the helper
# functions below (indexed by word and queried with `.similarity`).
import gensim.downloader as api
word_vectors = api.load("word2vec-google-news-300")

In [None]:
def get_cosine_similarity(word1, word2):
    """Return the similarity that ``word_vectors`` reports for the two words."""
    return word_vectors.similarity(word1, word2)

def get_dissimilarity_score(word1, word2):
    """Return the norm of the difference between the two words' vectors.

    Both words are looked up in ``word_vectors``; the score is
    ``np.linalg.norm`` of their element-wise difference, so a larger value
    means the embeddings are further apart.
    """
    difference = word_vectors[word1] - word_vectors[word2]
    return np.linalg.norm(difference)

In [None]:
# Read the two words to compare from the user.
word1 = input("Word 1: ")
word2 = input("Word 2: ")

# Compute both scores before printing so that a failed lookup produces no
# partial output.
cosine_similarity = get_cosine_similarity(word1, word2)
dissimilarity_score = get_dissimilarity_score(word1, word2)

print(f"Cosine Similarity between '{word1}' and '{word2}': {cosine_similarity}")
print(f"Dissimilarity Score between '{word1}' and '{word2}': {dissimilarity_score}")
