In [2]:
import torch
import numpy as np
import math
import torch.nn as nn
import pandas as pd
import re

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
svd_embeddings = torch.load("svd-word-vectors1.pt")
word_indices = torch.load("svd-word-vectors-indices1.pt")
svd_embeddings, word_indices

(array([[-2.43799097e-28,  5.09476327e-14, -2.13425680e-14, ...,
         -1.40884866e-27, -1.14366508e-28, -1.07118553e-17],
        [-1.74929039e-27,  1.92066895e-27, -3.57714299e-14, ...,
          3.17831752e-13, -2.62299787e-13,  7.95804729e-14],
        [-8.42817093e-21, -1.05777621e-19, -1.18901570e-32, ...,
          4.87978694e-21,  1.22386488e-21, -7.59481450e-22],
        ...,
        [ 2.94370661e-21,  5.45833022e-21,  1.52278608e-21, ...,
         -2.92611030e-22, -7.78838574e-23,  3.99039340e-23],
        [-2.89936386e-20, -5.05551754e-20, -1.54139272e-20, ...,
          2.83231611e-21,  7.55045559e-22, -3.92232633e-22],
        [ 4.61246621e-21,  8.15461565e-21,  2.38549478e-21, ...,
         -4.52367808e-22, -1.14266903e-22,  6.40721615e-23]]),
 {'Reuters': 0,
  '-': 1,
  'Short': 2,
  'sellers': 3,
  ',': 4,
  'Wall': 5,
  "Street's": 6,
  'dwindling': 7,
  'band': 8,
  'of': 9,
  'ultra': 10,
  'cynics': 11,
  'are': 12,
  'seeing': 13,
  'green': 14,
  'again': 15,
 

In [10]:
class LSTM(nn.Module):
    def __init__(self, hidden_dim, output_dim, embedding_dim = 50, num_layers = 1):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        
        # Initialize hidden state
        output, (hn, cn) = self.lstm(x, (h0, c0))
        
        # Take the output from the last time step

        output_last = output[:, -1]
        # Pass it through the fully connected layer
        output_fc = self.fc(output_last)
        
        return output_fc

        # output, (_, _) = self.lstm(x)
        # return self.fc(output)

In [6]:
oov_embeddings = torch.randn(50, dtype = torch.float32)

In [7]:
data_train = pd.read_csv('train.csv')
length = []
for i in range(len(data_train['Description'])):
    sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][i])
    length.append(len(sentence))
length.sort()
max_len = length[int(0.95*len(length))]
max_len

59

In [8]:
data = pd.read_csv('test.csv')
data

Unnamed: 0,Class Index,Description
0,3,Unions representing workers at Turner Newall...
1,4,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,AP - A company founded by a chemistry research...
3,4,AP - It's barely dawn when Mike Fitzpatrick st...
4,4,AP - Southern California's smog-fighting agenc...
...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...
7596,2,With the supply of attractive pitching options...
7597,2,Like Roger Clemens did almost exactly eight ye...
7598,3,SINGAPORE : Doctors in the United States have ...


In [11]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [14]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-batch_size*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.34581666666666666	Loss = 0.013166915625333786
Epoch = 1	Training Set Accuracy = 0.5025833333333334	Loss = 0.011420639231801033
Epoch = 2	Training Set Accuracy = 0.5476083333333334	Loss = 0.010677801445126534
Epoch = 3	Training Set Accuracy = 0.5728333333333333	Loss = 0.010225415229797363
Epoch = 4	Training Set Accuracy = 0.5912916666666667	Loss = 0.009859870187938213
Epoch = 5	Training Set Accuracy = 0.605925	Loss = 0.009560181759297848
Epoch = 6	Training Set Accuracy = 0.6214083333333333	Loss = 0.009255439043045044
Epoch = 7	Training Set Accuracy = 0.6335083333333333	Loss = 0.009005768224596977
Epoch = 8	Training Set Accuracy = 0.643925	Loss = 0.008801501244306564
Epoch = 9	Training Set Accuracy = 0.6517166666666667	Loss = 0.008624986745417118


In [15]:
batch_size = 100
train_loss = 0
correct = 0
model.eval()
for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                batch_embeddings[k-batch_size*i][j] = oov_embeddings
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.Tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)]).to(torch.int64)
    loss = criterion(y_pred, y_true-1)
    train_loss += loss

    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {train_loss/len(data['Description'])}")

Testing Set Accuracy = 0.5853947368421053	Loss = 0.010319176129996777


In [15]:
from sklearn.metrics import precision_score, f1_score, recall_score, confusion_matrix

In [13]:
torch.save(model, "svd-classification-model.pt")

In [None]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(100*i, 100*i+100):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[100*i:100*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.39574166666666666	Loss = 0.012476691044867039
Epoch = 1	Training Set Accuracy = 1.0512333333333332	Loss = 0.02108396217226982
Epoch = 2	Training Set Accuracy = 1.7664	Loss = 0.02847791463136673
Epoch = 3	Training Set Accuracy = 2.5099	Loss = 0.035228148102760315
Epoch = 4	Training Set Accuracy = 3.27145	Loss = 0.041528452187776566
Epoch = 5	Training Set Accuracy = 4.048466666666667	Loss = 0.047477059066295624
Epoch = 6	Training Set Accuracy = 4.839433333333333	Loss = 0.053097303956747055
Epoch = 7	Training Set Accuracy = 5.640633333333334	Loss = 0.0584648959338665
Epoch = 8	Training Set Accuracy = 6.449108333333333	Loss = 0.06365052610635757
Epoch = 9	Training Set Accuracy = 7.265825	Loss = 0.06865249574184418


In [17]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0 17  6]
 [ 0  0 15  2]
 [ 0  0 14  3]
 [ 0  0 32 11]]
Confusion Matrix for Batch = 1 is [[ 0  0 29]
 [ 0  0 12]
 [ 0  0 59]]
Confusion Matrix for Batch = 2 is [[ 0  0  0 18]
 [ 0  0  0  9]
 [ 0  0  0  8]
 [ 0  0  0 65]]
Confusion Matrix for Batch = 3 is [[ 0  0  0 16]
 [ 0  0  0  7]
 [ 0  0  0  7]
 [ 0  0  0 70]]
Confusion Matrix for Batch = 4 is [[ 8  6  0  7]
 [ 0 31  0  6]
 [ 0  3  0  1]
 [ 0  4  0 34]]
Confusion Matrix for Batch = 5 is [[45  4  1  3]
 [ 9 11  1  1]
 [ 4  1  2  0]
 [ 6  1  2  9]]
Confusion Matrix for Batch = 6 is [[33  4  5  1]
 [ 2 35  2  0]
 [ 0  0  7  0]
 [ 6  2  1  2]]
Confusion Matrix for Batch = 7 is [[25  2  7  3]
 [ 5 20  2  2]
 [ 7  2  6  1]
 [ 2  1  4 11]]
Confusion Matrix for Batch = 8 is [[18  2  4  3]
 [ 1  5  1  1]
 [ 2  0 27  4]
 [ 5  1  3 23]]
Confusion Matrix for Batch = 9 is [[25  0  1  9]
 [ 3  3  0  5]
 [ 2  1 15  3]
 [ 4  3  5 21]]
Confusion Matrix for Batch = 10 is [[23  4  2  5]
 [ 5 14  3  7]
 [ 1  1 1

In [19]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
            else:
                batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1111  270  292  321]
 [ 299 1335  202  261]
 [ 187   97  991  306]
 [ 303  198  415 1012]]
Testing Set Accuracy = 0.5853947368421053	Recall = 0.5853947368421053	Precision = 0.5909096952908587	F1-Score = 0.5866057942402126	Loss = 0.0001357786386506632


# Context Window = 1 Ends

In [20]:
svd_embeddings = torch.load("svd-word-vectors2.pt")
word_indices = torch.load("svd-word-vectors-indices2.pt")
svd_embeddings, word_indices

(array([[-5.37797350e-14, -2.01467669e-13, -2.08562156e-13, ...,
         -1.80462108e-25,  4.79097458e-14, -1.49890269e-14],
        [ 7.85153626e-28, -2.70133484e-14, -2.53050183e-14, ...,
          6.69037441e-14,  6.01993382e-13,  4.17369076e-13],
        [ 3.21973660e-18,  4.81106755e-18,  7.23425447e-31, ...,
          1.00389790e-19,  1.93548197e-20, -9.27567693e-20],
        ...,
        [-4.88779219e-20, -3.50257301e-20,  1.67269785e-20, ...,
         -1.07853147e-21, -1.95833910e-22,  9.58710434e-22],
        [ 4.74768916e-20,  3.39468553e-20, -1.62689718e-20, ...,
          1.06544429e-21,  1.79591432e-22, -9.54902989e-22],
        [ 2.49496885e-20,  1.81164416e-20, -8.65156902e-21, ...,
          5.43587200e-22,  9.80088934e-23, -4.76045461e-22]]),
 {'Reuters': 0,
  '-': 1,
  'Short': 2,
  'sellers': 3,
  ',': 4,
  'Wall': 5,
  "Street's": 6,
  'dwindling': 7,
  'band': 8,
  'of': 9,
  'ultra': 10,
  'cynics': 11,
  'are': 12,
  'seeing': 13,
  'green': 14,
  'again': 15,
 

In [22]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [23]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-batch_size*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.3389333333333333	Loss = 0.013237746432423592
Epoch = 1	Training Set Accuracy = 0.5005666666666667	Loss = 0.011424871161580086
Epoch = 2	Training Set Accuracy = 0.54575	Loss = 0.010722115635871887
Epoch = 3	Training Set Accuracy = 0.5711416666666667	Loss = 0.010226264595985413
Epoch = 4	Training Set Accuracy = 0.5905166666666667	Loss = 0.009858565405011177
Epoch = 5	Training Set Accuracy = 0.6057166666666667	Loss = 0.009568077512085438
Epoch = 6	Training Set Accuracy = 0.6154166666666666	Loss = 0.009343933314085007
Epoch = 7	Training Set Accuracy = 0.6267416666666666	Loss = 0.009124214760959148
Epoch = 8	Training Set Accuracy = 0.6380333333333333	Loss = 0.00887986458837986
Epoch = 9	Training Set Accuracy = 0.6489583333333333	Loss = 0.008680996485054493


In [24]:
batch_size = 100
train_loss = 0
correct = 0
model.eval()
for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                batch_embeddings[k-batch_size*i][j] = oov_embeddings
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.Tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)]).to(torch.int64)
    loss = criterion(y_pred, y_true-1)
    train_loss += loss

    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {train_loss/len(data['Description'])}")

Testing Set Accuracy = 0.5727631578947369	Loss = 0.010339873842895031


In [25]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0 12  8]
 [ 0  0 10  1]
 [ 0  0 18  3]
 [ 0  0 38 10]]
Confusion Matrix for Batch = 1 is [[ 0  0  0 23]
 [ 0  0  0 11]
 [ 0  0  0  2]
 [ 0  0  0 64]]
Confusion Matrix for Batch = 2 is [[ 0  0  0 14]
 [ 0  0  0  8]
 [ 0  0  0  6]
 [ 0  0  0 72]]
Confusion Matrix for Batch = 3 is [[ 0  0  0 14]
 [ 0  0  0 12]
 [ 0  0  0  6]
 [ 0  0  0 68]]
Confusion Matrix for Batch = 4 is [[ 8  3  0  5]
 [ 0 36  0  5]
 [ 0  2  0  7]
 [ 0  3  0 31]]
Confusion Matrix for Batch = 5 is [[49  3  0  4]
 [10 11  0  0]
 [ 1  1  2  1]
 [ 4  2  4  8]]
Confusion Matrix for Batch = 6 is [[32  4  2  1]
 [ 3 35  3  1]
 [ 0  0  9  0]
 [ 6  2  1  1]]
Confusion Matrix for Batch = 7 is [[27  3  3  1]
 [ 6 21  1  1]
 [ 4  1  9  3]
 [ 2  0  6 12]]
Confusion Matrix for Batch = 8 is [[15  3  3  3]
 [ 3  5  0  3]
 [ 4  0 29  5]
 [ 4  0  3 20]]
Confusion Matrix for Batch = 9 is [[24  0  2  8]
 [ 3  5  1  7]
 [ 1  0 12  2]
 [ 6  2  6 21]]
Confusion Matrix for Batch = 10 is [[21  3  4  3]


In [26]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
            else:
                batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1044  246  279  281]
 [ 337 1336  215  283]
 [ 191  102  945  308]
 [ 328  216  461 1028]]
Testing Set Accuracy = 0.5727631578947369	Recall = 0.5727631578947369	Precision = 0.5805228531855956	F1-Score = 0.5744343303162504	Loss = 0.00013605097774416208


# Context Window = 2 Ends

In [4]:
svd_embeddings = torch.load("svd-word-vectors3.pt")
word_indices = torch.load("svd-word-vectors-indices3.pt")
svd_embeddings, word_indices

(array([[-9.01066197e-15, -1.71121577e-13, -8.82430860e-14, ...,
          1.57304066e-13, -4.09256349e-14, -2.39124909e-14],
        [ 5.80255324e-28,  8.16976304e-15,  2.93527932e-14, ...,
         -9.44907305e-13, -4.64221078e-13,  5.41019825e-15],
        [-4.15031838e-18, -5.47289015e-18, -2.32481956e-30, ...,
          2.13290499e-19,  2.33853486e-20, -1.10237769e-19],
        ...,
        [-8.05791672e-20, -1.38983849e-19,  7.40694056e-21, ...,
          5.20207690e-21,  6.06631818e-22, -2.64448132e-21],
        [-1.03028247e-19, -1.78149804e-19,  8.99622451e-21, ...,
          7.13478398e-21,  7.89125644e-22, -3.70537474e-21],
        [ 3.53817378e-20,  6.15234070e-20, -2.96573923e-21, ...,
         -2.57094333e-21, -2.84597033e-22,  1.34442118e-21]]),
 {'Reuters': 0,
  '-': 1,
  'Short': 2,
  'sellers': 3,
  ',': 4,
  'Wall': 5,
  "Street's": 6,
  'dwindling': 7,
  'band': 8,
  'of': 9,
  'ultra': 10,
  'cynics': 11,
  'are': 12,
  'seeing': 13,
  'green': 14,
  'again': 15,
 

In [11]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [12]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-batch_size*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.28546666666666665	Loss = 0.013668730854988098
Epoch = 1	Training Set Accuracy = 0.428575	Loss = 0.012278773821890354
Epoch = 2	Training Set Accuracy = 0.496325	Loss = 0.011446230113506317
Epoch = 3	Training Set Accuracy = 0.5346833333333333	Loss = 0.010862610302865505
Epoch = 4	Training Set Accuracy = 0.5588916666666667	Loss = 0.010430552996695042
Epoch = 5	Training Set Accuracy = 0.5763583333333333	Loss = 0.010123650543391705
Epoch = 6	Training Set Accuracy = 0.5918583333333334	Loss = 0.00981556810438633
Epoch = 7	Training Set Accuracy = 0.60475	Loss = 0.009563560597598553
Epoch = 8	Training Set Accuracy = 0.618425	Loss = 0.0093050142750144
Epoch = 9	Training Set Accuracy = 0.6290916666666667	Loss = 0.009086902253329754


In [13]:
batch_size = 100
train_loss = 0
correct = 0
model.eval()
for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                batch_embeddings[k-batch_size*i][j] = oov_embeddings
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.Tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)]).to(torch.int64)
    loss = criterion(y_pred, y_true-1)
    train_loss += loss

    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {train_loss/len(data['Description'])}")

Testing Set Accuracy = 0.5621052631578948	Loss = 0.010653287172317505


In [16]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0 15  7]
 [ 0  0 15  1]
 [ 0  0 17  4]
 [ 0  0 31 10]]
Confusion Matrix for Batch = 1 is [[ 0  0  0 26]
 [ 0  0  0 11]
 [ 0  0  0  7]
 [ 0  0  0 56]]
Confusion Matrix for Batch = 2 is [[ 0  0  0 17]
 [ 0  0  0 12]
 [ 0  0  0  9]
 [ 0  0  0 62]]
Confusion Matrix for Batch = 3 is [[ 0  0  0 11]
 [ 0  0  0  7]
 [ 0  0  0 14]
 [ 0  0  0 68]]
Confusion Matrix for Batch = 4 is [[ 7  3  0  5]
 [ 0 37  0  5]
 [ 0  0  0  7]
 [ 1  4  0 31]]
Confusion Matrix for Batch = 5 is [[44  5  2  4]
 [ 5  9  0  0]
 [ 6  1  4  1]
 [ 9  2  0  8]]
Confusion Matrix for Batch = 6 is [[26  5  4  1]
 [ 2 33  2  0]
 [ 4  0  7  1]
 [ 9  3  2  1]]
Confusion Matrix for Batch = 7 is [[30  3  7  0]
 [ 3 19  1  0]
 [ 4  1  6  4]
 [ 2  2  5 13]]
Confusion Matrix for Batch = 8 is [[20  3  4  3]
 [ 3  5  1  0]
 [ 2  0 28  3]
 [ 1  0  2 25]]
Confusion Matrix for Batch = 9 is [[24  0  2  8]
 [ 4  4  1  4]
 [ 2  0 14  5]
 [ 4  3  4 21]]
Confusion Matrix for Batch = 10 is [[22  7  2  3]


In [17]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
            else:
                batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1010  256  281  286]
 [ 316 1302  200  247]
 [ 244  107  939  346]
 [ 330  235  480 1021]]
Testing Set Accuracy = 0.5621052631578948	Recall = 0.5621052631578948	Precision = 0.5668663434903047	F1-Score = 0.5632476957910101	Loss = 0.00014017483044881374
