In [2]:
import torch
import numpy as np
import math
import torch.nn as nn
import pandas as pd
import re

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
skip_gram_embeddings = torch.load("skip-gram-word-vectors1.pt")
skip_gram_embeddings

{'Reuters': array([-0.485021  , -0.22831962,  0.37018558,  0.6738861 ,  0.02729357,
        -0.24505594,  0.08561967, -0.28748474,  0.2091244 , -0.6306353 ,
        -0.775046  ,  0.5930816 ,  0.23167622,  1.0884048 ,  0.1531802 ,
        -1.0949636 ,  0.7206871 ,  0.31305924, -0.16668054,  0.33939946,
         0.27809626, -0.15041062, -0.32272533, -0.13966677,  0.07866573,
        -0.275185  , -0.45511195, -0.42056105,  0.77432925, -0.03655996,
        -0.11016252,  0.21615253, -0.28735617, -0.01562322,  0.05912277,
        -0.47739983, -0.59000796,  0.16255237, -0.51498735,  0.20325118,
        -0.11311341,  0.35369092,  0.03094628,  0.01079833, -0.08045641,
        -0.76955664, -0.537272  , -0.05266323,  0.18675391,  0.14749959],
       dtype=float32),
 '-': array([ 0.06046658,  0.11317125,  0.09143232, -0.05367319,  0.02383963,
         0.06754354, -0.111431  ,  0.02517351,  0.1519315 ,  0.05162168,
         0.14240478,  0.0061157 , -0.09423324,  0.10989027, -0.02062796,
        -0.

In [12]:
class LSTM(nn.Module):
    def __init__(self, hidden_dim, output_dim, embedding_dim = 50, num_layers = 1):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        
        # Initialize hidden state
        output, (hn, cn) = self.lstm(x, (h0, c0))
        
        # Take the output from the last time step

        output_last = output[:, -1]
        # Pass it through the fully connected layer
        output_fc = self.fc(output_last)
        
        return output_fc

        # output, (_, _) = self.lstm(x)
        # return self.fc(output)

In [7]:
skip_gram_embeddings['OOV'] = torch.randn(50, dtype = torch.float32)

In [8]:
data_train = pd.read_csv('train.csv')
length = []
for i in range(len(data_train['Description'])):
    sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][i])
    length.append(len(sentence))
length.sort()
max_len = length[int(0.95*len(length))]
max_len

59

In [9]:
data = pd.read_csv('test.csv')
data

Unnamed: 0,Class Index,Description
0,3,Unions representing workers at Turner Newall...
1,4,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,AP - A company founded by a chemistry research...
3,4,AP - It's barely dawn when Mike Fitzpatrick st...
4,4,AP - Southern California's smog-fighting agenc...
...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...
7596,2,With the supply of attractive pitching options...
7597,2,Like Roger Clemens did almost exactly eight ye...
7598,3,SINGAPORE : Doctors in the United States have ...


In [13]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [14]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.24463333333333334	Loss = 0.01386810652911663
Epoch = 1	Training Set Accuracy = 0.25096666666666667	Loss = 0.013823936693370342
Epoch = 2	Training Set Accuracy = 0.299725	Loss = 0.013534355908632278
Epoch = 3	Training Set Accuracy = 0.4957	Loss = 0.011000053957104683
Epoch = 4	Training Set Accuracy = 0.7821	Loss = 0.005929304286837578
Epoch = 5	Training Set Accuracy = 0.8516666666666667	Loss = 0.004233403597027063
Epoch = 6	Training Set Accuracy = 0.8811083333333334	Loss = 0.0034833261743187904
Epoch = 7	Training Set Accuracy = 0.9015666666666666	Loss = 0.00294856377877295
Epoch = 8	Training Set Accuracy = 0.9170166666666667	Loss = 0.002536778338253498
Epoch = 9	Training Set Accuracy = 0.928975	Loss = 0.0021996768191456795


In [16]:
batch_size = 100
model.eval()
correct = 0
test_loss = 0

for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {test_loss/len(data['Description'])}")

Testing Set Accuracy = 0.8598684210526316	Loss = 0.004514516331255436


In [19]:
from sklearn.metrics import precision_score, f1_score, recall_score, confusion_matrix

In [14]:
torch.save(model, "skip-gram-classification-model.pt")

In [20]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0  5  3]
 [ 0  0  0  1]
 [ 0  0 61  1]
 [ 0  0 12 17]]
Confusion Matrix for Batch = 1 is [[ 0  0 14]
 [ 0  0  1]
 [ 0  0 85]]
Confusion Matrix for Batch = 2 is [[ 0  0  0  7]
 [ 0  0  0  4]
 [ 0  0  0  5]
 [ 0  0  0 84]]
Confusion Matrix for Batch = 3 is [[ 0  0  0  4]
 [ 0  0  0  5]
 [ 0  0  0  9]
 [ 0  0  0 82]]
Confusion Matrix for Batch = 4 is [[ 8  1  0  1]
 [ 0 42  0  1]
 [ 0  0  0  4]
 [ 0  1  0 42]]
Confusion Matrix for Batch = 5 is [[52  1  1  1]
 [ 8 16  0  1]
 [ 4  0  5  1]
 [ 0  0  0 10]]
Confusion Matrix for Batch = 6 is [[31  0  1  0]
 [ 6 41  0  0]
 [ 2  0 14  0]
 [ 2  0  0  3]]
Confusion Matrix for Batch = 7 is [[36  0  2  1]
 [ 1 25  0  0]
 [ 2  0 13  3]
 [ 0  0  4 13]]
Confusion Matrix for Batch = 8 is [[24  0  0  0]
 [ 1  8  0  1]
 [ 1  0 34  1]
 [ 0  0  1 29]]
Confusion Matrix for Batch = 9 is [[28  0  1  3]
 [ 4  7  1  1]
 [ 2  0 18  3]
 [ 0  0  1 31]]
Confusion Matrix for Batch = 10 is [[27  1  1  3]
 [ 3 19  0  1]
 [ 3  0 1

In [21]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1656   53   90  130]
 [  84 1808   46   65]
 [ 106   24 1633  267]
 [  54   15  131 1438]]
Testing Set Accuracy = 0.8598684210526316	Recall = 0.8598684210526316	Precision = 0.8647009695290859	F1-Score = 0.8608927130970534	Loss = 5.9401521866675466e-05


# Context Window = 1 Ends

In [23]:
skip_gram_embeddings = torch.load("skip-gram-word-vectors2.pt")
skip_gram_embeddings

{'Reuters': array([ 0.7636274 , -0.01760704,  0.10106981,  1.1672716 , -0.12675798,
        -0.23597664,  0.02369807, -0.4138495 ,  0.5162099 ,  0.14314738,
        -0.06102663, -0.45349666, -0.44845033,  0.26092783,  0.19941507,
         0.10213922,  0.18474385,  0.96512777,  0.3281345 , -0.29805392,
         0.22238177, -0.05911426, -0.08121579, -0.32284373, -0.06514688,
         0.03500207, -0.37231332, -0.2370133 , -0.56966174, -0.05508831,
         0.449918  , -0.02793967, -0.16277455, -0.29195547,  1.0260798 ,
         0.72269547,  0.36685672, -0.1694271 , -0.16824317, -0.17533356,
        -0.02780384,  0.00672828, -0.48418033, -0.8101055 , -0.74108505,
         0.09441836,  0.23502359, -0.64683944, -0.07770781, -0.23833114],
       dtype=float32),
 '-': array([-0.02509637, -0.28923526, -0.00323538,  0.32122934, -0.12708332,
        -0.10712501,  0.0173927 , -0.24429137,  0.03293813, -0.09965374,
         0.04424008, -0.2287708 ,  0.11736935,  0.2804739 ,  0.42755574,
        -0.

In [24]:
skip_gram_embeddings['OOV'] = torch.randn(50, dtype = torch.float32)

In [25]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [26]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.245625	Loss = 0.013868843205273151
Epoch = 1	Training Set Accuracy = 0.2529666666666667	Loss = 0.013833113946020603
Epoch = 2	Training Set Accuracy = 0.30345833333333333	Loss = 0.013598411343991756
Epoch = 3	Training Set Accuracy = 0.575175	Loss = 0.009768212214112282
Epoch = 4	Training Set Accuracy = 0.81475	Loss = 0.0050962818786501884
Epoch = 5	Training Set Accuracy = 0.8668833333333333	Loss = 0.003797404235228896
Epoch = 6	Training Set Accuracy = 0.8927333333333334	Loss = 0.0031310743652284145
Epoch = 7	Training Set Accuracy = 0.9114333333333333	Loss = 0.0026386405806988478
Epoch = 8	Training Set Accuracy = 0.9247083333333334	Loss = 0.00226288172416389
Epoch = 9	Training Set Accuracy = 0.9377833333333333	Loss = 0.0019142128294333816


In [27]:
batch_size = 100
model.eval()
correct = 0
test_loss = 0

for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {test_loss/len(data['Description'])}")

Testing Set Accuracy = 0.8568421052631578	Loss = 0.005296119023114443


In [28]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0  5  0]
 [ 0  0  0  1]
 [ 0  0 65  0]
 [ 0  0  8 21]]
Confusion Matrix for Batch = 1 is [[ 0  0  0 15]
 [ 0  0  0  3]
 [ 0  0  0  2]
 [ 0  0  0 80]]
Confusion Matrix for Batch = 2 is [[ 0  0  0  5]
 [ 0  0  0  9]
 [ 0  0  0  5]
 [ 0  0  0 81]]
Confusion Matrix for Batch = 3 is [[ 0  0  0  7]
 [ 0  0  0  6]
 [ 0  0  0  8]
 [ 0  0  0 79]]
Confusion Matrix for Batch = 4 is [[ 8  1  0  2]
 [ 0 43  0  4]
 [ 0  0  0  3]
 [ 0  0  0 39]]
Confusion Matrix for Batch = 5 is [[53  1  0  0]
 [ 7 16  0  2]
 [ 4  0  6  0]
 [ 0  0  0 11]]
Confusion Matrix for Batch = 6 is [[29  0  0  0]
 [ 8 41  0  1]
 [ 3  0 15  0]
 [ 1  0  0  2]]
Confusion Matrix for Batch = 7 is [[35  0  1  0]
 [ 1 25  1  2]
 [ 3  0 14  4]
 [ 0  0  3 11]]
Confusion Matrix for Batch = 8 is [[23  0  0  1]
 [ 2  8  0  1]
 [ 1  0 35  0]
 [ 0  0  0 29]]
Confusion Matrix for Batch = 9 is [[28  0  0  4]
 [ 4  7  0  2]
 [ 2  0 18  3]
 [ 0  0  3 29]]
Confusion Matrix for Batch = 10 is [[29  0  0  1]


In [29]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1640   33   89  127]
 [ 103 1822   59   88]
 [ 108   36 1636  271]
 [  49    9  116 1414]]
Testing Set Accuracy = 0.8568421052631578	Recall = 0.8568421052631578	Precision = 0.8638512465373962	F1-Score = 0.8581816337286763	Loss = 6.968576781218871e-05


# Context Window = 2 Ends

In [30]:
skip_gram_embeddings = torch.load("skip-gram-word-vectors3.pt")
skip_gram_embeddings

{'Reuters': array([ 0.12625949, -0.20541026,  0.06544121, -0.06943685, -0.26494062,
        -0.27261248,  0.21818571, -0.03563495, -0.35193318, -0.11749578,
        -0.01266543, -0.05049492, -0.25902155, -0.21637039, -0.22595885,
        -0.39755803, -0.4934507 , -0.18278563, -0.15559642,  0.00743056,
        -0.1800574 , -0.05788817, -0.04657734,  0.15675008, -0.33874685,
        -0.09521841, -0.42909387,  0.567192  , -0.08897841, -0.20097724,
         0.41537738, -0.3775598 , -0.48451504, -0.26029035,  0.13610536,
        -0.27856705,  0.44605678, -0.29937077,  0.1553495 ,  0.48205084,
         0.00084167,  0.05268196, -0.10904583,  0.58368266, -0.06759404,
         0.16011265,  0.01210472, -0.31068686, -0.35882956,  0.21053262],
       dtype=float32),
 '-': array([ 0.26248774, -0.17456482,  0.11316085, -0.03484554, -0.18866777,
        -0.19673875,  0.23122594, -0.08101395,  0.03163633, -0.24538067,
        -0.03245716, -0.12655213, -0.07073623, -0.31168276, -0.19963509,
        -0.

In [31]:
skip_gram_embeddings['OOV'] = torch.randn(50, dtype = torch.float32)

In [32]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(50, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [33]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.2434	Loss = 0.013869629241526127
Epoch = 1	Training Set Accuracy = 0.2528	Loss = 0.013832208700478077
Epoch = 2	Training Set Accuracy = 0.26056666666666667	Loss = 0.013784845359623432
Epoch = 3	Training Set Accuracy = 0.2645416666666667	Loss = 0.013666863553225994
Epoch = 4	Training Set Accuracy = 0.27468333333333333	Loss = 0.013592998497188091
Epoch = 5	Training Set Accuracy = 0.4104083333333333	Loss = 0.012112616561353207
Epoch = 6	Training Set Accuracy = 0.6701166666666667	Loss = 0.007833418436348438
Epoch = 7	Training Set Accuracy = 0.8117833333333333	Loss = 0.005126394331455231
Epoch = 8	Training Set Accuracy = 0.8618916666666666	Loss = 0.003965793177485466
Epoch = 9	Training Set Accuracy = 0.8882333333333333	Loss = 0.003299793228507042


In [34]:
batch_size = 100
model.eval()
correct = 0
test_loss = 0

for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {test_loss/len(data['Description'])}")

Testing Set Accuracy = 0.8581578947368421	Loss = 0.004256037995219231


In [35]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in skip_gram_embeddings):
                    batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0  8  2]
 [ 0  0  2  1]
 [ 0  0 36  0]
 [ 0  0 32 19]]
Confusion Matrix for Batch = 1 is [[ 0  0  0 12]
 [ 0  0  0  1]
 [ 0  0  0  1]
 [ 0  0  0 86]]
Confusion Matrix for Batch = 2 is [[ 0  0  0  3]
 [ 0  0  0  3]
 [ 0  0  0  3]
 [ 0  0  0 91]]
Confusion Matrix for Batch = 3 is [[ 0  0  0  5]
 [ 0  0  0  3]
 [ 0  0  0  3]
 [ 0  0  0 89]]
Confusion Matrix for Batch = 4 is [[ 8  1  0  1]
 [ 0 41  0  0]
 [ 0  0  0  1]
 [ 0  2  0 46]]
Confusion Matrix for Batch = 5 is [[49  1  2  1]
 [ 6 16  0  1]
 [ 4  0  4  0]
 [ 5  0  0 11]]
Confusion Matrix for Batch = 6 is [[32  0  1  1]
 [ 6 40  0  0]
 [ 1  1 13  0]
 [ 2  0  1  2]]
Confusion Matrix for Batch = 7 is [[35  0  3  0]
 [ 1 25  1  1]
 [ 3  0 10  2]
 [ 0  0  5 14]]
Confusion Matrix for Batch = 8 is [[22  0  1  0]
 [ 3  8  0  1]
 [ 1  0 33  0]
 [ 0  0  1 30]]
Confusion Matrix for Batch = 9 is [[28  0  0  2]
 [ 2  7  0  0]
 [ 3  0 13  3]
 [ 1  0  8 33]]
Confusion Matrix for Batch = 10 is [[27  0  2  1]


In [36]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 50, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in skip_gram_embeddings):
                batch_embeddings[k-batch_size*i][j] = skip_gram_embeddings['OOV']
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(skip_gram_embeddings[word])

        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = skip_gram_embeddings['OOV']
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1621   36   99   80]
 [  83 1791   34   61]
 [  76   13 1475  124]
 [ 120   60  292 1635]]
Testing Set Accuracy = 0.8581578947368421	Recall = 0.8581578947368421	Precision = 0.861314404432133	F1-Score = 0.858353144400639	Loss = 5.600050644716248e-05


# Context Window = 3 Ends