In [2]:
import torch
import numpy as np
import math
import torch.nn as nn
import pandas as pd
import re

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
svd_embeddings = torch.load("svd-word-vectors.pt")
word_indices = torch.load("svd-word-vectors-indices.pt")
svd_embeddings, word_indices

(array([[-3.06209059e-28,  1.55862194e-15,  1.48228265e-15, ...,
         -8.88286768e-27, -3.79311871e-28, -1.07572364e-26],
        [-2.33501757e-26, -6.09764161e-15, -2.05589291e-14, ...,
         -5.81644059e-13, -4.96247181e-13,  5.01109433e-13],
        [ 7.76901681e-17, -3.09781264e-17,  3.89347301e-28, ...,
         -2.50356360e-19, -2.11879916e-19,  9.58930671e-20],
        ...,
        [-1.00340075e-18,  3.47433168e-19,  1.00425307e-18, ...,
         -3.50341472e-22, -4.94694982e-22, -4.06641052e-22],
        [-3.91879347e-18,  1.36949864e-18,  3.94301599e-18, ...,
         -1.40136589e-21, -9.43371826e-22, -5.96406876e-21],
        [-3.31833318e-18,  1.03884491e-18,  3.01707567e-18, ...,
         -3.21519591e-32,  3.16374698e-21, -1.26510549e-21]]),
 {'Reuters': 0,
  '-': 1,
  'Short': 2,
  'sellers': 3,
  ',': 4,
  'Wall': 5,
  "Street's": 6,
  'dwindling': 7,
  'band': 8,
  'of': 9,
  'ultra': 10,
  'cynics': 11,
  'are': 12,
  'seeing': 13,
  'green': 14,
  'again': 15,
 

In [5]:
class LSTM(nn.Module):
    def __init__(self, hidden_dim, output_dim, embedding_dim = 300, num_layers = 1):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        
        # Initialize hidden state
        output, (hn, cn) = self.lstm(x, (h0, c0))
        
        # Take the output from the last time step

        output_last = output[:, -1]
        # Pass it through the fully connected layer
        output_fc = self.fc(output_last)
        
        return output_fc

        # output, (_, _) = self.lstm(x)
        # return self.fc(output)

In [6]:
oov_embeddings = torch.randn(300, dtype = torch.float32)

In [7]:
data_train = pd.read_csv('train.csv')
length = []
for i in range(len(data_train['Description'])):
    sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][i])
    length.append(len(sentence))
length.sort()
max_len = length[int(0.95*len(length))]
max_len

59

In [8]:
data = pd.read_csv('test.csv')
data

Unnamed: 0,Class Index,Description
0,3,Unions representing workers at Turner Newall...
1,4,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,AP - A company founded by a chemistry research...
3,4,AP - It's barely dawn when Mike Fitzpatrick st...
4,4,AP - Southern California's smog-fighting agenc...
...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...
7596,2,With the supply of attractive pitching options...
7597,2,Like Roger Clemens did almost exactly eight ye...
7598,3,SINGAPORE : Doctors in the United States have ...


In [9]:
hidden_dim = 256
output_dim = data['Class Index'].unique().shape[0]

model = LSTM(hidden_dim, output_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
model

LSTM(
  (lstm): LSTM(300, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [10]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 300, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-batch_size*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])

            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.3336166666666667	Loss = 0.013294828124344349
Epoch = 1	Training Set Accuracy = 0.6086583333333333	Loss = 0.00969003513455391
Epoch = 2	Training Set Accuracy = 0.6924083333333333	Loss = 0.007970205508172512
Epoch = 3	Training Set Accuracy = 0.7281666666666666	Loss = 0.007157212123274803
Epoch = 4	Training Set Accuracy = 0.7488416666666666	Loss = 0.0067024254240095615
Epoch = 5	Training Set Accuracy = 0.7609416666666666	Loss = 0.006355278193950653
Epoch = 6	Training Set Accuracy = 0.7743666666666666	Loss = 0.006042088847607374
Epoch = 7	Training Set Accuracy = 0.7860916666666666	Loss = 0.005776815582066774
Epoch = 8	Training Set Accuracy = 0.7979666666666667	Loss = 0.005496237892657518
Epoch = 9	Training Set Accuracy = 0.8071083333333333	Loss = 0.005240066908299923


In [11]:
batch_size = 100
train_loss = 0
correct = 0
model.eval()
for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 300, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                batch_embeddings[k-batch_size*i][j] = oov_embeddings
            else:
                batch_embeddings[k-batch_size*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-batch_size*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.Tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)]).to(torch.int64)
    loss = criterion(y_pred, y_true-1)
    train_loss += loss

    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()
print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tLoss = {train_loss/len(data['Description'])}")

Testing Set Accuracy = 0.7275	Loss = 0.007872769609093666


In [12]:
from sklearn.metrics import precision_score, f1_score, recall_score, confusion_matrix

In [13]:
torch.save(model, "svd-classification-model.pt")

In [10]:
batch_size = 100
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    correct = 0
    train_loss = 0

    for i in range(int(len(data_train['Description'])/batch_size)):
        optimizer.zero_grad()
        batch_embeddings = torch.zeros(batch_size, max_len, 300, dtype = torch.float32, device = device)

        for k in range(100*i, 100*i+100):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[100*i:100*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()
        loss.backward()
        optimizer.step()
    print(f"Epoch = {epoch}\tTraining Set Accuracy = {correct / len(data_train['Description'])}\tLoss = {train_loss/len(data_train['Description'])}")

Epoch = 0	Training Set Accuracy = 0.39574166666666666	Loss = 0.012476691044867039
Epoch = 1	Training Set Accuracy = 1.0512333333333332	Loss = 0.02108396217226982
Epoch = 2	Training Set Accuracy = 1.7664	Loss = 0.02847791463136673
Epoch = 3	Training Set Accuracy = 2.5099	Loss = 0.035228148102760315
Epoch = 4	Training Set Accuracy = 3.27145	Loss = 0.041528452187776566
Epoch = 5	Training Set Accuracy = 4.048466666666667	Loss = 0.047477059066295624
Epoch = 6	Training Set Accuracy = 4.839433333333333	Loss = 0.053097303956747055
Epoch = 7	Training Set Accuracy = 5.640633333333334	Loss = 0.0584648959338665
Epoch = 8	Training Set Accuracy = 6.449108333333333	Loss = 0.06365052610635757
Epoch = 9	Training Set Accuracy = 7.265825	Loss = 0.06865249574184418


In [14]:
batch_size = 100
model.train()
precision = []
f1 = []
recall = []
correct = 0
train_loss = 0
import warnings
warnings.filterwarnings('ignore')


confusion = None
with torch.no_grad():
    for i in range(int(len(data_train['Description'])/batch_size)):
        batch_embeddings = torch.zeros(batch_size, max_len, 300, dtype = torch.float32, device = device)

        for k in range(batch_size*i, batch_size*i+batch_size):
            sentence = re.findall(r"[\w']+|[.,!?;'-]", data_train['Description'][k])
            for j, word in enumerate(sentence):
                if (j >= max_len):
                    break
                if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
                else:
                    batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
            for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
                batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
        y_pred = model(batch_embeddings).to(device)

        # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
        y_true = torch.tensor(data_train['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
        loss = criterion(y_pred, y_true-1)
        train_loss += loss

        predicted = torch.argmax(y_pred, dim = 1)
        correct += torch.sum(predicted+1 == y_true).item()

        pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
        true = y_true.cpu().numpy()
        precision.append(precision_score(pred, true, average='weighted'))
        f1.append(f1_score(pred, true, average='weighted'))
        recall.append(recall_score(pred, true, average='weighted'))
        confusion = confusion_matrix(pred, true)
        print(f"Confusion Matrix for Batch = {i} is {confusion}")

    print(f"Training Set Accuracy = {correct / len(data_train['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {train_loss/len(data_train['Description'])}\tConfusion Matrix = {confusion}")

Confusion Matrix for Batch = 0 is [[ 0  0  6  2]
 [ 0  0  8  3]
 [ 0  0 33  6]
 [ 0  0 31 11]]
Confusion Matrix for Batch = 1 is [[ 0  0  0 12]
 [ 0  0  0  8]
 [ 0  0  0  4]
 [ 0  0  0 76]]
Confusion Matrix for Batch = 2 is [[ 0  0  0 16]
 [ 0  0  0  7]
 [ 0  0  0  7]
 [ 0  0  0 70]]
Confusion Matrix for Batch = 3 is [[ 0  0  0  4]
 [ 0  0  0  7]
 [ 0  0  0 12]
 [ 0  0  0 77]]
Confusion Matrix for Batch = 4 is [[ 7  0  0  4]
 [ 1 41  0  1]
 [ 0  3  0  0]
 [ 0  0  0 43]]
Confusion Matrix for Batch = 5 is [[55  1  2  2]
 [ 6 16  0  0]
 [ 2  0  4  1]
 [ 1  0  0 10]]
Confusion Matrix for Batch = 6 is [[32  1  3  2]
 [ 7 39  1  0]
 [ 1  0  9  0]
 [ 1  1  2  1]]
Confusion Matrix for Batch = 7 is [[30  1  2  3]
 [ 4 23  1  0]
 [ 4  1 11  3]
 [ 1  0  5 11]]
Confusion Matrix for Batch = 8 is [[22  2  1  1]
 [ 2  6  0  0]
 [ 2  0 33  2]
 [ 0  0  1 28]]
Confusion Matrix for Batch = 9 is [[28  1  0  2]
 [ 2  5  0  2]
 [ 3  0 16  4]
 [ 1  1  5 30]]
Confusion Matrix for Batch = 10 is [[28  1  4  3]


In [15]:
batch_size = len(data['Description'])
model.eval()
correct = 0
test_loss = 0
precision = []
f1 = []
recall = []


for i in range(int(len(data['Description'])/batch_size)):
    batch_embeddings = torch.zeros(batch_size, max_len, 300, dtype = torch.float32, device = device)

    for k in range(batch_size*i, batch_size*i+batch_size):
        sentence = re.findall(r"[\w']+|[.,!?;'-]", data['Description'][k])
        for j, word in enumerate(sentence):
            if (j >= max_len):
                break
            if (word not in word_indices):
                    batch_embeddings[k-100*i][j] = oov_embeddings
            else:
                batch_embeddings[k-100*i][j] = torch.from_numpy(svd_embeddings[word_indices[word]])
        for j in range(max(0, max_len-len(sentence))):                      # Pad the sentences with the PAD Token or OOV Embeddings
            batch_embeddings[k-100*i][j+len(sentence)] = oov_embeddings
    y_pred = model(batch_embeddings).to(device)

    # y_true = torch.Tensor(data_train['Class Index'][100*i:100*(i+1)]).to(torch.int64)
    y_true = torch.tensor(data['Class Index'].values[batch_size*i:batch_size*(i+1)], dtype=torch.int64, device=device)
    loss = criterion(y_pred, y_true-1)
    test_loss += loss
    predicted = torch.argmax(y_pred, dim = 1)
    correct += torch.sum(predicted+1 == y_true).item()

    pred = (predicted+torch.ones_like(predicted)).cpu().numpy()
    true = y_true.cpu().numpy()
    precision.append(precision_score(pred, true, average='weighted'))
    f1.append(f1_score(pred, true, average='weighted'))
    recall.append(recall_score(pred, true, average='weighted'))
    confusion = confusion_matrix(pred, true)
    print(f"Confusion Matrix for Batch = {i} is {confusion}")

print(f"Testing Set Accuracy = {correct / len(data['Description'])}\tRecall = {np.mean(recall)}\tPrecision = {np.mean(precision)}\tF1-Score = {np.mean(f1)}\tLoss = {test_loss/len(data['Description'])}")

Confusion Matrix for Batch = 0 is [[1437  144  197  186]
 [ 164 1586  125  145]
 [ 160   45 1272  335]
 [ 139  125  306 1234]]
Testing Set Accuracy = 0.7275	Recall = 0.7275	Precision = 0.731093351800554	F1-Score = 0.7288439959010665	Loss = 0.00010358908912166953
