In [2]:
import pandas as pd
import torch

df = pd.read_csv('train.csv')

df = df.dropna()
df = df.reset_index(drop=True)

In [3]:
import nltk
import unicodedata
import re

def normalize_unicode(s):
    return unicodedata.normalize('NFD', s)

def preprocess_text(text):
    text = normalize_unicode(text)
    text = re.sub(r"(.)(\1{2,})", r"\1", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", " ", text)
    text = text.strip().lower()
    return text

nltk.download('punkt')

df['Description'] = df['Description'].apply(preprocess_text)
df['Description'] = df['Description'].apply(nltk.word_tokenize)
df['Description'] = df['Description'].apply(lambda x: ['<sos>'] + x + ['<eos>'])

df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /home2/swethavipparla/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Class Index,Description
0,3,"[<sos>, reuters, short, sellers, ,, wall, stre..."
1,3,"[<sos>, reuters, private, investment, firm, ca..."
2,3,"[<sos>, reuters, soaring, crude, prices, plus,..."
3,3,"[<sos>, reuters, authorities, have, halted, oi..."
4,3,"[<sos>, afp, tearaway, world, oil, prices, ,, ..."


In [4]:
train_set = df['Description']
pre_train_set = list(df['Description'][:8000])
pre_val_set = list(df['Description'][8000:10000])

In [5]:
from torch.utils.data import Dataset

class ELMO_Dataset(Dataset):
    def __init__(self, data, word_to_ix):
        self.data = data
        self.word_to_ix = word_to_ix
        self.indexed_data = [self.index_sentence(sentence) for sentence in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.indexed_data[idx])

    def index_sentence(self, sentence):
        indexed_sentence = [self.word_to_ix.get(word, self.word_to_ix['<unk>']) for word in sentence]
        return indexed_sentence

    @staticmethod
    def create_vocab(data):
        vocab = set()
        for sentence in data:
            for word in sentence:
                vocab.add(word)

        vocab.add('<pad>')
        vocab.add('<unk>')
        vocab.add('<sos>')
        vocab.add('<eos>')
        return vocab

In [6]:
word_to_ix = {word: idx for idx, word in enumerate(ELMO_Dataset.create_vocab(train_set))}
print(len(word_to_ix))

59352


In [7]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    batch = sorted(batch, key=lambda x: x.shape[0], reverse=True)
    padded_batch = pad_sequence(batch, batch_first=True, padding_value=word_to_ix['<pad>'])
    lengths = torch.LongTensor([len(x) for x in batch])

    input_tensor = padded_batch[:, :-1]
    target_truth = padded_batch[:, 1:]

    return input_tensor, target_truth, lengths - 1

In [8]:
pre_train_dataset = ELMO_Dataset(pre_train_set, word_to_ix)

pre_val_dataset = ELMO_Dataset(pre_val_set, word_to_ix)

batch_size = 32

from torch.utils.data import DataLoader

pre_train_loader = DataLoader(pre_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=4)
pre_val_loader = DataLoader(pre_val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=4)

In [12]:
import gensim
import gensim.downloader

glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')



In [13]:
import numpy as np

vocab_size = len(word_to_ix)
embedding_dim = glove_vectors.vector_size
embedding_matrix = torch.zeros(vocab_size, embedding_dim)

special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

average_vector = np.mean(glove_vectors.vectors, axis=0)

for word, i in word_to_ix.items():
    if word not in special_tokens:
        try:
            embedding_matrix[i] = torch.tensor(glove_vectors[word])
        except KeyError:
            embedding_matrix[i] = torch.tensor(average_vector)

    elif word == '<sos>' or word == '<eos>':
        embedding_matrix[i] = torch.randn(embedding_dim)
    elif word == '<unk>':
        embedding_matrix[i] = torch.tensor(average_vector)
    else:
        embedding_matrix[i] = torch.zeros(embedding_dim)

print(embedding_matrix.shape)

torch.Size([59352, 50])


In [14]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class ELMO(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_layers, dropout):
        super(ELMO, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_matrix.shape[1], hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, embedding_matrix.shape[0])

    def forward(self, input_tensor, lengths):
        embedded = self.embedding(input_tensor)
        packed_input = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=True)
        packed_output, _ = self.lstm(packed_input, None)

        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.linear(output)

        return output

In [15]:
model = ELMO(embedding_matrix, 300, 2, 0.5)

In [16]:
from tqdm import tqdm

In [17]:
def run_epoch(model, data_loader, loss_fn, epoch, optimizer=None):
    if optimizer:
        model.train()
    else:
        model.eval()

    total_loss = 0

    p_bar = tqdm(data_loader)
    for (input_tensor, target_truth, lengths) in p_bar:

        input_tensor = input_tensor.cuda()
        target_truth = target_truth.cuda()

        output = model(input_tensor, lengths)
        output = output.reshape(-1, output.shape[2])

        loss = loss_fn(output, target_truth.reshape(-1))
        total_loss += loss.item()

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        mean_loss = total_loss / len(data_loader)

        p_bar.set_description(f'{"T" if optimizer else "V"} Loss: {mean_loss:.4f}, count: {epoch}')


    return mean_loss

In [18]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 50

model.cuda()

best_val_loss = float('inf')

all_val_loss = []
all_train_loss = []


for epoch in range(num_epochs):
    train_loss = run_epoch(model, pre_train_loader, loss_fn, epoch+1, optimizer)
    all_train_loss.append(train_loss)
    with torch.no_grad():
        val_loss = run_epoch(model, pre_val_loader, loss_fn, epoch+1)
        all_val_loss.append(val_loss)

    print('Epoch: {}, Train Loss: {:.4f}, Val Loss: {:.4f}'.format(epoch+1, train_loss, val_loss))
    if val_loss < best_val_loss:
        counter = 0
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_lstm_model.pth')
    else:
        counter += 1
        if counter == 3:
            break

T Loss: 8.9388, count: 1: 100%|██████████| 250/250 [00:30<00:00,  8.22it/s]
V Loss: 8.5080, count: 1: 100%|██████████| 63/63 [00:02<00:00, 26.35it/s]


Epoch: 1, Train Loss: 8.9388, Val Loss: 8.5080


T Loss: 7.8507, count: 2: 100%|██████████| 250/250 [00:27<00:00,  9.05it/s]
V Loss: 7.2585, count: 2: 100%|██████████| 63/63 [00:02<00:00, 26.05it/s]


Epoch: 2, Train Loss: 7.8507, Val Loss: 7.2585


T Loss: 6.7928, count: 3: 100%|██████████| 250/250 [00:27<00:00,  9.03it/s]
V Loss: 6.4858, count: 3: 100%|██████████| 63/63 [00:02<00:00, 26.02it/s]


Epoch: 3, Train Loss: 6.7928, Val Loss: 6.4858


T Loss: 6.0844, count: 4: 100%|██████████| 250/250 [00:28<00:00,  8.76it/s]
V Loss: 5.8313, count: 4: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch: 4, Train Loss: 6.0844, Val Loss: 5.8313


T Loss: 5.5757, count: 5: 100%|██████████| 250/250 [00:28<00:00,  8.68it/s]
V Loss: 5.5919, count: 5: 100%|██████████| 63/63 [00:02<00:00, 25.30it/s]


Epoch: 5, Train Loss: 5.5757, Val Loss: 5.5919


T Loss: 5.1049, count: 6: 100%|██████████| 250/250 [00:28<00:00,  8.65it/s]
V Loss: 5.1599, count: 6: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch: 6, Train Loss: 5.1049, Val Loss: 5.1599


T Loss: 4.7267, count: 7: 100%|██████████| 250/250 [00:29<00:00,  8.53it/s]
V Loss: 4.9137, count: 7: 100%|██████████| 63/63 [00:02<00:00, 25.49it/s]


Epoch: 7, Train Loss: 4.7267, Val Loss: 4.9137


T Loss: 4.3810, count: 8: 100%|██████████| 250/250 [00:28<00:00,  8.73it/s]
V Loss: 4.5877, count: 8: 100%|██████████| 63/63 [00:02<00:00, 25.06it/s]


Epoch: 8, Train Loss: 4.3810, Val Loss: 4.5877


T Loss: 4.0975, count: 9: 100%|██████████| 250/250 [00:28<00:00,  8.64it/s]
V Loss: 4.3831, count: 9: 100%|██████████| 63/63 [00:02<00:00, 25.61it/s]


Epoch: 9, Train Loss: 4.0975, Val Loss: 4.3831


T Loss: 3.8173, count: 10: 100%|██████████| 250/250 [00:29<00:00,  8.62it/s]
V Loss: 4.2147, count: 10: 100%|██████████| 63/63 [00:02<00:00, 25.29it/s]


Epoch: 10, Train Loss: 3.8173, Val Loss: 4.2147


T Loss: 3.5577, count: 11: 100%|██████████| 250/250 [00:28<00:00,  8.67it/s]
V Loss: 3.9709, count: 11: 100%|██████████| 63/63 [00:02<00:00, 25.13it/s]


Epoch: 11, Train Loss: 3.5577, Val Loss: 3.9709


T Loss: 3.3488, count: 12: 100%|██████████| 250/250 [00:28<00:00,  8.69it/s]
V Loss: 3.7398, count: 12: 100%|██████████| 63/63 [00:02<00:00, 25.22it/s]


Epoch: 12, Train Loss: 3.3488, Val Loss: 3.7398


T Loss: 3.1126, count: 13: 100%|██████████| 250/250 [00:28<00:00,  8.75it/s]
V Loss: 3.5090, count: 13: 100%|██████████| 63/63 [00:02<00:00, 25.15it/s]


Epoch: 13, Train Loss: 3.1126, Val Loss: 3.5090


T Loss: 2.9170, count: 14: 100%|██████████| 250/250 [00:28<00:00,  8.72it/s]
V Loss: 3.3759, count: 14: 100%|██████████| 63/63 [00:02<00:00, 25.80it/s]


Epoch: 14, Train Loss: 2.9170, Val Loss: 3.3759


T Loss: 2.7211, count: 15: 100%|██████████| 250/250 [00:28<00:00,  8.68it/s]
V Loss: 3.1422, count: 15: 100%|██████████| 63/63 [00:02<00:00, 25.23it/s]


Epoch: 15, Train Loss: 2.7211, Val Loss: 3.1422


T Loss: 2.5185, count: 16: 100%|██████████| 250/250 [00:28<00:00,  8.67it/s]
V Loss: 2.9881, count: 16: 100%|██████████| 63/63 [00:02<00:00, 24.87it/s]


Epoch: 16, Train Loss: 2.5185, Val Loss: 2.9881


T Loss: 2.2982, count: 17: 100%|██████████| 250/250 [00:28<00:00,  8.72it/s]
V Loss: 2.7977, count: 17: 100%|██████████| 63/63 [00:02<00:00, 26.25it/s]


Epoch: 17, Train Loss: 2.2982, Val Loss: 2.7977


T Loss: 2.0877, count: 18: 100%|██████████| 250/250 [00:28<00:00,  8.72it/s]
V Loss: 2.5670, count: 18: 100%|██████████| 63/63 [00:02<00:00, 25.32it/s]


Epoch: 18, Train Loss: 2.0877, Val Loss: 2.5670


T Loss: 1.9043, count: 19: 100%|██████████| 250/250 [00:28<00:00,  8.70it/s]
V Loss: 2.3434, count: 19: 100%|██████████| 63/63 [00:02<00:00, 25.52it/s]


Epoch: 19, Train Loss: 1.9043, Val Loss: 2.3434


T Loss: 1.6847, count: 20: 100%|██████████| 250/250 [00:28<00:00,  8.64it/s]
V Loss: 2.1348, count: 20: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch: 20, Train Loss: 1.6847, Val Loss: 2.1348


T Loss: 1.4721, count: 21: 100%|██████████| 250/250 [00:28<00:00,  8.65it/s]
V Loss: 1.9507, count: 21: 100%|██████████| 63/63 [00:02<00:00, 25.32it/s]


Epoch: 21, Train Loss: 1.4721, Val Loss: 1.9507


T Loss: 1.2532, count: 22: 100%|██████████| 250/250 [00:29<00:00,  8.61it/s]
V Loss: 1.7740, count: 22: 100%|██████████| 63/63 [00:02<00:00, 25.89it/s]


Epoch: 22, Train Loss: 1.2532, Val Loss: 1.7740


T Loss: 1.0673, count: 23: 100%|██████████| 250/250 [00:28<00:00,  8.70it/s]
V Loss: 1.5983, count: 23: 100%|██████████| 63/63 [00:02<00:00, 25.62it/s]


Epoch: 23, Train Loss: 1.0673, Val Loss: 1.5983


T Loss: 0.8896, count: 24: 100%|██████████| 250/250 [00:28<00:00,  8.77it/s]
V Loss: 1.4224, count: 24: 100%|██████████| 63/63 [00:02<00:00, 25.69it/s]


Epoch: 24, Train Loss: 0.8896, Val Loss: 1.4224


T Loss: 0.7222, count: 25: 100%|██████████| 250/250 [00:28<00:00,  8.67it/s]
V Loss: 1.2797, count: 25: 100%|██████████| 63/63 [00:02<00:00, 25.97it/s]


Epoch: 25, Train Loss: 0.7222, Val Loss: 1.2797


T Loss: 0.5902, count: 26: 100%|██████████| 250/250 [00:29<00:00,  8.61it/s]
V Loss: 1.1607, count: 26: 100%|██████████| 63/63 [00:02<00:00, 24.97it/s]


Epoch: 26, Train Loss: 0.5902, Val Loss: 1.1607


T Loss: 0.4769, count: 27: 100%|██████████| 250/250 [00:29<00:00,  8.51it/s]
V Loss: 1.0473, count: 27: 100%|██████████| 63/63 [00:02<00:00, 26.55it/s]


Epoch: 27, Train Loss: 0.4769, Val Loss: 1.0473


T Loss: 0.3739, count: 28: 100%|██████████| 250/250 [00:28<00:00,  8.68it/s]
V Loss: 0.9593, count: 28: 100%|██████████| 63/63 [00:02<00:00, 25.61it/s]


Epoch: 28, Train Loss: 0.3739, Val Loss: 0.9593


T Loss: 0.2976, count: 29: 100%|██████████| 250/250 [00:28<00:00,  8.72it/s]
V Loss: 0.9101, count: 29: 100%|██████████| 63/63 [00:02<00:00, 25.22it/s]


Epoch: 29, Train Loss: 0.2976, Val Loss: 0.9101


T Loss: 0.2410, count: 30: 100%|██████████| 250/250 [00:29<00:00,  8.60it/s]
V Loss: 0.8522, count: 30: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch: 30, Train Loss: 0.2410, Val Loss: 0.8522


T Loss: 0.1914, count: 31: 100%|██████████| 250/250 [00:28<00:00,  8.78it/s]
V Loss: 0.8046, count: 31: 100%|██████████| 63/63 [00:02<00:00, 25.75it/s]


Epoch: 31, Train Loss: 0.1914, Val Loss: 0.8046


T Loss: 0.1577, count: 32: 100%|██████████| 250/250 [00:29<00:00,  8.54it/s]
V Loss: 0.7811, count: 32: 100%|██████████| 63/63 [00:02<00:00, 24.67it/s]


Epoch: 32, Train Loss: 0.1577, Val Loss: 0.7811


T Loss: 0.1294, count: 33: 100%|██████████| 250/250 [00:28<00:00,  8.77it/s]
V Loss: 0.7603, count: 33: 100%|██████████| 63/63 [00:02<00:00, 25.94it/s]


Epoch: 33, Train Loss: 0.1294, Val Loss: 0.7603


T Loss: 0.1102, count: 34: 100%|██████████| 250/250 [00:29<00:00,  8.59it/s]
V Loss: 0.7429, count: 34: 100%|██████████| 63/63 [00:02<00:00, 25.87it/s]


Epoch: 34, Train Loss: 0.1102, Val Loss: 0.7429


T Loss: 0.0902, count: 35: 100%|██████████| 250/250 [00:28<00:00,  8.64it/s]
V Loss: 0.7427, count: 35: 100%|██████████| 63/63 [00:02<00:00, 24.79it/s]


Epoch: 35, Train Loss: 0.0902, Val Loss: 0.7427


T Loss: 0.0771, count: 36: 100%|██████████| 250/250 [00:28<00:00,  8.77it/s]
V Loss: 0.7226, count: 36: 100%|██████████| 63/63 [00:02<00:00, 26.08it/s]


Epoch: 36, Train Loss: 0.0771, Val Loss: 0.7226


T Loss: 0.0655, count: 37: 100%|██████████| 250/250 [00:28<00:00,  8.64it/s]
V Loss: 0.6913, count: 37: 100%|██████████| 63/63 [00:02<00:00, 25.69it/s]


Epoch: 37, Train Loss: 0.0655, Val Loss: 0.6913


T Loss: 0.0557, count: 38: 100%|██████████| 250/250 [00:28<00:00,  8.68it/s]
V Loss: 0.6867, count: 38: 100%|██████████| 63/63 [00:02<00:00, 25.16it/s]


Epoch: 38, Train Loss: 0.0557, Val Loss: 0.6867


T Loss: 0.0474, count: 39: 100%|██████████| 250/250 [00:28<00:00,  8.66it/s]
V Loss: 0.6883, count: 39: 100%|██████████| 63/63 [00:02<00:00, 25.68it/s]


Epoch: 39, Train Loss: 0.0474, Val Loss: 0.6883


T Loss: 0.0406, count: 40: 100%|██████████| 250/250 [00:28<00:00,  8.70it/s]
V Loss: 0.6773, count: 40: 100%|██████████| 63/63 [00:02<00:00, 24.88it/s]


Epoch: 40, Train Loss: 0.0406, Val Loss: 0.6773


T Loss: 0.0354, count: 41: 100%|██████████| 250/250 [00:28<00:00,  8.64it/s]
V Loss: 0.6744, count: 41: 100%|██████████| 63/63 [00:02<00:00, 24.78it/s]


Epoch: 41, Train Loss: 0.0354, Val Loss: 0.6744


T Loss: 0.0309, count: 42: 100%|██████████| 250/250 [00:29<00:00,  8.55it/s]
V Loss: 0.6716, count: 42: 100%|██████████| 63/63 [00:02<00:00, 25.32it/s]


Epoch: 42, Train Loss: 0.0309, Val Loss: 0.6716


T Loss: 0.0268, count: 43: 100%|██████████| 250/250 [00:29<00:00,  8.59it/s]
V Loss: 0.6792, count: 43: 100%|██████████| 63/63 [00:02<00:00, 24.28it/s]


Epoch: 43, Train Loss: 0.0268, Val Loss: 0.6792


T Loss: 0.0229, count: 44: 100%|██████████| 250/250 [00:28<00:00,  8.74it/s]
V Loss: 0.6598, count: 44: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch: 44, Train Loss: 0.0229, Val Loss: 0.6598


T Loss: 0.0202, count: 45: 100%|██████████| 250/250 [00:28<00:00,  8.65it/s]
V Loss: 0.6627, count: 45: 100%|██████████| 63/63 [00:02<00:00, 25.00it/s]


Epoch: 45, Train Loss: 0.0202, Val Loss: 0.6627


T Loss: 0.0176, count: 46: 100%|██████████| 250/250 [00:28<00:00,  8.62it/s]
V Loss: 0.6487, count: 46: 100%|██████████| 63/63 [00:02<00:00, 24.79it/s]


Epoch: 46, Train Loss: 0.0176, Val Loss: 0.6487


T Loss: 0.0174, count: 47: 100%|██████████| 250/250 [00:28<00:00,  8.71it/s]
V Loss: 0.6676, count: 47: 100%|██████████| 63/63 [00:02<00:00, 23.70it/s]


Epoch: 47, Train Loss: 0.0174, Val Loss: 0.6676


T Loss: 0.0163, count: 48: 100%|██████████| 250/250 [00:28<00:00,  8.68it/s]
V Loss: 0.6530, count: 48: 100%|██████████| 63/63 [00:02<00:00, 26.44it/s]


Epoch: 48, Train Loss: 0.0163, Val Loss: 0.6530


T Loss: 0.0126, count: 49: 100%|██████████| 250/250 [00:29<00:00,  8.60it/s]
V Loss: 0.6441, count: 49: 100%|██████████| 63/63 [00:02<00:00, 25.06it/s]


Epoch: 49, Train Loss: 0.0126, Val Loss: 0.6441


T Loss: 0.0108, count: 50: 100%|██████████| 250/250 [00:29<00:00,  8.59it/s]
V Loss: 0.6475, count: 50: 100%|██████████| 63/63 [00:02<00:00, 25.46it/s]

Epoch: 50, Train Loss: 0.0108, Val Loss: 0.6475



