In [1]:
import torch
import torch.nn as nn
import pandas as pd
import re
import string
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split

## Data preprocessing

In [2]:
data = pd.read_csv("../Autoregressive models/nlp_comments/CommentsJan2017.csv")
data = data["commentBody"].values
print(f"Number of training samples: {len(data)}")

  data = pd.read_csv("../Autoregressive models/nlp_comments/CommentsJan2017.csv")


Number of training samples: 231449


In [3]:
def clean_text(text):
    text = text.replace('<br/>', ' ')
    text = text.replace('&amp', '')
    text = text.replace("\"", '')
    text = "".join(v for v in text if v not in string.punctuation).lower()
    text = text.encode("utf8").decode("ascii",'ignore')
    return text

In [4]:
data = np.vectorize(clean_text)(data)

## Vocab class

In [6]:
PAD = 0
START = 1
END = 2
UNK = 3

In [42]:
class Vocab:
    def __init__(self, text_array, freq=3):
        self.text_array = text_array
        self.ttov = {"": PAD, "": START,
                     "": END, "": UNK}
        self.vtot = dict((idx, token) for token, idx in self.ttov.items())
        self.length = 4
        self.counter = Counter()
        self.freq = freq
        self.build_vocab()

    def add(self, token):
        self.ttov[token] = self.length
        self.vtot[self.length] = token
        self.length += 1

    def build_vocab(self):
        for sentence in self.text_array:
            for token in sentence.split():
                self.counter.update([token])

        for token, counter in self.counter.items():
            if counter >= self.freq:
                self.add(token)

    def vecToText(self, vec):
        text = []
        for v in vec:
            if v in self.vtot:
                text.append(self.vtot[v])
            else:
                text.append("")
        return text

    def textToVec(self, text):
        vec = []
        for t in text:
            if t in self.ttov:
                vec.append(self.ttov[t])
            else:
                vec.append(UNK)
        return torch.tensor(vec)

In [43]:
vocab = Vocab(data)
print(f"Length of dictionary: {vocab.length}")

Length of dictionary: 55446


## Dataset/Dataloader class

In [44]:
class TextDataset(Dataset):
    def __init__(self, text, vocab):
        self.text = text
        self.vocab = vocab

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        return self.vocab.textToVec(text)

In [45]:
def collate_fn(batch):
    text = [data for data in batch]
    max_len = max([len(data) for data in batch]) + 1
    text_in = [torch.cat([torch.tensor([START]), txt]) for txt in text]
    text_out = [torch.cat([txt, torch.tensor([END])]) for txt in text]
    return pad(text_in, max_len), pad(text_out, max_len)

def pad(texts, max_len):
    padded = []
    for text in texts:
        while len(text) < max_len:
            text = torch.cat([text, torch.tensor([PAD])])
        padded.append(text)
    return torch.stack(padded)

## CharRNN class

In [46]:
class CharRNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        param vocab_size (V): number of vocab
        param embedding_dim (E): number of embedding dimension
        param hidden_dim (H): number of hidden dimension
        
        length (L): length of sentence
        batch (B): batch size
        """
        super(CharRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.2)
        self.linear = torch.nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        
    def forward(self, X):
        """
        Input shape: [B, L]
        Embedding: [B, L, E]
        RNN: [B, L, H]
        Out: [B, L, V]
        """
        X = self.embedding(X)
        X, _ = self.rnn(X)
        out = self.linear(X)
        return out
    
    def generate(self, max_length=100):
        """
        sentence: stores a collection of characters
        character: store the index (integer) of current character
        """
        sentence = [START]
        character = START
        hidden = None
        with torch.no_grad():
            while len(sentence) <= max_length:
                X = torch.tensor([[character]], dtype=torch.long)
                X = self.embedding(X)
                X, hidden = self.rnn(X, hidden)
                X = self.linear(X)
                character = torch.argmax(X, axis=-1).unsqueeze(0).unsqueeze(0).item()
                sentence.append(character)
                if character == END:
                    break
        return torch.tensor(sentence)

In [164]:
# Test CharRNN class
batch_size = 64
length = 100
vocab_size = 100
embedding_dim = 256
hidden_dim = 512
charrnn = CharRNN(vocab_size, embedding_dim, hidden_dim)
X = torch.randint(high=vocab_size, size=(batch_size, length))
out = charrnn(X)

# Forward test
if out.shape == (batch_size, length, vocab_size):
    print(f"Passed forward test!")
else:
    print(f"Test failed, the outputted shape is {out.shape}")
    
# Generate test
sample = charrnn.generate()
if len(sample.shape) == 1:
    print(f"Passed generate test!")
else:
    print(f"Test failed, the sample setence shape is {sample.shape}")

Passed forward test!
Passed generate test!


## Training loop

In [47]:
def train(model, train_loader, n_epochs, lr=1e-3, device="cpu"):
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD, reduction='mean')
    for epoch in range(n_epochs):
        average_loss = 0
        n = 0
        for X, y in train_loader:
            optimizer.zero_grad()
            y_pred = model(X)
            loss = criterion(y_pred.view(-1, vocab.length), y.view(-1))
            loss.backward()
            optimizer.step()
            average_loss += loss.item()
            n += 1
        print(f"Epoch {epoch + 1} average loss: {average_loss / n}")

## Training

In [48]:
dataset = TextDataset(data[:64], vocab)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [49]:
model = CharRNN(vocab.length, 128, 256)

In [51]:
train(model, dataloader, 10)

Epoch 1 average loss: 3.2414730489254


KeyboardInterrupt: 

## Testing

In [55]:
model.generate()

tensor([1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3])