# Bidirectional LSTM Classifier

In [80]:
import torch
import torch.nn as nn
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.vocab import Vectors

In [47]:
def tokenizer(words):
    """Split a raw string into tokens on runs of whitespace."""
    return words.split()


# Text column: tokenized with `tokenizer` and lower-cased.
Review = Field(tokenize=tokenizer, lower=True, sequential=True)
# Label columns: non-sequential, and use_vocab=False means no vocabulary is
# built for them (torchtext then expects the values to be numeric already).
Label = Field(use_vocab=False, sequential=False)

In [51]:
# Column layout of the training/validation CSVs, in file order. A column
# paired with None is not loaded; the four category columns all share the
# `Label` field.
train_datafields = [
    ("id", None),
    ("content", Review),
    *[(category, Label) for category in ("Business", "SciTech", "Sports", "World")],
]

# Load both labelled splits in one call.
# NOTE: the keyword for the validation file is `validation`, not `valid`.
train, valid = TabularDataset.splits(
    fields=train_datafields,
    format='csv',
    skip_header=True,
    path='/Users/nikolavetnic/Desktop/Text Materials/DeepLearning/[AI] Jibin Mathew - PyTorch Artificial Intelligence Fundamentals (2020)/Chapter 4/',
    train='train.csv',
    validation='valid.csv')

In [53]:
# Column layout of the unlabelled test CSV: the id column is skipped and the
# text column reuses the `Review` field (and therefore its vocabulary).
test_datafields = [
    ("id", None),
    ("content", Review)]

# `TabularDataset.splits()` only builds the splits whose file names are passed
# via `train=` / `validation=` / `test=`, and always returns a tuple. Called
# with none of them it returns an empty tuple, so `test` was never a dataset.
# A single file is loaded by constructing `TabularDataset` directly.
# NOTE(review): assumes the test file is named 'test.csv' and sits next to
# train.csv / valid.csv -- confirm against the data directory.
test = TabularDataset(
    path='/Users/nikolavetnic/Desktop/Text Materials/DeepLearning/[AI] Jibin Mathew - PyTorch Artificial Intelligence Fundamentals (2020)/Chapter 4/test.csv',
    format='csv',
    skip_header=True,
    fields=test_datafields)

In [54]:
# Build the vocabulary of the text field from the training split only;
# min_freq=2 keeps just the tokens that occur at least twice.
Review.build_vocab(train, min_freq=2)

In [56]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 128

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bucket examples of similar length together to limit padding.
# The text column is registered under the name "content" in both field lists,
# so that is the attribute each example exposes; the previous
# `x.comment_text` does not exist on these examples and would raise
# AttributeError as soon as batches are built.
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.content),
    sort_within_batch=False)



In [61]:
# Pre-trained 100-dimensional GloVe vectors. The archive at `url` is
# downloaded into the `cache` directory (see the progress output below).
vec = Vectors(
    name='glove.6B.100d.txt',
    url='http://nlp.stanford.edu/data/glove.6B.zip',
    cache='./bec/glove_embedding/')

./bec/glove_embedding/glove.6B.zip: 862MB [06:55, 2.08MB/s]                                
100%|█████████▉| 399449/400000 [00:40<00:00, 13882.37it/s]

In [77]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier.

    Token indices are embedded, run through a (possibly stacked)
    bidirectional LSTM, and the final hidden states of the last layer's
    forward and backward directions are concatenated, passed through
    dropout and projected to ``output_dim`` scores by a linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, num_layers,
                 vocab_size=None):
        """Create the layers.

        Args:
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of the LSTM, per direction.
            output_dim: Number of output scores per example.
            dropout: Dropout probability applied to the concatenated
                final hidden state before the linear layer.
            num_layers: Number of stacked LSTM layers.
            vocab_size: Number of rows in the embedding table. When None
                (the default) it falls back to ``len(Review.vocab)``,
                which requires the module-level ``Review`` field to
                have had its vocabulary built already.
        """
        super().__init__()
        if vocab_size is None:
            vocab_size = len(Review.vocab)
        # Must be named `embedding`: forward() looks it up under that name.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)
        # Forward and backward states are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return scores of shape (batch, output_dim).

        Args:
            x: Integer tensor of token indices shaped (seq_len, batch);
                the LSTM is created with the default ``batch_first=False``.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        # hidden is (num_layers * 2, batch, hidden_dim); the last two entries
        # are the top layer's forward and backward final states.
        final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # No squeeze here: the tensor is already (batch, 2 * hidden_dim), and
        # squeezing dim 0 would drop the batch dimension for a batch of one.
        return self.fc(self.dropout(final_state))

In [78]:
# Hyper-parameters handed to BiLSTMClassifier.
EMBEDDING_DIM = 100  # same width as the glove.6B.100d vectors loaded earlier
HIDDEN_DIM = 256     # LSTM hidden size per direction
# NOTE(review): the training fields declare four label columns (Business,
# SciTech, Sports, World) while the model emits a single score -- confirm
# this is intended against the training loop.
OUTPUT_DIM = 1
DROPOUT = 0.5        # dropout probability
NUM_LAYERS = 2       # number of stacked LSTM layers

In [79]:
# Instantiate the classifier from the hyper-parameters defined above.
model = BiLSTMClassifier(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    num_layers=NUM_LAYERS)