### RNN

> Train a simple RNN text classifier on the IMDB movie-review dataset.

### Importing Libraries

In [13]:
import torch
import re
import torch.nn as nn
import numpy as np
import pandas as pd
from datasets import load_dataset
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm

### Loading Dataset

In [2]:
# Load the "imdb" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("imdb")
# Bare last expression so the notebook displays the available splits and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
# Pull out the train and test splits; the 'unsupervised' split stays in `dataset`.
train, test = (dataset[split] for split in ('train', 'test'))

# Display both splits to confirm their features and row counts.
train, test

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [8]:
# Peek at a single raw training example (a dict with 'text' and 'label' keys).
train[10]

{'text': 'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minded when I started watching it. I am a big Peter Bogdanovich fan and I enjoyed his last movie, "Cat\'s Meow" and all his early ones from "Targets" to "Nickleodeon". So, it really surprised me that I was barely able to keep awake watching this one.<br /><br />It is ironic that this movie is a

### Data Preprocessing

In [10]:
def tokenize(text):
    """Split raw review text into lowercase alphabetic tokens.

    Processing steps:
      1. HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) are replaced
         by a space, so the words on either side stay separate and no
         spurious ``br`` token is produced.
      2. Every character that is not an ASCII letter or whitespace is
         removed (so ``didn't`` becomes ``didnt``).
      3. The result is lowercased and split on whitespace.

    Args:
        text: Raw review string.

    Returns:
        list[str]: Lowercase tokens; empty if ``text`` contains no letters.
    """
    # Must run before punctuation stripping: otherwise "end.<br />Next"
    # collapses to "endbr Next" and pollutes the vocabulary.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().split()

In [11]:
vocab_size = 10000  # number of most-frequent corpus words to keep (special tokens are extra)

# Count token frequencies over the training split only.
counter = Counter()
for example in train:
    counter.update(tokenize(example['text']))

# Indices 0 and 1 are reserved for the special tokens, so real words start at 2.
# NOTE(review): len(vocab) can therefore reach vocab_size + 2; anything sized from
# the vocabulary (e.g. an embedding table) should use len(vocab) — confirm downstream.
vocab = {word: idx for idx, (word, _) in enumerate(counter.most_common(vocab_size), start=2)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode_sentence(text):
    """Convert a raw text string into a list of vocabulary indices.

    The text is split with ``tokenize``; each token is looked up in the
    module-level ``vocab`` and tokens that are missing from it receive the
    ``<UNK>`` index.

    Args:
        text: Raw text to encode.

    Returns:
        list[int]: One index per token, in token order.
    """
    ids = []
    for token in tokenize(text):
        ids.append(vocab.get(token, vocab['<UNK>']))
    return ids

In [14]:
class IMDBDataset(Dataset):
    """Map-style dataset that encodes review rows on the fly.

    Wraps an indexable, sized collection of rows, each a mapping with a
    ``'text'`` string and a ``'label'`` value, and converts one row per
    access into a pair of tensors.
    """

    def __init__(self, data):
        """Store the underlying rows.

        Args:
            data: Indexable, sized collection whose items provide ``'text'``
                and ``'label'`` keys.
        """
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Args:
            idx: Index into the wrapped collection.

        Returns:
            tuple: ``(token_ids, label)`` where ``token_ids`` is a 1-D
            ``torch.long`` tensor produced by ``encode_sentence`` and
            ``label`` is a 0-D ``torch.long`` tensor.
        """
        # Fetch the row once: every index into the backing collection may
        # materialise the full record, so indexing twice doubles the work.
        row = self.data[idx]
        encoded = torch.tensor(encode_sentence(row['text']), dtype=torch.long)
        return encoded, torch.tensor(row['label'], dtype=torch.long)

def collate_fn(batch):
    """Merge a list of ``(token_ids, label)`` samples into one padded batch.

    Sequences are padded with the ``<PAD>`` index from the module-level
    ``vocab`` up to the length of the longest sequence in the batch, with the
    batch dimension first.

    Args:
        batch: Iterable of ``(token_ids, label)`` pairs.

    Returns:
        tuple: ``(padded_ids, labels)`` — the padded sequence tensor and a
        tensor built from the per-sample labels.
    """
    sequences, targets = zip(*batch)
    pad_idx = vocab['<PAD>']
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
    label_tensor = torch.tensor(targets)
    return padded, label_tensor

BATCH_SIZE = 32  # single source of truth so both loaders use the same batch size

# Wrap the raw splits so each item is encoded to tensors when it is accessed.
train_dataset = IMDBDataset(train)
test_dataset = IMDBDataset(test)

# Only the training loader shuffles; both pad per batch through collate_fn.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
