## Actividad 2 NLP

Integrantes del equipo:
- David Guzmán Leyva - A01706417
- Enrique Santos Fraire - A01705746
- Leonardo Alvarado Menéndez - A01705998
- Oscar Enrique Delgadillo Ochoa - A01705935

Actividades:
1. Con base en el ejercicio realizado en la clase del 25 de octubre, mejorar el modelo de red neuronal recurrente para alcanzar un accuracy >= 90%.

In [1]:
import numpy as np
#PyTorch libraries
import torch
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.nn import functional as F
# Dataset and dataloader
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
# Libraries to prepare the data
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

In [2]:
torch.cuda.is_available()

False

In [3]:
# Use the GPU when one is available; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [4]:
train_dataset,  test_dataset = AG_NEWS()

In [5]:
train_dataset, test_dataset = to_map_style_dataset(train_dataset), to_map_style_dataset(test_dataset)

In [6]:
len(train_dataset), len(test_dataset)

(120000, 7600)

In [7]:
train_dataset[:10]

[(3,
  "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."),
 (3,
  'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'),
 (3,
  "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."),
 (3,
  'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'),
 (3,
  'Oil prices soar to all-time record, 

In [8]:
tokeniser = get_tokenizer('basic_english')
def yield_tokens(data):
  """Lazily yield the token list of each sample's text.

  Args:
    data: Iterable of 2-item ``(label, text)`` pairs; the label is ignored.

  Yields:
    Whatever the notebook-level ``tokeniser`` returns for each text.
  """
  yield from (tokeniser(text) for _, text in data)

In [9]:
# Build the vocabulary from the tokenised training texts; '<unk>' is the only
# special token, so it receives index 0.
# NOTE: this runs before the train/validation split, so texts that later end up
# in the validation set also contribute to the vocabulary.
vocab = build_vocab_from_iterator(yield_tokens(train_dataset), specials=['<unk>'])
# Tokens that are not in the vocabulary are looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [10]:
len(vocab)

95811

In [11]:
for i in range(20):
  print(i, vocab.lookup_token(i))

0 <unk>
1 .
2 the
3 ,
4 to
5 a
6 of
7 in
8 and
9 s
10 on
11 for
12 #39
13 (
14 )
15 -
16 '
17 that
18 with
19 as


In [12]:
tokens = tokeniser('Welcome to TE3007B')
print(tokens, vocab(tokens))

['welcome', 'to', 'te3007b'] [3314, 4, 0]


In [13]:
NUM_TRAIN = int(len(train_dataset) * 0.9)
NUM_VAL =len(train_dataset) - NUM_TRAIN

In [14]:
NUM_VAL

12000

In [15]:
# Seed the split so the train/validation partition is identical on every run.
# NOTE: this rebinds `train_dataset` to the training subset, so the cell must not
# be re-run without first re-running the cells that load the data and compute
# NUM_TRAIN / NUM_VAL.
train_dataset, val_dataset = random_split(
    train_dataset,
    [NUM_TRAIN, NUM_VAL],
    generator=torch.Generator().manual_seed(42),
)

In [16]:
len(train_dataset), len(val_dataset), len(test_dataset)

(108000, 12000, 7600)

In [17]:
max_tokens = 50

In [18]:
def collate_batch(batch):
  """Turn a list of ``(label, text)`` samples into fixed-length tensors.

  Each text is tokenised with the notebook-level ``tokeniser``, mapped to ids
  with ``vocab``, clipped to ``max_tokens`` ids and right-padded with index 0
  (the '<unk>' index in this notebook's vocabulary) up to ``max_tokens``.

  Args:
    batch: Sequence of ``(label, text)`` pairs with labels starting at 1.

  Returns:
    A pair ``(token_ids, targets)`` of ``torch.long`` tensors: ``token_ids``
    has one row of ``max_tokens`` ids per sample and ``targets`` holds the
    labels shifted down by one so that they start at 0.
  """
  raw_labels, texts = list(zip(*batch))
  rows = []
  for text in texts:
    # Clip first, then pad whatever is still missing.
    ids = vocab(tokeniser(text))[:max_tokens]
    ids = ids + [0] * (max_tokens - len(ids))
    rows.append(ids)
  token_ids = torch.tensor(rows, dtype=torch.long)
  targets = torch.tensor(raw_labels, dtype=torch.long) - 1
  return token_ids, targets

In [19]:
labels = ['World', 'Sports', 'Business', 'Sci/Tech']
BATCH_SIZE = 1024

In [20]:
# DataLoaders
# Only the training data is shuffled: sample order matters for SGD, whereas
# evaluation metrics do not depend on it, so the validation and test loaders
# keep a fixed order and yield the same batches on every pass.
train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          collate_fn=collate_batch,
                          shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=BATCH_SIZE,
                        collate_fn=collate_batch,
                        shuffle=False)
test_loader = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         collate_fn=collate_batch,
                         shuffle=False)

In [21]:
for i, (x, y) in enumerate(test_loader):
  print(i, x.shape, y.shape)

0 torch.Size([1024, 50]) torch.Size([1024])
1 torch.Size([1024, 50]) torch.Size([1024])
2 torch.Size([1024, 50]) torch.Size([1024])
3 torch.Size([1024, 50]) torch.Size([1024])
4 torch.Size([1024, 50]) torch.Size([1024])
5 torch.Size([1024, 50]) torch.Size([1024])
6 torch.Size([1024, 50]) torch.Size([1024])
7 torch.Size([432, 50]) torch.Size([432])


In [22]:
# Hyperparameters passed to the RNN, LSTM and GRU models instantiated below.
EMBEDDING_SIZE = 200  # dimension of each token embedding
NEURONS = 200 # hidden-state size of the recurrent layers
LAYERS = 3  # number of stacked recurrent layers
NUM_CLASSES = 4  # number of output logits of the final linear layer

### Let us build our RNN

In [23]:
class RNN_Model_1(nn.Module):
  """Text classifier: embedding -> stacked vanilla RNN -> linear head.

  Args:
    embed_size: Dimension of each token embedding (input size of the RNN).
    hidden: Size of the RNN hidden state.
    layers: Number of stacked RNN layers.
    num_classes: Number of output logits.
    vocab_size: Number of rows in the embedding table. When omitted, it falls
      back to ``len(vocab)`` using the notebook-level ``vocab``, which is what
      the class always did before this parameter existed.
  """

  def __init__(self, embed_size, hidden, layers, num_classes, vocab_size=None):
    super().__init__()
    if vocab_size is None:
      # Backward-compatible default: size the table from the global vocabulary.
      vocab_size = len(vocab)
    self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,
                                        embedding_dim=embed_size)

    self.rnn = nn.RNN(input_size=embed_size,
                      hidden_size=hidden,
                      num_layers=layers,
                      batch_first=True)

    self.fc = nn.Linear(in_features=hidden, out_features=num_classes)

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids laid out as (batch, sequence), matching
        the ``batch_first=True`` setting of the recurrent layer.

    Returns:
      Tensor with one row of ``num_classes`` logits per sample.
    """
    vector_embs = self.embedding_layer(x)
    # Only the per-step outputs are used; the final hidden state is discarded.
    outputs, _ = self.rnn(vector_embs)
    # Classify from the top layer's output at the last time step.
    return self.fc(outputs[:, -1])
    

### Let us build our LSTM

In [24]:
class LSTM_Model_1(nn.Module):
  """Text classifier: embedding -> stacked LSTM -> linear head.

  Args:
    embed_size: Dimension of each token embedding (input size of the LSTM).
    hidden: Size of the LSTM hidden state.
    layers: Number of stacked LSTM layers.
    num_classes: Number of output logits.
    vocab_size: Number of rows in the embedding table. When omitted, it falls
      back to ``len(vocab)`` using the notebook-level ``vocab``, which is what
      the class always did before this parameter existed.
  """

  def __init__(self, embed_size, hidden, layers, num_classes, vocab_size=None):
    super().__init__()
    if vocab_size is None:
      # Backward-compatible default: size the table from the global vocabulary.
      vocab_size = len(vocab)
    self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,
                                        embedding_dim=embed_size)

    self.lstm = nn.LSTM(input_size=embed_size,
                        hidden_size=hidden,
                        num_layers=layers,
                        batch_first=True)

    self.fc = nn.Linear(in_features=hidden, out_features=num_classes)

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids laid out as (batch, sequence), matching
        the ``batch_first=True`` setting of the recurrent layer.

    Returns:
      Tensor with one row of ``num_classes`` logits per sample.
    """
    vector_embs = self.embedding_layer(x)
    # Only the per-step outputs are used; the final (hidden, cell) state pair
    # returned by the LSTM is discarded.
    outputs, _ = self.lstm(vector_embs)
    # Classify from the top layer's output at the last time step.
    return self.fc(outputs[:, -1])

### Let us build our GRU

In [25]:
class GRU_Model_1(nn.Module):
  """Text classifier: embedding -> stacked GRU -> linear head.

  Args:
    embed_size: Dimension of each token embedding (input size of the GRU).
    hidden: Size of the GRU hidden state.
    layers: Number of stacked GRU layers.
    num_classes: Number of output logits.
    vocab_size: Number of rows in the embedding table. When omitted, it falls
      back to ``len(vocab)`` using the notebook-level ``vocab``, which is what
      the class always did before this parameter existed.
  """

  def __init__(self, embed_size, hidden, layers, num_classes, vocab_size=None):
    super().__init__()
    if vocab_size is None:
      # Backward-compatible default: size the table from the global vocabulary.
      vocab_size = len(vocab)
    self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,
                                        embedding_dim=embed_size)

    self.gru = nn.GRU(input_size=embed_size,
                      hidden_size=hidden,
                      num_layers=layers,
                      batch_first=True)

    self.fc = nn.Linear(in_features=hidden, out_features=num_classes)

  def forward(self, x):
    """Return class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids laid out as (batch, sequence), matching
        the ``batch_first=True`` setting of the recurrent layer.

    Returns:
      Tensor with one row of ``num_classes`` logits per sample.
    """
    vector_embs = self.embedding_layer(x)
    # Only the per-step outputs are used; the final hidden state is discarded.
    outputs, _ = self.gru(vector_embs)
    # Classify from the top layer's output at the last time step.
    return self.fc(outputs[:, -1])

### Check models

In [26]:
rnn_model = RNN_Model_1(EMBEDDING_SIZE, NEURONS, LAYERS, NUM_CLASSES)
lstm_model = LSTM_Model_1(EMBEDDING_SIZE, NEURONS, LAYERS, NUM_CLASSES)
gru_model = GRU_Model_1(EMBEDDING_SIZE, NEURONS, LAYERS, NUM_CLASSES)

In [27]:
rnn_model

RNN_Model_1(
  (embedding_layer): Embedding(95811, 200)
  (rnn): RNN(200, 200, num_layers=3, batch_first=True)
  (fc): Linear(in_features=200, out_features=4, bias=True)
)

In [28]:
lstm_model

LSTM_Model_1(
  (embedding_layer): Embedding(95811, 200)
  (lstm): LSTM(200, 200, num_layers=3, batch_first=True)
  (fc): Linear(in_features=200, out_features=4, bias=True)
)

In [29]:
gru_model

GRU_Model_1(
  (embedding_layer): Embedding(95811, 200)
  (gru): GRU(200, 200, num_layers=3, batch_first=True)
  (fc): Linear(in_features=200, out_features=4, bias=True)
)

### Apply our model

In [30]:
def accuracy(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is switched to eval mode (and left in it) and moved to the
    notebook-level ``device``; no gradients are tracked during the pass.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        loader: Iterable of ``(inputs, targets)`` tensor batches.

    Returns:
        Accuracy as a Python float in [0, 1].
    """
    model.eval()
    model = model.to(device=device)
    hits, seen = 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device=device, dtype=torch.long)
            targets = targets.to(device=device, dtype=torch.long)
            # Predicted class = index of the highest score in each row.
            predicted = model(inputs).max(dim=1).indices
            hits += (predicted == targets).sum().item()
            seen += predicted.size(0)
    return float(hits) / seen

In [31]:
def train(model, optimiser, epochs=100):
  """Train ``model`` on the notebook-level ``train_loader``.

  After every epoch the validation accuracy on ``val_loader`` is computed with
  ``accuracy`` and printed together with the loss of the epoch's last batch.

  Args:
    model: Module to optimise; it is moved to the notebook-level ``device``.
    optimiser: Optimiser already bound to the model's parameters.
    epochs: Number of full passes over ``train_loader``.
  """
  model = model.to(device=device)
  for epoch in range(epochs):
    # accuracy() leaves the model in eval mode at the end of each epoch, so
    # training mode is re-enabled before the next pass over the data.
    model.train()
    for inputs, targets in train_loader:
      inputs = inputs.to(device=device, dtype=torch.long)
      targets = targets.to(device=device, dtype=torch.long)
      # Clear gradients left over from the previous step.
      optimiser.zero_grad()
      # Forward pass and cross-entropy loss for this batch.
      logits = model(inputs)
      cost = F.cross_entropy(input=logits, target=targets)
      # Backpropagate and update the parameters.
      cost.backward()
      optimiser.step()
    acc = accuracy(model, val_loader)
    print(f'Epoch {epoch}, costo {cost.item():.4f}, val acc {acc:.4f}')


In [32]:
# Training configuration for the GRU run below.
epochs = 10  # number of passes over the training loader
lr = 0.0005  # learning rate handed to Adam

# The optimiser is bound to the GRU model's parameters only.
optimiser = torch.optim.Adam(gru_model.parameters(), lr=lr)

In [33]:
train(gru_model, optimiser, epochs)

Epoch 0, costo 0.6248, val acc 0.8049
Epoch 1, costo 0.4216, val acc 0.8684
Epoch 2, costo 0.3070, val acc 0.8848
Epoch 3, costo 0.3011, val acc 0.8942
Epoch 4, costo 0.1583, val acc 0.8968
Epoch 5, costo 0.1454, val acc 0.9024
Epoch 6, costo 0.1213, val acc 0.9057
Epoch 7, costo 0.1382, val acc 0.9017
Epoch 8, costo 0.1461, val acc 0.8996
Epoch 9, costo 0.0853, val acc 0.8977


In [34]:
print(f'{accuracy(gru_model, test_loader):.4f}')

0.8996
