# Introduction to PyTorch
- Learn about PyTorch with TOConnect 2021
- Work with NLP on the Yelp Review Polarity dataset

# Mount the Google Drive

In [None]:
# Mount Google Drive at /content/drive/ using the google.colab helper
# (presumably only importable inside a Colab runtime).
from google.colab import drive 
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
%%capture
!pip install transformers
import transformers

# Import Dependencies

In [None]:
# ----------TORCH IMPORTS--------------
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

# ---------DATA SCIENCE IMPORTS-----------
import numpy as np
import pandas as pd 
import nltk
from tqdm.notebook import tqdm

# NLTK: download the 'punkt' tokenizer data
nltk.download('punkt')
# Run on the GPU when PyTorch can see one, otherwise on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load the 'train' and 'test' splits of Yelp Review Polarity through torchtext,
# using '.data' as the dataset root directory.
train, test = torchtext.datasets.YelpReviewPolarity(root = '.data', split = ('train', 'test'))

In [None]:
# Grab the first TRAIN_SIZE examples of the Two Splits(Train + Test)
TRAIN_SIZE = 10000 # For Faster Training(but worse performance), 

def take_examples(dataset, limit):
  """Collect at most `limit` examples from an iterable of (label, text) pairs.

  Iteration stops as soon as `limit` examples have been read, so the
  remainder of the dataset is never pulled into memory.

  Args:
    dataset: iterable yielding (label, text) tuples.
    limit: maximum number of examples to keep.
  Returns:
    (texts, labels): two parallel lists.
  """
  texts, labels = [], []
  # range() comes first so zip() stops before fetching an extra example
  for _, (label, text) in zip(range(limit), dataset):
    texts.append(text)
    labels.append(label)
  return texts, labels

Train_X, Train_Y = take_examples(train, TRAIN_SIZE)
# The test split is capped at TRAIN_SIZE examples as well
Test_X, Test_Y = take_examples(test, TRAIN_SIZE)


In [None]:
import string

In [None]:
# Tokenize every review into a list of words
# Translation table mapping each punctuation character to a space; built once
# here instead of once per review.
PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

def tokenize_reviews(texts):
  """Replace punctuation with spaces and word-tokenize every text.

  Args:
    texts: iterable of strings.
  Returns:
    A list holding, for each text, the token list returned by
    nltk.word_tokenize.
  """
  return [nltk.word_tokenize(text.translate(PUNCT_TO_SPACE)) for text in tqdm(texts)]

TOKENIZED_TRAIN_X = tokenize_reviews(Train_X)
TOKENIZED_TEST_X = tokenize_reviews(Test_X)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# LOAD IN THE VOCAB
# The pretrained 'bert-base-uncased' tokenizer is loaded here; the following
# cells read its vocabulary and its pad / unk token ids.
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Word -> index table taken from the tokenizer (Uncased Vocabulary)
WORD2IDX = tokenizer.vocab
# Inverse table: index -> word
IDX2WORD = {index: token for token, index in WORD2IDX.items()}
# Ids of the padding and unknown-word special tokens
PAD_TOKEN, UNK_TOKEN = tokenizer.pad_token_id, tokenizer.unk_token_id

In [None]:
# Encode all Values 
MAX_LEN = 64# Approx Max we will accept 

def encode_sentence(tokens, max_len = MAX_LEN):
  """Map one tokenized sentence to a fixed-length list of vocabulary ids.

  Each token is lower-cased before the lookup; tokens that are not in
  WORD2IDX are encoded as UNK_TOKEN. The sentence is cut to `max_len` tokens
  and right-padded with PAD_TOKEN, so the result always has `max_len` ids.

  Args:
    tokens: list of word strings.
    max_len: length of the returned id list.
  Returns:
    A list of `max_len` integer ids.
  """
  ids = [WORD2IDX.get(token.lower(), UNK_TOKEN) for token in tokens[:max_len]]
  # Pad up to max length
  return ids + [PAD_TOKEN] * (max_len - len(ids))

# The same encoder is applied to both splits
ENCODED_TRAIN_X = [encode_sentence(sent) for sent in TOKENIZED_TRAIN_X]
ENCODED_TEST_X = [encode_sentence(sent) for sent in TOKENIZED_TEST_X]

In [None]:
# Convert the lists of encoded sentences to numpy arrays
# (every encoded sentence was padded / truncated to MAX_LEN ids above).
ARRAY_TRAIN_X = np.array(ENCODED_TRAIN_X)
ARRAY_TEST_X = np.array(ENCODED_TEST_X)

# Create a Dataset Object 

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Index-able pairing of encoded inputs with their labels.

  Args:
    tokenized_x: indexable collection of encoded examples.
    y: indexable collection of labels, aligned with `tokenized_x`.
  """

  def __init__(self, tokenized_x, y):
    self.X, self.y = tokenized_x, y

  def __len__(self):
    """Number of examples, taken from the inputs."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return (input, label) for position `idx` as numpy arrays.

    The stored label is shifted down by one (presumably to turn 1/2 class
    labels into 0/1 targets -- confirm against the dataset).
    """
    features = self.X[idx]
    shifted_label = self.y[idx] - 1
    return np.array(features), np.array(shifted_label)
  

In [None]:
# Create DataLoaders
BATCH_SIZE = 128 # the batch size that was actually passed to get_dataloader

def get_dataloader(num_examples, batch_size):
  """Build the train and test DataLoaders.

  Args:
    num_examples: maximum number of examples taken from each split.
    batch_size: number of examples per batch for both loaders.
  Returns:
    (train_dataloader, test_dataloader)
  """
  train_dataset = Dataset(ARRAY_TRAIN_X[:num_examples], Train_Y[:num_examples])
  # The test loader must read the TEST arrays and labels, not the training ones
  test_dataset = Dataset(ARRAY_TEST_X[:num_examples], Test_Y[:num_examples])
  # Shuffle only the training data; evaluation keeps a fixed order
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle = True)
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle = False)
  return train_dataloader, test_dataloader

train_dataloader, test_dataloader = get_dataloader(TRAIN_SIZE, BATCH_SIZE)

# Create a Custom Model (Model 1: RNN/LSTM Model)

In [None]:
class CustomRNN(nn.Module):
  """Bidirectional LSTM that produces one logit per input sequence.

  Args:
    vocab_len: number of entries in the embedding table.
    padding_idx: token id handed to nn.Embedding as `padding_idx`.
    embedding_dim: size of each token embedding (default 50).
    hidden_dim: LSTM hidden size per direction (default 16).
    num_layers: number of stacked LSTM layers (default 1).
  """
  def __init__(self, vocab_len, padding_idx, embedding_dim = 50, hidden_dim = 16, num_layers = 1):
    super().__init__()

    self.padding_idx = padding_idx
    self.vocab_len = vocab_len
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers

    self.embedding_layer = nn.Embedding(self.vocab_len, self.embedding_dim, padding_idx = self.padding_idx)

    self.RNN = nn.LSTM(self.embedding_dim, self.hidden_dim, self.num_layers, batch_first = True, bidirectional = True)

    # The final hidden states of both directions of every layer are
    # concatenated, hence hidden_dim * 2 * num_layers input features.
    self.classifier = nn.Linear(self.hidden_dim * 2 * self.num_layers, 1)

  def forward(self, x):
    """Compute one logit per sequence.

    Args:
      x: integer tensor of token ids, batch first (N, sequence length).
    Returns:
      Tensor of shape (N,) holding the raw (pre-sigmoid) logits.
    """
    # Call the sub-modules themselves (not .forward) so registered hooks run
    embedding = self.embedding_layer(x)
    _, (hidden, _) = self.RNN(embedding)

    # hidden - The final state we use
    # hidden: Tensor(Num layers * 2, N, hidden_dim) -> (N, Num layers * 2, hidden_dim)
    hidden = hidden.transpose(0, 1)
    batch = hidden.shape[0]

    # Flatten the per-layer / per-direction states of every example
    hidden = hidden.reshape(batch, -1)
    # classifier head
    logits = self.classifier(hidden)
    return logits.reshape(-1)

In [None]:
class Trainer:
  '''
  Trainer class to Train a binary classifier 

  The wrapped model must return one logit per example. Training uses Adam
  and BCEWithLogitsLoss, on the module-level `device`.

  Args:
    model: the nn.Module to train (moved to `device`).
    lr: learning rate for Adam (default 5e-4).
  '''
  def __init__(self, model, lr = 5e-4):
    self.device = device
    self.model = model.to(self.device)

    self.optimizer = optim.Adam(self.model.parameters(), lr = lr)

    self.loss_function = nn.BCEWithLogitsLoss() 

  def save(self, path = './model.pth'):
    '''Save the model's state_dict to `path` (default './model.pth').'''
    torch.save(self.model.state_dict(), path)

  def training_step(self, x, y):
    '''Run one optimizer update on the batch (x, y).'''
    self.model.train()
    self.optimizer.zero_grad()
    # Call the module (not .forward) so registered hooks are executed
    outputs = self.model(x)

    loss = self.loss_function(outputs, y)

    loss.backward()
    self.optimizer.step()

  def evaluation_step(self, x, y):
    '''Evaluate one batch without tracking gradients.

    Returns:
      (loss, accuracy): two tensors; accuracy is the fraction of examples
      whose rounded sigmoid output equals the target.
    '''
    self.model.eval()
    with torch.no_grad():
      output = self.model(x)
      loss = self.loss_function(output, y)

      # sigmoid -> probability, round -> hard 0/1 prediction
      predictions = torch.round(torch.sigmoid(output))

      correct = predictions == y
      accuracy = torch.sum(correct) / correct.numel()
    return loss, accuracy

  def train_model(self, train_dataloader):
    '''Run one pass (epoch) over `train_dataloader`.'''
    for x, y in tqdm(train_dataloader):
      x = x.long().to(self.device)
      y = y.float().to(self.device)
      self.training_step(x, y)

  def evaluate_model(self, eval_dataloader):
    '''Print and return the mean loss and accuracy over `eval_dataloader`.

    Every batch is weighted by its number of examples, so a smaller final
    batch does not skew the averages.

    Returns:
      (mean_loss, mean_accuracy) as Python floats; both are NaN when the
      loader yields no examples.
    '''
    total_loss = 0.0
    total_accuracy = 0.0
    num_examples = 0
    for x, y in tqdm(eval_dataloader):
      x = x.long().to(self.device)
      y = y.float().to(self.device)
      loss, acc = self.evaluation_step(x, y)
      batch_size = y.shape[0]
      total_loss += loss.item() * batch_size
      total_accuracy += acc.item() * batch_size
      num_examples += batch_size

    if num_examples == 0:
      # Nothing to average: report NaN instead of dividing by zero
      mean_loss = mean_accuracy = float('nan')
    else:
      mean_loss = total_loss / num_examples
      mean_accuracy = total_accuracy / num_examples
    print(mean_loss, mean_accuracy)
    return mean_loss, mean_accuracy

  def train_whole_model(self, num_epochs, train_dataloader, eval_dataloader):
    '''Alternate one training pass and one evaluation pass for `num_epochs` epochs.'''
    for _ in range(num_epochs):
      print("--------TRAINING---------")
      self.train_model(train_dataloader)
      print("--------EVALUATION-------")
      self.evaluate_model(eval_dataloader)
      
      

In [None]:
# padding_idx must be the id the encoded sequences are padded with (PAD_TOKEN),
# not the last index of the vocabulary.
trainer = Trainer(CustomRNN(len(WORD2IDX), PAD_TOKEN))

In [None]:
NUM_EPOCHS = 50
# Evaluate with test_dataloader (the second loader returned by get_dataloader)
# instead of passing train_dataloader for both training and evaluation.
trainer.train_whole_model(NUM_EPOCHS, train_dataloader, test_dataloader)

In [None]:
# Persist the trained model's state_dict through Trainer.save (written to ./model.pth)
trainer.save()