<a href="https://colab.research.google.com/github/VAIBHAV2900/DLSS_6_Vision/blob/main/Session_7_NLP/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning for Natural Language Processing
Summer School 2021 \
Week 3 Session 2

---


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Tokenizer

During processing, spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on. This is done by applying rules specific to each language.

![Tokenizer](https://spacy.io/tokenization-9b27c0f6fe98dcb26239eba4d3ba1f3d.svg)

In [None]:
from spacy.lang.en import English

# Create an English language object and take its tokenizer component.
nlp = English()
tokenizer = nlp.tokenizer

sentence = "Try any sentence of your wish or don't!"

# First the token count, then the (rule, token) pairs reported by explain().
tokens = tokenizer(sentence)
print(len(tokens))
print(tokenizer.explain(sentence))

10
[('TOKEN', 'Try'), ('TOKEN', 'any'), ('TOKEN', 'sentence'), ('TOKEN', 'of'), ('TOKEN', 'your'), ('TOKEN', 'wish'), ('TOKEN', 'or'), ('SPECIAL-1', 'do'), ('SPECIAL-2', "n't"), ('SUFFIX', '!')]


## Word Representation
### GloVe Embeddings
GloVe exploits the overall co-occurrence statistics of the word corpus.
* The resulting representations showcase interesting linear substructures of the word vector space.

    - The Euclidean distance between two word vectors provides an effective method for measuring the linguistic or semantic similarity of the corresponding words.

[Download the pretrained weights.](https://nlp.stanford.edu/projects/glove/)

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!mkdir embeddings
!mkdir embeddings/glove.6B
!unzip "/content/glove.6B.zip" -d "/content/embeddings/glove.6B"

--2021-07-22 07:33:14--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-07-22 07:33:15--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-07-22 07:35:56 (5.10 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  /content/glove.6B.zip
  inflating: /content/embeddings/glove.6B/glove.6B.50d.txt  
  inflating: /content/embeddings/glove.6B/glove.6B.100d.txt  
  inflating: /content/embeddings/glov

In [None]:
# Parse the 50-dimensional GloVe text file into a {word: vector} dictionary.
embeddings = {}
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale (the GloVe text files are UTF-8).
with open("/content/embeddings/glove.6B/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # The last 50 fields are the vector components; whatever precedes
        # them is the token itself (re-joined in case it was split).
        word = " ".join(values[:-50])
        vector = np.asarray(values[-50:], "float64")
        embeddings[word] = vector

In [None]:
from scipy import spatial

def find_closest_word(word):
    """Return all words in `embeddings`, nearest first, by Euclidean distance to `word`.

    The query word itself is part of the result (at distance zero).
    Raises KeyError when `word` has no entry in `embeddings`.
    """
    anchor = embeddings[word]

    def distance_to_anchor(candidate):
        return spatial.distance.euclidean(embeddings[candidate], anchor)

    ranked = list(embeddings.keys())
    ranked.sort(key=distance_to_anchor)
    return ranked

# Rank the whole vocabulary by distance to "king" and show the four nearest
# entries (the first one is the query word itself).
closest = find_closest_word("king")
print(closest[:4])

['king', 'prince', 'queen', 'uncle']


## Recurrent Neural Network

![Architecture](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-SimpleRNN.png)

RNNs are a type of neural network architecture in which the output from the previous step is fed as an additional input to the current step.

In [None]:
import torch
import torch.nn as nn

import os

### From Scratch

In [None]:
class RNN_scratch(nn.Module):
    """Single-step recurrent cell assembled from two linear layers.

    Each call to ``forward`` consumes one time step: the input and the
    previous hidden state are concatenated and projected twice, once to the
    next hidden state and once to the output scores.

    Args:
        input_size: number of features in one input step.
        hidden_size: number of features in the hidden state.
        output_size: number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Zero-argument super() always resolves to this class. The previous
        # `super(RNN, self)` referred to a different name and failed on
        # construction.
        super().__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.tanh = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        """Advance one time step.

        Args:
            input_tensor: tensor of shape (batch, input_size).
            hidden_tensor: tensor of shape (batch, hidden_size).

        Returns:
            (output, hidden): log-probabilities of shape (batch, output_size)
            and the new hidden state of shape (batch, hidden_size).
        """
        combined = torch.cat((input_tensor, hidden_tensor), 1)
        hidden = self.tanh(self.i2h(combined))
        # NOTE(review): tanh bounds the scores to [-1, 1] before LogSoftmax,
        # which limits how confident the output can be. Kept as written.
        output = self.tanh(self.i2o(combined))
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros(1, self.hidden_size)

## Implementation

### Preprocessing

In [None]:
!gdown --id 1KAScc_mAbz5soyxeMVWvzy036lkwHY0F
!unzip "/content/embeddings.zip" -d "/content"

Downloading...
From: https://drive.google.com/uc?id=1KAScc_mAbz5soyxeMVWvzy036lkwHY0F
To: /content/embeddings.zip
2.18GB [00:14, 151MB/s]
Archive:  /content/embeddings.zip
   creating: /content/embeddings/glove.840B.300d/
  inflating: /content/embeddings/glove.840B.300d/glove.840B.300d.txt  


In [None]:
# Fetch train.csv from Google Drive into the current working directory.
!gdown --id 1CalhcgxPu00vKQAHNba_8zPcsNrLExXx

Downloading...
From: https://drive.google.com/uc?id=1CalhcgxPu00vKQAHNba_8zPcsNrLExXx
To: /content/train.csv
124MB [00:01, 90.9MB/s]


In [None]:
# Capture the current working directory through the IPython %pwd magic; the
# later cells build all data paths from it.
PATH = %pwd
print(PATH)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

/content


In [None]:
# Load the labelled questions, shuffle them and keep only text and label.
main_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
print(main_df.shape)
# Shuffle every row with a fixed seed: the later train/valid/test split is
# positional, so an unseeded shuffle would give different splits on each run.
main_df = main_df.sample(frac=1, random_state=42)
main_df = main_df[["question_text", "target"]]
main_df.head()

(1306122, 3)


Unnamed: 0,question_text,target
629935,Why is colloidal oatmeal beneficial in skincare?,0
780706,The magnitude of electric fields in an electro...,0
423082,How can a god of kama be so vital at this stag...,0
1134560,How mass media portray nihilism?,0
108481,What will happen to a LDS Apostle if he is a s...,0


In [None]:
# Partition the rows by label value and report the size of each partition.
is_zero = main_df["target"] == 0
is_one = main_df["target"] == 1

o_class = main_df[is_zero]
print(len(o_class))
l_class = main_df[is_one]
print(len(l_class))

1225312
80810


In [None]:
# Positional row ranges shared by both classes: the first 1024 rows go to
# test, rows 1024..32767 to validation, and everything after that to train.
test_rows = slice(None, 1024)
valid_rows = slice(1024, 32768)
train_rows = slice(32768, None)

test_o, test_l = o_class.iloc[test_rows], l_class.iloc[test_rows]
valid_o, valid_l = o_class.iloc[valid_rows], l_class.iloc[valid_rows]
train_o, train_l = o_class.iloc[train_rows], l_class.iloc[train_rows]

In [None]:
# Stack the per-class pieces row-wise into one frame per split.
train = pd.concat([train_o, train_l])
valid = pd.concat([valid_o, valid_l])
test = pd.concat([test_o, test_l])

# Report the shapes in train / valid / test order.
for split in (train, valid, test):
    print(split.shape)

(1240586, 2)
(63488, 2)
(2048, 2)


In [None]:
# Create the output directory without failing when it already exists, so the
# cell can be re-run (a plain `mkdir` errors on the second run).
os.makedirs(os.path.join(PATH, "inputs"), exist_ok=True)

# Persist each split without the index column; CreateDataset reads these files.
train.to_csv(os.path.join(PATH, "inputs/train.csv"), index=False)
test.to_csv(os.path.join(PATH, "inputs/test.csv"), index=False)
valid.to_csv(os.path.join(PATH, "inputs/valid.csv"), index=False)

In [None]:
# Drop the names of the intermediate DataFrames now that the splits have been
# written to disk.
del main_df, train, test, valid, train_l, train_o, test_l, test_o, valid_l,valid_o, o_class, l_class

### Dataloader

In [None]:
import torch
import os
import spacy
import nltk
import torchtext

class CreateDataset(torch.utils.data.Dataset):
    """Bundle the torchtext fields, CSV splits, vocabularies and batch iterators.

    Constructing an instance runs the whole pipeline: it reads the three CSV
    files under ``<PATH>/inputs/``, builds the text and label vocabularies
    from the training split, and creates one iterator per split.

    Args:
        PATH: directory containing ``inputs/`` and ``embeddings/``.
        batch_size: batch size handed to the iterators.
    """

    def __init__(self, PATH, batch_size=32):
        self.PATH = PATH
        self.batch_size = batch_size
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        self.spacy = spacy.load("en_core_web_sm")

        legacy_data = torchtext.legacy.data
        self.TEXT = legacy_data.Field(sequential=True, tokenize="spacy")
        self.LABEL = legacy_data.LabelField(dtype=torch.long, sequential=False)

        # Each stage uses attributes set by the previous one, so order matters.
        for stage in (self.initData, self.initEmbed, self.makeData):
            stage()

    def initData(self):
        """Load train.csv / valid.csv / test.csv into tabular datasets."""
        data_dir = os.path.join(self.PATH, 'inputs/')
        column_fields = [('Text', self.TEXT), ('Label', self.LABEL)]

        splits = torchtext.legacy.data.TabularDataset.splits(
            path=data_dir,
            train="train.csv",
            validation="valid.csv",
            test="test.csv",
            format="csv",
            skip_header=True,
            fields=column_fields,
        )
        self.train_data, self.valid_data, self.test_data = splits

    def initEmbed(self):
        """Build both vocabularies from the training split, attaching GloVe vectors to TEXT."""
        vectors_path = os.path.join(self.PATH, "embeddings/glove.840B.300d/glove.840B.300d.txt")
        pretrained = torchtext.vocab.Vectors(vectors_path)

        self.TEXT.build_vocab(
            self.train_data,
            vectors=pretrained,
            max_size=20000,
            min_freq=10,
        )
        self.LABEL.build_vocab(self.train_data)

    def makeData(self):
        """Create one bucketed iterator per split, keyed on text length."""
        datasets = (self.train_data, self.valid_data, self.test_data)
        iterators = torchtext.legacy.data.BucketIterator.splits(
            datasets,
            sort_key=lambda example: len(example.Text),
            batch_size=self.batch_size,
            device=self.device,
        )
        self.train_iterator, self.valid_iterator, self.test_iterator = iterators

    def lengthData(self):
        """Return the number of examples as (train, valid, test)."""
        splits = (self.train_data, self.valid_data, self.test_data)
        return tuple(len(split) for split in splits)

    def lengthVocab(self):
        """Return (TEXT vocabulary size, LABEL vocabulary size)."""
        text_vocab = self.TEXT.vocab
        label_vocab = self.LABEL.vocab
        return len(text_vocab), len(label_vocab)

    def freqLABEL(self):
        """Return the frequency counter stored on the LABEL vocabulary."""
        label_vocab = self.LABEL.vocab
        return label_vocab.freqs

    def getData(self):
        """Return the (train, valid, test) iterators."""
        return (self.train_iterator, self.valid_iterator, self.test_iterator)

    def getEmbeddings(self):
        """Return the vector matrix stored on the TEXT vocabulary."""
        text_vocab = self.TEXT.vocab
        return text_vocab.vectors

### Model
Using the built-in PyTorch implementation (`torch.nn.RNN`).

In [None]:
import torch

class RNN(torch.nn.Module):
    """Embedding -> single-layer ``torch.nn.RNN`` -> linear classifier.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each embedding vector.
        hidden_dim: width of the recurrent hidden state.
        output_dim: number of output scores per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Score a batch of token-index sequences.

        ``text`` is indexed as (sequence, batch), the default layout of
        ``torch.nn.RNN``. The result keeps the leading dimension of the
        final hidden state, i.e. it has shape (1, batch, output_dim).
        """
        sequence_output, final_hidden = self.rnn(self.embedding(text))

        # Sanity check: the last time step of the output is the final hidden state.
        last_step = sequence_output[-1, :, :]
        assert torch.equal(last_step, final_hidden.squeeze(0))

        return self.linear(final_hidden)

### Training

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training loop saves its checkpoint
# below this mount point.
drive.mount('/content/drive')

In [None]:
# Install pyprind, which provides the progress bars used by train() and evaluate().
!pip3 install pyprind

In [None]:
# Run the whole data pipeline: CreateDataset's constructor reads the CSV
# splits, builds the vocabularies and creates the batch iterators.
dataset = CreateDataset(PATH)

In [None]:
# Batch iterators and the pretrained vector matrix prepared by CreateDataset.
train_iterator, valid_iterator, test_iterator = dataset.getData()
pretrained_embeddings = dataset.getEmbeddings()

input_dim = dataset.lengthVocab()[0]  # size of the TEXT vocabulary
embedding_dim = 300  # matches the glove.840B.300d vectors loaded by CreateDataset
hidden_dim = 374
output_dim = 2  # one score per target class (0 and 1)
num_layers = 2  # NOTE(review): not passed to RNN(...) anywhere in the visible code
batch_size = 32  # NOTE(review): not passed to CreateDataset here; its own default is also 32

In [None]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

# Copy the pretrained vectors into the existing embedding weight in place.
# copy_() keeps the parameter's own storage and device and raises on a shape
# mismatch. Re-binding `.data` would silently replace the tensor and leave the
# model's parameters on mixed devices until the later model.to(device).
model.embedding.weight.data.copy_(pretrained_embeddings)
# Per-class loss weights. Presumably these up-weight the minority class (the
# class counts printed earlier are roughly 15:1); confirm that the LABEL
# vocabulary maps target 1 to index 1.
class_weights = torch.tensor([1.0, 15.0]).to(device)

In [None]:
import torch.optim as optim
import torch.nn as nn

# Plain SGD over every model parameter (the embedding matrix included).
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# Cross-entropy loss with the per-class weights defined in class_weights.
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# Move the model and the loss module (with its weight tensor) to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Directory on the mounted Drive where the training loop writes Quora.pt.
DRIVE = "/content/drive/MyDrive/Projects/Clubs/Analytics/Coordinator/Summer School" # Use your drive path

In [None]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: tensor of shape (batch, classes) holding per-class scores.
        y: tensor of shape (batch,) holding the true class indices.

    Returns:
        A 0-dim float tensor in [0, 1].
    """
    # Softmax is monotonic, so the arg-max of the raw scores already is the
    # predicted class; no normalisation is needed.
    predicted = preds.argmax(dim=1)
    correct = (predicted == y).float()
    return correct.sum() / float(len(correct))

In [None]:
# Per-epoch history. train() appends to the *train* lists and evaluate()
# appends to the *val* lists; the plotting cells read them back.
epoch_train_losses = []
epoch_test_losses = []  # not written to by any code visible in this notebook
epoch_val_losses = []
accu_train_epoch = []
accu_test_epoch = []  # not written to by any code visible in this notebook
accu_val_epoch = []

In [None]:
import pyprind

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Args:
        model: module called as ``model(batch.Text)``.
        iterator: iterable of batches exposing ``.Text`` and ``.Label``;
            must support ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, batch.Label)``.

    Returns:
        (mean loss, mean accuracy) over the batches, as Python floats.

    Side effects:
        Appends the same two means to the module-level lists
        ``epoch_train_losses`` and ``accu_train_epoch``.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')

    for batch in iterator:
        optimizer.zero_grad()

        # squeeze(0) removes the leading dimension of the model output when
        # that dimension has size 1.
        predictions = model(batch.Text).squeeze(0)
        loss = criterion(predictions, batch.Label)
        acc = binary_accuracy(predictions, batch.Label)

        loss.backward()
        optimizer.step()

        # Accumulate plain floats. Keeping the loss tensors themselves would
        # retain each batch's autograd graph for the whole epoch and would
        # leave device tensors in the history lists.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        bar.update()

    mean_loss = epoch_loss / len(iterator)
    mean_acc = epoch_acc / len(iterator)

    epoch_train_losses.append(mean_loss)
    accu_train_epoch.append(mean_acc)

    return mean_loss, mean_acc

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any parameters.

    Args:
        model: module called as ``model(batch.Text)``.
        iterator: iterable of batches exposing ``.Text`` and ``.Label``;
            must support ``len()``.
        criterion: loss called as ``criterion(predictions, batch.Label)``.

    Returns:
        (mean loss, mean accuracy) over the batches, as Python floats.

    Side effects:
        Appends the same two means to the module-level lists
        ``epoch_val_losses`` and ``accu_val_epoch``. This happens on every
        call, whichever split `iterator` comes from.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.eval()

    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:
            predictions = model(batch.Text).squeeze(0)
            loss = criterion(predictions, batch.Label)
            acc = binary_accuracy(predictions, batch.Label)

            # Plain floats keep device tensors out of the history lists and
            # out of the saved checkpoint.
            epoch_loss += loss.item()
            epoch_acc += acc.item()

            bar.update()

    mean_loss = epoch_loss / len(iterator)
    mean_acc = epoch_acc / len(iterator)

    epoch_val_losses.append(mean_loss)
    accu_val_epoch.append(mean_acc)

    return mean_loss, mean_acc

In [None]:
import gc
epochs = 2

# One training pass and one validation pass per epoch, followed by a
# checkpoint that overwrites the previous one.
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Latest entry of the validation history written by evaluate().
        'loss': epoch_val_losses[-1],
    }
    checkpoint_path = os.path.join(DRIVE, 'Quora.pt')
    torch.save(checkpoint, checkpoint_path)

    gc.collect()

### Results

In [None]:
# Loss history recorded by train() and evaluate(). Each entry is coerced to a
# Python float because matplotlib cannot convert 0-dim tensors that live on
# the GPU or track gradients.
fig, ax = plt.subplots()
ax.plot([float(v) for v in epoch_train_losses], label='Training Loss')
ax.plot([float(v) for v in epoch_val_losses], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss per epoch')
ax.legend()
plt.show()

In [None]:
# Accuracy history recorded by train() and evaluate(). Each entry is coerced
# to a Python float because matplotlib cannot convert 0-dim tensors that live
# on the GPU.
fig, ax = plt.subplots()
ax.plot([float(v) for v in accu_train_epoch], label='Accuracy Training')
ax.plot([float(v) for v in accu_val_epoch], label='Accuracy Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy per epoch')
ax.legend()
plt.show()

In [None]:
# Score the held-out test split.
# NOTE(review): evaluate() also appends its result to epoch_val_losses and
# accu_val_epoch, so re-running the plotting cells after this one adds the
# test point to the validation curves.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print("\nTest Loss:", test_loss)
print("\nTest Accuracy:", test_acc)