<a href="https://colab.research.google.com/github/Shrey-Viradiya/PyTorch_for_DL/blob/master/PyTorch_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch_Text Classification

In [1]:
!nvidia-smi

Fri Oct  2 17:29:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
!unzip -n trainingandtestdata.zip

URL transformed to HTTPS due to an HSTS policy
--2020-10-02 17:29:20--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip.2’


2020-10-02 17:29:21 (75.1 MB/s) - ‘trainingandtestdata.zip.2’ saved [81363704/81363704]

Archive:  trainingandtestdata.zip


In [3]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torchtext import data 
import torchtext
from pathlib import Path
import pandas as pd
import spacy

## Loading & Data Cleaning

In [4]:
tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", engine="python", header=None)

In [5]:
tweetsDF[0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [6]:
tweetsDF['sentiment_cat'] = tweetsDF[0].astype('category')
tweetsDF["sentiment"] = tweetsDF["sentiment_cat"].cat.codes
tweetsDF.to_csv("train-processed.csv", header=None, index=None)      
tweetsDF.sample(10000).to_csv("train-processed-sample.csv", header=None, index=None)

In [7]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', lower=True)

In [8]:
 fields = [('score',None), ('id',None),('date',None),('query',None),
      ('name',None),
      ('tweet', TWEET),('category',None),('label',LABEL)]

## Create our Dataset and DataLoaders

In [9]:
twitterDataset = torchtext.data.TabularDataset(
    path = 'train-processed-sample.csv',
    format = 'CSV',
    fields = fields,
    skip_header = False
)

In [10]:
(train, test, valid)=twitterDataset.split(split_ratio=[0.6,0.2,0.2],stratified=True, strata_field='label')

(len(train),len(test),len(valid))

(6000, 2000, 2000)

In [11]:
vars(train.examples[7])

{'label': '0',
 'tweet': ['dude',
  '!',
  'you',
  'mean',
  'nothing',
  'to',
  'me',
  'now',
  'but',
  'it',
  'does',
  'nt',
  'seem',
  'like',
  'it']}

In [12]:
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)
LABEL.build_vocab(train)
TWEET.vocab.freqs.most_common(10)

[('i', 3757),
 ('!', 3476),
 ('.', 3037),
 ('to', 2220),
 (' ', 2160),
 ('the', 1927),
 (',', 1800),
 ('a', 1506),
 ('my', 1251),
 ('it', 1139)]

In [13]:
device = "cuda"

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train, valid, test), 
batch_size = 32,
device = device,
sort_key = lambda x: len(x.tweet),
sort_within_batch = False)

## Our First LSTM

In [14]:
class OurFirstLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(OurFirstLSTM, self).__init__()
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,  
                hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)

    def forward(self, seq):
        output, (hidden,_) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds

model = OurFirstLSTM(100,300, 20002)
model.to(device)

OurFirstLSTM(
  (embedding): Embedding(20002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

In [15]:
def GPUinfo():
    import os
    import sys
    a  = os.popen('nvidia-smi').readlines()
    mem = a[9][33:54]
    util = a[9][58:66]
    sys.stdout.write(f"\rGPU Memory: {mem} Utilization: {util}")
    sys.stdout.flush()

In [16]:
import time

optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    for epoch in range(1, epochs + 1):
        start = time.time()
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            GPUinfo()
            predict = model(batch.tweet)
            GPUinfo()
            loss = criterion(predict,batch.label)
            GPUinfo()
            loss.backward()
            GPUinfo()
            optimizer.step()
            GPUinfo()
            training_loss += loss.data.item() * batch.tweet.size(0)
        training_loss /= len(train_iterator)
 
        
        model.eval()
        for batch_idx,batch in enumerate(valid_iterator):
            predict = model(batch.tweet)
            GPUinfo()
            loss = criterion(predict,batch.label)
            GPUinfo()
            valid_loss += loss.data.item() * batch.tweet.size(0)
 
        valid_loss /= len(valid_iterator)
        print('\nEpoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, time: {:.3f}'.format(epoch, training_loss, valid_loss, time.time() - start))

In [17]:
train(5, model, optimizer, criterion, train_iterator, valid_iterator)

GPU Memory:    1081MiB / 15079MiB Utilization:     0%  
Epoch: 1, Training Loss: 23.43, Validation Loss: 12.52, time: 160.796
GPU Memory:    1081MiB / 15079MiB Utilization:     0%  
Epoch: 2, Training Loss: 20.58, Validation Loss: 12.71, time: 159.150
GPU Memory:    1081MiB / 15079MiB Utilization:     0%  
Epoch: 3, Training Loss: 17.65, Validation Loss: 14.75, time: 156.302
GPU Memory:    1081MiB / 15079MiB Utilization:     0%  
Epoch: 4, Training Loss: 15.78, Validation Loss: 14.44, time: 154.167
GPU Memory:    1081MiB / 15079MiB Utilization:     0%  
Epoch: 5, Training Loss: 14.44, Validation Loss: 15.37, time: 152.640


## Making predictions

In [18]:
def classify_tweet(tweet):
    categories = {0: "Negative", 1:"Positive"}
    processed = TWEET.process([TWEET.preprocess(tweet)])
    processed = processed.to(device)
    return categories[model(processed).argmax().item()]

## Data Augmentation

In [19]:
def random_deletion(words, p=0.5):
    if len(words) == 1:
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    if len(remaining) == 0:
        return [random.choice(words)]
    else:
        return remaining

In [20]:
def random_swap(sentence, n=5):
    length = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [21]:
# Note: you'll have to define remove_stopwords() and get_synonyms()

def remove_stopwords():
    pass

def get_synonyms():
    pass

def random_insertion(sentence,n):
    words = remove_stopwords(sentence)
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym)
    return sentence

In [22]:
import googletrans
import random

translator = googletrans.Translator()

sentences = ['The cat sat on the mat']

translations_fr = translator.translate(sentences, dest='fr')
fr_text = [t.text for t in translations_fr] 
translations_en = translator.translate(fr_text, dest='en')
en_text = [t.text for t in translations_en]
print(en_text)   

available_langs = list(googletrans.LANGUAGES.keys())
tr_lang = random.choice(available_langs)
print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")

translations = translator.translate(sentences, dest=tr_lang)
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')
en_text = [t.text for t in translations_en_random]
print(en_text)

['The cat sat on the carpet']
Translating to romanian
['Pisica stătea pe saltea']
['The cat was sitting on the mattress']
