## Requirements 

In [1]:
#!pip install torch
#!pip install torchtext==0.5.0
#%matplotlib inline


Text Classification
=================


Load data with ngrams
---------------------

A bag of ngrams feature is applied to capture some partial information
about the local word order. In practice, bi-gram or tri-gram are applied
to provide more benefits as word groups than only one word. An example:

::

   "load data with ngrams"
   Bi-grams results: "load data", "data with", "with ngrams"
   Tri-grams results: "load data with", "data with ngrams"

``TextClassification`` Dataset supports the ngrams method. By setting
ngrams to 2, the example text in the dataset will be a list of single
words plus bi-grams string.




In [1]:
import torch
import torchtext

import pandas as pd

import time
from torch.utils.data.dataset import random_split

import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

from torchcontrib.optim import SWA

#from torchtext.datasets import text_classification
NGRAMS = 1
import os
if not os.path.isdir('./.data_sentiment'):
    os.mkdir('./.data_sentiment')

#DBpedia
train_dataset, test_dataset = torchtext.datasets.YelpReviewFull(
    root='./.data_sentiment', ngrams=NGRAMS, vocab=None)

#train_dataset, test_dataset = torchtext.datasets.AG_NEWS(
#    root='./.data_sentiment', ngrams=NGRAMS, vocab=None)

BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

650000lines [01:21, 8015.15lines/s]
650000lines [02:30, 4328.20lines/s]
50000lines [00:11, 4170.13lines/s]


Define the model
----------------

In [2]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

Initiate an instance
--------------------

The AG_NEWS dataset has four labels and therefore the number of classes
is four.

   1 : World
   2 : Sports
   3 : Business
   4 : Sci/Tec

The vocab size is equal to the length of vocab (including single word
and ngrams). The number of classes is equal to the number of labels,
which is four in AG_NEWS case.

In [3]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
type(train_dataset)

torchtext.datasets.text_classification.TextClassificationDataset

Functions used to generate batch
--------------------------------




Since the text entries have different lengths, a custom function
generate_batch() is used to generate data batches and offsets. The
function is passed to ``collate_fn`` in ``torch.utils.data.DataLoader``.
The input to ``collate_fn`` is a list of tensors with the size of
batch_size, and the ``collate_fn`` function packs them into a
mini-batch.

The text entries in the original data batch input are packed into a list
and concatenated as a single tensor as the input of ``nn.EmbeddingBag``.
The offsets is a tensor of delimiters to represent the beginning index
of the individual sequence in the text tensor. Label is a tensor saving
the labels of individual text entries.

In [4]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

Define functions to train the model and evaluate results.
---------------------------------------------------------




We use ``DataLoader`` here to load AG_NEWS datasets and send it to the
model for training/validation.




In [5]:
from torch.utils.data import DataLoader

def train_func(sub_train_,model,optimizer,scheduler,criterion):

    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()
    if(scheduler != None):
        scheduler.step()
    
    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_,model,optimizer,criterion):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

Split the dataset and run the model
-----------------------------------

Since the original AG_NEWS has no valid dataset, we split the training
dataset into train/valid sets with a split ratio of 0.95 (train) and
0.05 (valid). 

CrossEntropyLoss criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class.
It is useful when training a classification problem with C classes.

In [6]:
def run_optimizer(model,optimizer,name_file = "new",scheduler=None,N_EPOCHS=10):
    
    min_valid_loss = float('inf')
    
    criterion = torch.nn.CrossEntropyLoss().to(device)

    train_len = int(len(train_dataset) * 0.95)
    sub_train_, sub_valid_ = random_split(train_dataset, [train_len, len(train_dataset) - train_len])

    train_loss_tab = []
    train_acc_tab = []
    valid_loss_tab = []
    valid_acc_tab = []

    for epoch in range(N_EPOCHS):

        start_time = time.time()
        train_loss, train_acc = train_func(sub_train_,model,optimizer,scheduler,criterion)
        valid_loss, valid_acc = test(sub_valid_,model,optimizer,criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
        
        train_loss_tab.append(train_loss)
        train_acc_tab.append(train_acc)
        valid_loss_tab.append(float(valid_loss))
        valid_acc_tab.append(valid_acc)

    
    dict_data = {
        'train_loss_tab':train_loss_tab,
        'train_acc_tab':train_acc_tab,
        'valid_loss_tab':valid_loss_tab,
        'valid_acc_tab':valid_acc_tab
    }

    df = pd.DataFrame(dict_data, columns= dict_data.keys())
    print(df)
    
    path = "D:\\dossier important 2020\\swa_gaussian-master\\optimizer_results\\" + name_file + ".csv"
    
    df.to_csv (path, index = False, header=True)
        
    return train_loss_tab,train_acc_tab, valid_loss_tab, valid_acc_tab

### Stochastic Gradient Descent

SGD implements stochastic gradient descent method as optimizer. The initial
learning rate is set to 4.0. 

StepLR is used here to adjust the learning rate through epochs.

In [7]:
name_file = "SGD_YelpReviewFull"
model_SGD = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
optimizer_SGD = torch.optim.SGD(model_SGD.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer_SGD, 1, gamma=0.9)
train_loss_tab, train_acc_tab, valid_loss_tab, valid_acc_tab= run_optimizer(model_SGD,optimizer_SGD,name_file,scheduler)

Epoch: 1  | time in 2 minutes, 16 seconds
	Loss: 0.0704(train)	|	Acc: 51.5%(train)
	Loss: 0.0001(valid)	|	Acc: 55.6%(valid)
Epoch: 2  | time in 2 minutes, 14 seconds
	Loss: 0.0653(train)	|	Acc: 55.1%(train)
	Loss: 0.0001(valid)	|	Acc: 52.4%(valid)
Epoch: 3  | time in 2 minutes, 15 seconds
	Loss: 0.0635(train)	|	Acc: 56.4%(train)
	Loss: 0.0000(valid)	|	Acc: 57.5%(valid)
Epoch: 4  | time in 2 minutes, 14 seconds
	Loss: 0.0624(train)	|	Acc: 57.2%(train)
	Loss: 0.0000(valid)	|	Acc: 58.0%(valid)
Epoch: 5  | time in 2 minutes, 16 seconds
	Loss: 0.0613(train)	|	Acc: 58.1%(train)
	Loss: 0.0000(valid)	|	Acc: 57.0%(valid)
Epoch: 6  | time in 2 minutes, 13 seconds
	Loss: 0.0606(train)	|	Acc: 58.6%(train)
	Loss: 0.0000(valid)	|	Acc: 56.2%(valid)
Epoch: 7  | time in 2 minutes, 9 seconds
	Loss: 0.0599(train)	|	Acc: 59.2%(train)
	Loss: 0.0000(valid)	|	Acc: 58.6%(valid)
Epoch: 8  | time in 2 minutes, 7 seconds
	Loss: 0.0593(train)	|	Acc: 59.6%(train)
	Loss: 0.0000(valid)	|	Acc: 58.2%(valid)
Epoch: 9  

ValueError: not enough values to unpack (expected 6, got 4)

### Adagrad

In [8]:
name_file = "Adagrad_YelpReviewFull"
model_Adagrad = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)
optimizer_Adagrad = torch.optim.Adagrad(model_Adagrad.parameters(), lr=4.0)
train_loss_tab, train_acc_tab, valid_loss_tab, valid_acc_tab = run_optimizer(model_Adagrad,optimizer_Adagrad,name_file)

Epoch: 1  | time in 3 minutes, 51 seconds
	Loss: 0.0691(train)	|	Acc: 55.7%(train)
	Loss: 0.0000(valid)	|	Acc: 58.2%(valid)
Epoch: 2  | time in 3 minutes, 42 seconds
	Loss: 0.0543(train)	|	Acc: 64.8%(train)
	Loss: 0.0000(valid)	|	Acc: 58.3%(valid)
Epoch: 3  | time in 3 minutes, 34 seconds
	Loss: 0.0483(train)	|	Acc: 69.9%(train)
	Loss: 0.0000(valid)	|	Acc: 57.3%(valid)
Epoch: 4  | time in 3 minutes, 33 seconds
	Loss: 0.0434(train)	|	Acc: 73.7%(train)
	Loss: 0.0000(valid)	|	Acc: 56.4%(valid)
Epoch: 5  | time in 3 minutes, 34 seconds
	Loss: 0.0399(train)	|	Acc: 76.3%(train)
	Loss: 0.0000(valid)	|	Acc: 54.8%(valid)
Epoch: 6  | time in 3 minutes, 35 seconds
	Loss: 0.0373(train)	|	Acc: 78.0%(train)
	Loss: 0.0000(valid)	|	Acc: 54.6%(valid)
Epoch: 7  | time in 3 minutes, 39 seconds
	Loss: 0.0354(train)	|	Acc: 79.2%(train)
	Loss: 0.0001(valid)	|	Acc: 54.2%(valid)
Epoch: 8  | time in 3 minutes, 37 seconds
	Loss: 0.0339(train)	|	Acc: 80.1%(train)
	Loss: 0.0000(valid)	|	Acc: 53.4%(valid)
Epoch: 9

### CosineAnnealingLR

In [9]:
name_file = "SGD_scheduler_COSINE_YelpReviewFull"
model_SGD = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

optimizer_SGD = torch.optim.SGD(model_SGD.parameters(), lr=4.0)

scheduler_COSINE = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_SGD, T_max=10)

train_loss_tab, train_acc_tab, valid_loss_tab, valid_acc_tab = run_optimizer(model_SGD,optimizer_SGD,name_file,scheduler_COSINE)

Epoch: 1  | time in 2 minutes, 14 seconds
	Loss: 0.0704(train)	|	Acc: 51.5%(train)
	Loss: 0.0001(valid)	|	Acc: 53.4%(valid)
Epoch: 2  | time in 2 minutes, 15 seconds
	Loss: 0.0661(train)	|	Acc: 54.6%(train)
	Loss: 0.0001(valid)	|	Acc: 56.2%(valid)
Epoch: 3  | time in 2 minutes, 11 seconds
	Loss: 0.0643(train)	|	Acc: 55.9%(train)
	Loss: 0.0001(valid)	|	Acc: 57.4%(valid)
Epoch: 4  | time in 2 minutes, 11 seconds
	Loss: 0.0628(train)	|	Acc: 57.0%(train)
	Loss: 0.0001(valid)	|	Acc: 57.7%(valid)
Epoch: 5  | time in 2 minutes, 8 seconds
	Loss: 0.0613(train)	|	Acc: 58.1%(train)
	Loss: 0.0001(valid)	|	Acc: 58.2%(valid)
Epoch: 6  | time in 2 minutes, 7 seconds
	Loss: 0.0599(train)	|	Acc: 59.3%(train)
	Loss: 0.0001(valid)	|	Acc: 59.3%(valid)
Epoch: 7  | time in 2 minutes, 7 seconds
	Loss: 0.0586(train)	|	Acc: 60.3%(train)
	Loss: 0.0001(valid)	|	Acc: 56.5%(valid)
Epoch: 8  | time in 2 minutes, 13 seconds
	Loss: 0.0576(train)	|	Acc: 61.2%(train)
	Loss: 0.0001(valid)	|	Acc: 57.1%(valid)
Epoch: 9  |

## SWA

In [10]:
name_file = "SWA_YelpReviewFull"
model_SWA = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

base_opt = torch.optim.SGD(model_SWA.parameters(), lr=4.0)
optimizer_SWA = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

train_loss_tab, train_acc_tab, valid_loss_tab, valid_acc_tab = run_optimizer(model_SWA,optimizer_SWA,name_file)

Epoch: 1  | time in 3 minutes, 4 seconds
	Loss: 0.0855(train)	|	Acc: 42.1%(train)
	Loss: 0.0001(valid)	|	Acc: 49.9%(valid)
Epoch: 2  | time in 3 minutes, 18 seconds
	Loss: 0.0715(train)	|	Acc: 52.1%(train)
	Loss: 0.0001(valid)	|	Acc: 54.7%(valid)
Epoch: 3  | time in 3 minutes, 7 seconds
	Loss: 0.0674(train)	|	Acc: 54.8%(train)
	Loss: 0.0001(valid)	|	Acc: 56.5%(valid)
Epoch: 4  | time in 3 minutes, 10 seconds
	Loss: 0.0653(train)	|	Acc: 56.2%(train)
	Loss: 0.0001(valid)	|	Acc: 57.4%(valid)
Epoch: 5  | time in 3 minutes, 7 seconds
	Loss: 0.0641(train)	|	Acc: 57.1%(train)
	Loss: 0.0001(valid)	|	Acc: 57.8%(valid)
Epoch: 6  | time in 3 minutes, 4 seconds
	Loss: 0.0632(train)	|	Acc: 57.7%(train)
	Loss: 0.0001(valid)	|	Acc: 58.4%(valid)
Epoch: 7  | time in 3 minutes, 6 seconds
	Loss: 0.0625(train)	|	Acc: 58.1%(train)
	Loss: 0.0001(valid)	|	Acc: 58.8%(valid)
Epoch: 8  | time in 3 minutes, 4 seconds
	Loss: 0.0620(train)	|	Acc: 58.4%(train)
	Loss: 0.0001(valid)	|	Acc: 58.9%(valid)
Epoch: 9  | ti