Exam variant 2, question 8

# 8. Why do we use Cross Entropy Loss for training a Language Model? 

Cross Entropy Loss measures the performance of a classification model whose output is a probability value between 0 and 1. Since Language Model is a probability distribution of sequences of words using for binary and multiclass classification, we use Cross Entropy Loss.

# Exam

In [0]:
import pandas as pd
import zipfile
import io
import numpy as np
import torch as tt
import torch.nn as nn
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator
import nltk
import torch.nn.functional as F
from tqdm import tqdm
from sklearn import metrics

In [118]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
np.random.seed(42)

In [0]:
zf = zipfile.ZipFile('exam_data.zip', "r")
zf.extractall()

In [0]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

from spacy.symbols import ORTH

In [0]:
spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])

In [0]:
def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

In [0]:
df_train = pd.read_csv('train.csv')

In [9]:
df_train.head()

Unnamed: 0,review,title,target
0,"The staff was very friendly, the breakfast ver...",Walker Gem,5
1,Excellent service - very approachable and prof...,Excellent Service,4
2,Really a top notch place to spend a day at the...,"Good location, warm and friendly staff",5
3,"a little noisy, there was a false fire alarm a...","nice hotel,",4
4,Place had too many animals and I'm allergic to...,Experience,3


In [0]:
df_train['target'] = df_train['target'].map({5: 'pos', 4: 'neg', 3: 'neg', 2: 'neg', 1: 'neg'})

In [0]:
del df_train['title']

In [0]:
df_train.rename(columns=lambda x: x.replace('target', 'label'), inplace=True)

In [60]:
df_train.head()

Unnamed: 0,review,label
0,"The staff was very friendly, the breakfast ver...",pos
1,Excellent service - very approachable and prof...,neg
2,Really a top notch place to spend a day at the...,pos
3,"a little noisy, there was a false fire alarm a...",neg
4,Place had too many animals and I'm allergic to...,neg


In [0]:
df_train.to_csv('train_new.csv', encoding='utf-8')

In [0]:
df_test = pd.read_csv('test.csv')

In [15]:
df_test.head()

Unnamed: 0,review,title,target
0,"I am from old town, and I stayed in this hotel...",Incredible Hotel,5
1,We have been coming to the Ocean Park Inn for ...,We Love this beach front Inn,5
2,Perfect place for a quick get away. We had a q...,Love this place!,5
3,"The room was not the best however, it was good...",Good For One Night Stay...,2
4,Sous le motif d'une priode hivernale (inaccept...,Moyen,3


In [0]:
df_test['target'] = df_test['target'].map({5: 'pos', 4: 'neg', 3: 'neg', 2: 'neg', 1: 'neg'})

In [0]:
del df_test['title']

In [0]:
df_test.rename(columns=lambda x: x.replace('target', 'label'), inplace=True)

In [63]:
df_test.head()

Unnamed: 0,review,label
0,"I am from old town, and I stayed in this hotel...",pos
1,We have been coming to the Ocean Park Inn for ...,pos
2,Perfect place for a quick get away. We had a q...,pos
3,"The room was not the best however, it was good...",neg
4,Sous le motif d'une priode hivernale (inaccept...,neg


In [0]:
df_test.to_csv('test_new.csv', encoding='utf-8')

In [0]:
dataset = pd.concat([df_train, df_test])

In [66]:
dataset.head()

Unnamed: 0,review,label
0,"The staff was very friendly, the breakfast ver...",pos
1,Excellent service - very approachable and prof...,neg
2,Really a top notch place to spend a day at the...,pos
3,"a little noisy, there was a false fire alarm a...",neg
4,Place had too many animals and I'm allergic to...,neg


In [0]:
dataset.to_csv('dataset.csv', encoding='utf-8')

In [0]:
classes={
    'neg':0,
    'pos':1
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('dataset.csv', format='csv',
                         fields=[(None, None), ('review', TEXT), ('label', LABEL)], 
                         skip_header=True)

train = TabularDataset('train_new.csv', format='csv',
                         fields=[(None, None), ('review', TEXT), ('label', LABEL)], 
                         skip_header=True)

test = TabularDataset('test_new.csv', format='csv',
                         fields=[(None, None), ('review', TEXT), ('label', LABEL)], 
                         skip_header=True)

In [0]:
TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")

In [124]:
len(TEXT.vocab.itos)

7854

In [0]:
LABEL.build_vocab(dataset)

In [0]:
train, valid = train.split(0.7, stratified=True)

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        self.relu = tt.nn.ReLU(inplace=True)
        self.drop_out = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_size * len(kernels), 2)
        
    def forward(self, x):
        
        x = self.embedding(x)
        x = self.drop_out(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.relu(x) 
        x = self.fc(x)
        return x

In [0]:
device = tt.device('cuda') if tt.cuda.is_available() else tt.device('cpu')

In [0]:
model = MyModel(len(TEXT.vocab.itos), embed_size=100, hidden_size=128, kernels=[2,3,4,5])

In [130]:
model

MyModel(
  (embedding): Embedding(7854, 100)
  (convs): ModuleList(
    (0): Conv1d(100, 128, kernel_size=(2,), stride=(1,), padding=(5,))
    (1): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(5,))
    (2): Conv1d(100, 128, kernel_size=(4,), stride=(1,), padding=(5,))
    (3): Conv1d(100, 128, kernel_size=(5,), stride=(1,), padding=(5,))
  )
  (relu): ReLU(inplace=True)
  (drop_out): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = tt.optim.Adam(params=model.parameters())

model = model.to(device)
criterion = criterion.to(device)

In [0]:
tt.cuda.empty_cache()

batch_size = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.review))

In [133]:
epochs = 10
losses = []      
best_val_loss = 10.

acc_score = []

for n_epoch in range(epochs):
  train_losses = []
  val_losses = []
  val_targets = []
  val_pred_class = []

  progress_bar = tqdm(total=len(train_iterator.dataset), desc='Epoch {}'.format(n_epoch + 1))
              
  model.train()

  for batch in train_iterator:
    x = batch.review[0].to(device)
    y = batch.label.to(device)

    optimizer.zero_grad()

    pred = model(x)
    loss = criterion(pred, y)
    loss.backward() 
                  
    optimizer.step()
      
    train_losses.append(loss.item())
    losses.append(loss.item())
  
    progress_bar.set_postfix(train_loss = np.mean(losses[-500:]))
    progress_bar.update(x.shape[0])
          
  progress_bar.close()
              
  model.eval()

  for batch in valid_iterator:
    x = batch.review[0].to(device)
    y = batch.label
                  
    with tt.no_grad():
      val_pred = model(x)
      val_pred = val_pred.cpu()
      val_targets.append(y.numpy())
      val_pred_class.append(np.argmax(val_pred, axis=1))
      val_loss = criterion(val_pred, y)
      val_losses.append(val_loss.item())
              
  mean_val_loss = np.mean(val_losses)            
  val_targets = np.concatenate(val_targets).squeeze()
  val_pred_class = np.concatenate(val_pred_class).squeeze()

  acc = metrics.accuracy_score(val_targets, val_pred_class)
  acc_score.append(acc)
              
  print('Losses: train - {:.3f}, valid - {:.3f}'.format(np.mean(train_losses), mean_val_loss))

  print('Accuracy score - {:.3f}'.format(acc))
        
  # Early stopping:
  if mean_val_loss < best_val_loss:
    best_val_loss = mean_val_loss
  else:
    print('Early stopping')
    break

Epoch 1: 100%|██████████| 33735/33735 [00:12<00:00, 2690.39it/s, train_loss=0.555]
Epoch 2:   1%|          | 352/33735 [00:00<00:26, 1279.84it/s, train_loss=0.556]

Losses: train - 0.602, valid - 0.554
Accuracy score - 0.747


Epoch 2: 100%|██████████| 33735/33735 [00:12<00:00, 2723.99it/s, train_loss=0.505]


Losses: train - 0.518, valid - 0.646
Accuracy score - 0.746
Early stopping


In [0]:
def test_model(model, test_iterator):
  model.eval()
  
  test_targets = []
  test_pred_class = []
  
  for batch in test_iterator:
      x = batch.review[0].to(device)
      y = batch.label

      with tt.no_grad():
          test_pred = model(x)

          test_pred= test_pred.cpu()

          test_targets.append(y.numpy())
          test_pred_class.append(np.argmax(test_pred, axis=1))

  test_targets = np.concatenate(test_targets).squeeze()
  test_pred_class = np.concatenate(test_pred_class).squeeze()

  acc = metrics.accuracy_score(test_targets, test_pred_class)
  f1 = metrics.f1_score(test_targets, test_pred_class, average='binary')
  print(f'Accuracy: {acc}')
  print(f'F1 score: {f1}')

In [135]:
test_model(model, test_iterator)

Accuracy: 0.7436041083099907
F1 score: 0.7166150670794633


In [0]:
for instance in list(tqdm._instances): 
    tqdm._decr_instances(instance)