## Inspirations: https://github.com/bentrevett/pytorch-sentiment-analysis

In [185]:
import torch
import torch.nn as nn
import torchtext.data as ttd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from datetime import datetime
from pathlib import Path
import pandas as pd

In [186]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [187]:
cd 'drive/My Drive/AppliedNLP/HW3'

[Errno 2] No such file or directory: 'drive/My Drive/AppliedNLP/HW3'
/content/drive/My Drive/AppliedNLP/HW3


### Data Preprocessing 

In [188]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [189]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9346 entries, 0 to 9345
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      9346 non-null   int64 
 1   text    9346 non-null   object
 2   target  9346 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 219.2+ KB


In [190]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3894 entries, 0 to 3893
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3894 non-null   int64 
 1   text    3894 non-null   object
dtypes: int64(1), object(1)
memory usage: 61.0+ KB


In [191]:
train.drop(['id'],axis=1,inplace= True)
test.drop(['id'],axis=1,inplace= True)

In [192]:
train.columns = ['data', 'labels']
test.columns = ['data']

In [193]:
train.to_csv('train2.csv', index=False)
test.to_csv('test2.csv', index=False)

In [194]:
TEXT = ttd.Field(
    sequential=True,
    batch_first=True,
    lower=True,
    tokenize='spacy',
    pad_first=True)

LABEL = ttd.LabelField(dtype = torch.float, batch_first=True)

#Train dataset
Train_dataset = ttd.TabularDataset(
    path='train2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT),('label', LABEL)]
)

#Test dataset
Test_dataset = ttd.TabularDataset(
    path='test2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT)]
)

In [195]:
ex = Train_dataset.examples[0]

In [196]:
type(ex)

torchtext.data.example.Example

In [197]:
ex.data

['@user',
 'she',
 'should',
 'ask',
 'a',
 'few',
 'native',
 'americans',
 'what',
 'their',
 'take',
 'on',
 'this',
 'is',
 '.']

In [198]:
ex.label

'1'

### Splitting into Train and Test

In [199]:
import random
SEED=1234
training_dataset, testing_dataset = Train_dataset.split(split_ratio=0.7,random_state = random.seed(SEED)) # default is 0.7

In [200]:
SEED=1234
training_dataset, valid_dataset = training_dataset.split(random_state = random.seed(SEED)) # default is 0.7
#training_dataset, valid_dataset = Train_dataset.split(random_state = random.seed(SEED)) # default is 0.7

In [201]:
print(f'Number of training examples: {len(training_dataset)}')
print(f'Number of validation examples: {len(valid_dataset)}')
print(f'Number of testing examples: {len(testing_dataset)}')

Number of training examples: 4579
Number of validation examples: 1963
Number of testing examples: 2804


### Using Pretrained Embeddings from Glove

In [202]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(training_dataset, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)


In [203]:
#TEXT.build_vocab(training_dataset)
LABEL.build_vocab(training_dataset)

In [204]:
vocab_text = TEXT.vocab

In [205]:
#vocab_text.stoi

In [206]:
#vocab_text.itos

In [207]:
len(vocab_text)

11155

In [208]:
vocab_label = LABEL.vocab

In [209]:
vocab_label.stoi

defaultdict(<function torchtext.vocab._default_unk_index>, {'0': 0, '1': 1})

In [210]:
vocab_label.itos

['0', '1']

In [211]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [212]:
train_iter, valid_iter = ttd.BucketIterator.splits((training_dataset,valid_dataset), 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

In [213]:
test_iter = ttd.BucketIterator(testing_dataset, 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

In [214]:
testing_iter = ttd.BucketIterator(Test_dataset, 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

In [215]:
for batch in train_iter:
  print("inputs:", batch.data, batch.data.shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([[    2,    44,  8230, 10418,     3,    24],
        [    2,    18,     5,  1007,  1007,  1007],
        [    2,    37,    32,    88,   320,     3],
        [    2,  1380,    13,     4,  7451,    11],
        [    2,    50,    19,    96,    35,    11],
        [    2,   550,    21,    10,   215,  9565],
        [    2,     2,    20,     5,    37,   594],
        [    2,     7,    15,   796,     3,   371],
        [    2,     2,    18,     5,  6411,    11],
        [    2,     2,     2,    18,     5,  1367],
        [    2,     2,  9117,   631,   220,    16],
        [    2,  1624,   128,   173,  3002,     3],
        [    2,     2,    18,     5,   593,  1089],
        [    2,     2,   718,   113,  9245,   213],
        [    2,     2, 11026,    15,     4,   243],
        [    2,     2,  8030,   124,   435,    11],
        [    2,     4,   481,   119,    33,    39],
        [    2,  1557,     5,    75,   770,    11],
        [    1,     2,   223,    86,    16,   179],
    

In [216]:
for batch in valid_iter:
  print("inputs:", batch.data, batch.data.shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([[   2,  198,   18,    5,    3],
        [   2,   18,    5,   37,  594],
        [   2,    2,  109, 1448,    0],
        [   2,   20,    5,   10,  356],
        [   2, 1608, 2864,    0, 2510],
        [   2,   44, 3506, 3542,   94],
        [   2,    2,  953,   16,  586],
        [   2,   18,    5,    0,   11],
        [   2,  157,    7,  111,    2],
        [   2,    0,   13,  593,   11],
        [   2,   20,    5,    0,  368],
        [   2,    0,   11,   11,   11],
        [   2,    2,  198,   45,   93],
        [   2,   48,   18,    5,    3],
        [   1,    2,  448,   21,  982],
        [   1,    2,    7,   15,  382],
        [   1,    2,  671, 9691,    3],
        [   1,    2,  357,   49,    0],
        [   1,    2,  348, 1867, 1531],
        [   1,    2,    0,   32,    3],
        [   1,    2, 1939,  123,  232],
        [   1,    2,   10, 1840, 3189],
        [   1,    2,   48,   33,   39],
        [   1,    2,   20,    5,  418],
        [   1,    2,  429,    0,

In [217]:
for batch in test_iter:
  print("inputs:", batch.data[0], batch.data[0].shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([   2,    2,    2,    2,    2,   14,   93,   26, 1216,  232,   11,  390,
          71,  288,  195,  172,   11, 1610,    5,   71, 1537, 2316,   48,   14,
         205,  733,  109,   68,   19,   11,   44,   15,    4,  555,   11,   29,
          88,  376,   95,  788,    4,    0,   29,   82,  124,   35, 2481,  879,
         185,  828,    9,  284,    0,   11, 2648,  284,    0,    5,   26,  174,
           0,    3,  168,   72,   11], device='cuda:0') torch.Size([65])
targets: tensor([1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1.,
        0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 0., 0., 1., 0.], device='cuda:0') shape: torch.Size([64])


## LSTM Model

In [218]:
# Define the model
class RNN(nn.Module):
  def __init__(self, n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs):
    super(RNN, self).__init__()
    self.V = n_vocab
    self.D = embed_dim
    self.M = n_hidden
    self.K = n_outputs
    self.L = n_rnnlayers

    self.embed = nn.Embedding(self.V, self.D)
    self.rnn = nn.LSTM(
        input_size=self.D,
        hidden_size=self.M,
        num_layers=self.L,
        batch_first=True)
    self.fc = nn.Linear(self.M, self.K)
  
  def forward(self, X):
    # initial hidden states
    h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
    c0 = torch.zeros(self.L, X.size(0), self.M).to(device)

    # embedding layer
    # turns word indexes into word vectors
    out = self.embed(X)

    # get RNN unit output
    out, _ = self.rnn(out, (h0, c0))

    # max pool
    out, _ = torch.max(out, 1)

    # we only want h(T) at the final time step
    out = self.fc(out)
    return out

In [219]:
model = RNN(len(vocab_text), 100, 100, 1, 1)
model.to(device)

RNN(
  (embed): Embedding(11155, 100)
  (rnn): LSTM(100, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [220]:
learning_rate = 0.01
epochs=10
# STEP 5: INSTANTIATE LOSS CLASS
criterion = nn.BCEWithLogitsLoss()

# STEP 6: INSTANTIATE OPTIMIZER CLASS

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# STEP 7: TRAIN THE MODEL

train_losses= np.zeros(epochs)
valid_losses= np.zeros(epochs)


for epoch in range(epochs):
  
  t0= datetime.now()
  train_loss=[]
  
  model.train()
  for batch in train_iter:
   

    # forward pass
    output= model(batch.data)
    batch.label = batch.label.unsqueeze(1)
    batch.label = batch.label.float()
    loss=criterion(output,batch.label)

    # set gradients to zero 
    optimizer.zero_grad()

    # backward pass
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())
  
  train_loss=np.mean(train_loss)
      
  valid_loss=[]
  model.eval()
  with torch.no_grad():
    for batch in valid_iter:
 
      # forward pass
      output= model(batch.data)
      batch.label = batch.label.unsqueeze(1)
      batch.label = batch.label.float()
      loss=criterion(output,batch.label)
      valid_loss.append(loss.item())

    valid_loss=np.mean(valid_loss)
  
  # save Losses
  train_losses[epoch]= train_loss
  valid_losses[epoch]= valid_loss
  dt= datetime.now()-t0
  print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}    Valid Loss: {valid_loss:.4f}, Duration: {dt}')

Epoch 1/10, Train Loss: 0.6286    Valid Loss: 0.5879, Duration: 0:00:00.415046
Epoch 2/10, Train Loss: 0.4566    Valid Loss: 0.5630, Duration: 0:00:00.380345
Epoch 3/10, Train Loss: 0.2359    Valid Loss: 0.6512, Duration: 0:00:00.383875
Epoch 4/10, Train Loss: 0.0866    Valid Loss: 0.8689, Duration: 0:00:00.368160
Epoch 5/10, Train Loss: 0.0313    Valid Loss: 1.2934, Duration: 0:00:00.376413
Epoch 6/10, Train Loss: 0.0134    Valid Loss: 1.3450, Duration: 0:00:00.367673
Epoch 7/10, Train Loss: 0.0132    Valid Loss: 1.3078, Duration: 0:00:00.358503
Epoch 8/10, Train Loss: 0.0055    Valid Loss: 1.4205, Duration: 0:00:00.365775
Epoch 9/10, Train Loss: 0.0037    Valid Loss: 1.5629, Duration: 0:00:00.375591
Epoch 10/10, Train Loss: 0.0027    Valid Loss: 1.5847, Duration: 0:00:00.363269


In [221]:
# Accuracy- write a function to get accuracy
# use this function to get accuracy and print accuracy
def get_accuracy(data_iter, model):
  model.eval()
  with torch.no_grad():
    correct =0 
    total =0
    
    for batch in data_iter:

      output=model(batch.data)
      _,indices = torch.max(output,dim=1)
      correct+= (batch.label==indices).sum().item()
      total += batch.label.shape[0]
    
    acc= correct/total

    return acc

In [222]:
train_acc = get_accuracy(train_iter, model)
valid_acc = get_accuracy(valid_iter, model)
test_acc = get_accuracy(test_iter ,model)
print(f'Train acc: {train_acc:.4f},\t Valid acc: {valid_acc:.4f},\t Test acc: {test_acc:.4f}')

Train acc: 0.6626,	 Valid acc: 0.6719,	 Test acc: 0.6658


In [223]:
# Write a function to get predictions

def get_predictions(test_iter, model):
  model.eval()
  with torch.no_grad():
    predictions= np.array([])
    y_test= np.array([])

    for batch in test_iter:
      
      output=model(batch.data)
      _,indices = torch.max(output,dim=1)
      predictions=np.concatenate((predictions,indices.cpu().numpy())) 
      y_test = np.concatenate((y_test,batch.label.cpu().numpy())) 
      
  return y_test, predictions

#### Exporting output to CSV

In [229]:
import spacy
nlp = spacy.load('en')

def predict_sentiment_1(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [230]:
test_predictions = []
for text in test['data']:
  test_predictions.append(predict_sentiment_1(model,text))

In [231]:
# Rounding off predictions to 0 and 1
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [137]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_LSTM_1.csv', index=False)

## LSTM Model - Bidirectional with dropout

In [232]:
# Define the LSTM model
class RNN(nn.Module):
  def __init__(self, n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs, bidirectional, dropout_rate):
    super(RNN, self).__init__()
    self.V = n_vocab
    self.D = embed_dim
    self.M = n_hidden
    self.K = n_outputs
    self.L = n_rnnlayers
    self.num_diections= bidirectional
    self.dropout_rate=dropout_rate
    
    # embedding layer
    self.embed = nn.Embedding(self.V, self.D)
    
    # rnn layers
    self.rnn = nn.LSTM(
        input_size=self.D,
        hidden_size=self.M,
        num_layers=self.L,
        bidirectional=self.num_diections,
        dropout= self.dropout_rate,
        batch_first=True)
    
    # dense layer
    self.fc = nn.Linear(self.M *2 , self.K)

    # dropout layer
    self.dropout= nn.Dropout(self.dropout_rate)
  
  def forward(self, X):
    # initial hidden states
    h0 = torch.zeros(self.L*2, X.size(0), self.M).to(device)
    c0 = torch.zeros(self.L*2, X.size(0), self.M).to(device)

    # embedding layer
    # turns word indexes into word vectors
    # X (batch_size, sentence length)
    embedding = self.embed(X)   # (batch_size, sentence_length, emd_dim)
    embedding= self.dropout(embedding) # (batch_size, sentence_length, emd_dim)

    # get RNN unit output
    output, (hidden,cell) = self.rnn(embedding, (h0, c0))


    #output = [batch size, sent len, hid dim * num directions]
    #hidden = [num layers * num directions, batch size, hid dim]
    #cell = [num layers * num directions, batch size, hid dim]

    # max pool
    output, _ = torch.max(output, 1)
    output= self.dropout(output)
    # we only want h(T) at the final time step
    output = self.fc(output)
    return output

In [233]:
n_vocab = len(TEXT.vocab)
embed_dim = 100
n_hidden = 256 
n_rnnlayers = 2
n_outputs = 1 
bidirectional = True 
dropout_rate = 0.5

In [234]:
model_LSTM = RNN(n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs, bidirectional, dropout_rate)
model_LSTM.to(device)

RNN(
  (embed): Embedding(11155, 100)
  (rnn): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [235]:
print(model_LSTM)

RNN(
  (embed): Embedding(11155, 100)
  (rnn): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [236]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([11155, 100])


#### Training Loop

In [237]:
learning_rate = 0.001
epochs=10
# STEP 5: INSTANTIATE LOSS CLASS
criterion = nn.BCEWithLogitsLoss()

# STEP 6: INSTANTIATE OPTIMIZER CLASS

optimizer = torch.optim.Adam(model_LSTM.parameters(), lr=learning_rate)

# Freeze embedding Layer

#freeze embeddings
model_LSTM.embed.weight.requires_grad  = False

# STEP 7: TRAIN THE MODEL

train_losses= np.zeros(epochs)
valid_losses= np.zeros(epochs)


for epoch in range(epochs):
  
  t0= datetime.now()
  train_loss=[]
  
  model_LSTM.train()
  for batch in train_iter:
   
    # forward pass
    output= model_LSTM(batch.data)
    batch.label = batch.label.unsqueeze(1)
    batch.label = batch.label.float()
    loss=criterion(output,batch.label)

    # set gradients to zero 
    optimizer.zero_grad()

    # backward pass
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())
  
  train_loss=np.mean(train_loss)
      
  valid_loss=[]
  model_LSTM.eval()
  with torch.no_grad():
    for batch in valid_iter:
 
      # forward pass
      output= model_LSTM(batch.data)
      batch.label = batch.label.unsqueeze(1)
      #batch.label = batch.label.float()
      loss=criterion(output,batch.label)
      
      valid_loss.append(loss.item())

    valid_loss=np.mean(valid_loss)
  
  # save Losses
  train_losses[epoch]= train_loss
  valid_losses[epoch]= valid_loss
  dt= datetime.now()-t0
  print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}    Valid Loss: {valid_loss:.4f}, Duration: {dt}')

Epoch 1/10, Train Loss: 0.6389    Valid Loss: 0.6138, Duration: 0:00:01.219322
Epoch 2/10, Train Loss: 0.6263    Valid Loss: 0.6269, Duration: 0:00:01.124877
Epoch 3/10, Train Loss: 0.6102    Valid Loss: 0.5918, Duration: 0:00:01.123280
Epoch 4/10, Train Loss: 0.5995    Valid Loss: 0.5905, Duration: 0:00:01.131024
Epoch 5/10, Train Loss: 0.5791    Valid Loss: 0.5839, Duration: 0:00:01.134628
Epoch 6/10, Train Loss: 0.5667    Valid Loss: 0.5809, Duration: 0:00:01.134667
Epoch 7/10, Train Loss: 0.5561    Valid Loss: 0.5774, Duration: 0:00:01.143747
Epoch 8/10, Train Loss: 0.5407    Valid Loss: 0.5996, Duration: 0:00:01.169770
Epoch 9/10, Train Loss: 0.5200    Valid Loss: 0.5753, Duration: 0:00:01.143727
Epoch 10/10, Train Loss: 0.5091    Valid Loss: 0.5786, Duration: 0:00:01.171825


In [239]:
# Accuracy- write a function to get accuracy
# use this function to get accuracy and print accuracy
def get_accuracy(data_iter, model):
  model.eval()
  with torch.no_grad():
    correct =0 
    total =0
    
    for batch in data_iter:

      output=model(batch.data)
      _,indices = torch.max(output,dim=1)
      correct+= (batch.label==indices).sum().item()
      total += batch.label.shape[0]
    
    acc= correct/total

    return acc

In [240]:
# Checking accuracy on Train, Validation and Test Datasets
train_acc = get_accuracy(train_iter, model_LSTM)
valid_acc = get_accuracy(valid_iter, model_LSTM)
test_acc = get_accuracy(test_iter ,model_LSTM)
print(f'Train acc: {train_acc:.4f},\t Valid acc: {valid_acc:.4f},\t Test acc: {test_acc:.4f}')

Train acc: 0.6626,	 Valid acc: 0.6719,	 Test acc: 0.6658


#### Exporting output to CSV

In [247]:
import spacy
nlp = spacy.load('en')

def predict_sentiment_1(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [248]:
test_predictions = []
for text in test['data']:
  test_predictions.append(predict_sentiment_1(model_LSTM,text))

In [249]:
# Rounding off predictions to 0 and 1
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [250]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_LSTM_BD.csv', index=False)

## Vanilla RNN

In [251]:
# Creating a Vanilla RNN Function
import torch.nn as nn

class VanillaRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim,batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        embedded = self.embedding(text)
        #print(embedded.shape)
        #embedded = [sent len, batch size, emb dim]
        output, hidden = self.rnn(embedded)
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        #assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [252]:
len(TEXT.vocab)

11155

In [253]:
print(TEXT.vocab.vectors)

tensor([[-0.6075, -0.8885, -0.3198,  ...,  0.9384,  0.3964, -0.0569],
        [ 0.4732,  0.1814,  0.2208,  ..., -2.4314, -0.5819,  1.7075],
        [-2.2917,  1.7018,  1.2485,  ...,  0.9472,  0.7102, -0.9532],
        ...,
        [ 1.3456,  1.4064,  1.9149,  ...,  1.3751, -0.0479, -0.8710],
        [ 0.3412,  0.6755,  0.1827,  ..., -0.8494,  0.8603, -0.8099],
        [-0.7106, -0.6365, -0.1531,  ..., -1.1898,  0.5334,  0.1502]])


In [254]:
embeddings = TEXT.vocab.vectors
#VanillaRNN.embedding.weight.data.copy_(embeddings)
print(embeddings.shape)

torch.Size([11155, 100])


In [255]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
learning_rate = 0.005

model_VRNN = VanillaRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [256]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model_VRNN):,} trainable parameters')

The model has 1,207,405 trainable parameters


In [258]:
import torch.optim as optim

# Defining Optimizer
optimizer = optim.Adam(model_VRNN.parameters(), lr=1e-3)

# Defining Loss Function
criterion = nn.BCEWithLogitsLoss()

# Pushing Model to GPU
model_VRNN = model_VRNN.to(device)
criterion = criterion.to(device)

In [259]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [260]:
# Defining training loop
def train_func(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.data).squeeze(1)
        #label = label.float()
        #print(len(predictions))
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [261]:
# Defining evaluation loop
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.data).squeeze(1)
            #label = label.float()
            #print(type(batch.label))
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [262]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#### Training the model

In [263]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_func(model_VRNN, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model_VRNN, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_VRNN.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.657 | Train Acc: 62.94%
	 Val. Loss: 0.660 |  Val. Acc: 61.45%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.619 | Train Acc: 67.39%
	 Val. Loss: 0.651 |  Val. Acc: 66.87%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.580 | Train Acc: 70.22%
	 Val. Loss: 0.660 |  Val. Acc: 64.10%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.569 | Train Acc: 71.64%
	 Val. Loss: 0.670 |  Val. Acc: 62.79%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.523 | Train Acc: 74.27%
	 Val. Loss: 0.694 |  Val. Acc: 64.02%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.457 | Train Acc: 78.59%
	 Val. Loss: 0.710 |  Val. Acc: 65.36%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.395 | Train Acc: 82.16%
	 Val. Loss: 0.776 |  Val. Acc: 63.65%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.327 | Train Acc: 86.38%
	 Val. Loss: 0.893 |  Val. Acc: 58.88%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.261 | Train Acc: 89.84%
	 Val. Loss: 0.987 |  Val. Acc: 60.39%
Epoch: 10 | Epoch Time: 0m 0

In [264]:
# Checking accuracy on Test Dataset
model_VRNN.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model_VRNN, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.657 | Test Acc: 66.67%


#### Exporting output to CSV

In [265]:
import spacy
nlp = spacy.load('en')

def predict_sentiment_2(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    #length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    #length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [268]:
test_predictions = []
for text in test['text']:
  test_predictions.append(predict_sentiment_2(model_VRNN,text))

In [269]:
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [270]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_VRNN.csv', index=False)

## Bidirectional RNN

#### Data Preprocessing

In [271]:
TEXT = ttd.Field(
    sequential=True,
    #batch_first=True,
    lower=True,
    tokenize='spacy',
    #pad_first=True,
    include_lengths = True)

LABEL = ttd.LabelField(dtype = torch.float)#, batch_first=True)

#Train dataset
Train_dataset = ttd.TabularDataset(
    path='train2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT),('label', LABEL)]
)

#Test dataset
Test_dataset = ttd.TabularDataset(
    path='test2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT)]
)

In [272]:
SEED=1234
training_dataset, testing_dataset = Train_dataset.split(split_ratio=0.7,random_state = random.seed(SEED)) # default is 0.7

In [273]:
import random
SEED=1234
training_dataset, valid_dataset = training_dataset.split(random_state = random.seed(SEED)) # default is 0.7

In [274]:
print(f'Number of training examples: {len(training_dataset)}')
print(f'Number of validation examples: {len(valid_dataset)}')
print(f'Number of testing examples: {len(testing_dataset)}')

Number of training examples: 4579
Number of validation examples: 1963
Number of testing examples: 2804


In [275]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(training_dataset, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)


In [276]:
#TEXT.build_vocab(training_dataset)
LABEL.build_vocab(training_dataset)

In [277]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter = ttd.BucketIterator.splits((training_dataset,valid_dataset), 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

In [278]:
test_iter = ttd.BucketIterator(testing_dataset, 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

#### Defining the Bidirectional function

In [279]:
import torch.nn as nn

class BiDRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [280]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model_BDRNN = BiDRNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [281]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model_BDRNN):,} trainable parameters')

The model has 3,426,157 trainable parameters


In [282]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([11155, 100])


In [283]:
model_BDRNN.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.2644,  0.1614,  1.9024,  ..., -0.0895,  0.0145,  0.3330],
        [ 0.8016,  0.1560,  0.7757,  ..., -1.2540, -1.0613, -0.6615],
        [-0.1094,  0.9084,  2.0204,  ...,  0.2334, -0.8108,  0.4120],
        ...,
        [ 1.0947, -0.9882,  0.3398,  ...,  2.3886, -0.1666,  0.2511],
        [-0.2642,  0.1371, -1.1057,  ..., -0.1192,  0.1426, -1.4332],
        [-0.3453,  1.8291,  0.3241,  ..., -0.0213,  0.8426, -1.3327]])

In [284]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model_BDRNN.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_BDRNN.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model_BDRNN.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1094,  0.9084,  2.0204,  ...,  0.2334, -0.8108,  0.4120],
        ...,
        [ 1.0947, -0.9882,  0.3398,  ...,  2.3886, -0.1666,  0.2511],
        [-0.2642,  0.1371, -1.1057,  ..., -0.1192,  0.1426, -1.4332],
        [-0.3453,  1.8291,  0.3241,  ..., -0.0213,  0.8426, -1.3327]])


In [285]:
import torch.optim as optim

optimizer = optim.Adam(model_BDRNN.parameters())

criterion = nn.BCEWithLogitsLoss()

model_BDRNN = model_BDRNN.to(device)
criterion = criterion.to(device)

In [286]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [287]:
def train_func2(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.data
        #print("Length of texts: ",len(text))
        #print('Lenghts of text sequences: ',len(text_lengths))
        #print("Length of labels: ",len(batch.label))
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [288]:
def evaluate2(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.data
            #print("Length of texts: ",len(text))
            #print('Lenghts of text sequences: ',len(text_lengths))
            #print("Length of labels: ",len(batch.label))
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [289]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#### Training the model

In [290]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_func2(model_BDRNN, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate2(model_BDRNN, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_BDRNN.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 0.634 | Train Acc: 67.07%
	 Val. Loss: 0.626 |  Val. Acc: 67.35%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 0.587 | Train Acc: 70.42%
	 Val. Loss: 0.561 |  Val. Acc: 69.29%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.563 | Train Acc: 71.76%
	 Val. Loss: 0.564 |  Val. Acc: 71.40%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.517 | Train Acc: 76.04%
	 Val. Loss: 0.524 |  Val. Acc: 74.73%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.478 | Train Acc: 78.66%
	 Val. Loss: 0.516 |  Val. Acc: 75.39%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.456 | Train Acc: 79.15%
	 Val. Loss: 0.530 |  Val. Acc: 75.81%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.427 | Train Acc: 80.82%
	 Val. Loss: 0.516 |  Val. Acc: 75.36%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.385 | Train Acc: 83.47%
	 Val. Loss: 0.515 |  Val. Acc: 76.69%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.356 | Train Acc: 85.07%
	 Val. Loss: 0.518 |  Val. Acc: 76.37%
Epoch: 10 | Epoch Time: 0m 1

In [291]:
model_BDRNN.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate2(model_BDRNN, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.506 | Test Acc: 77.46%


In [292]:
import spacy
nlp = spacy.load('en')

def predict_sentiment_1(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [294]:
test_predictions = []
for text in test['text']:
  test_predictions.append(predict_sentiment_1(model_BDRNN,text))

In [295]:
# Rounding off to 0 and 1
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [296]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_BiDRNN.csv', index=False)

## CNN

#### Data Preprocessing

In [320]:
TEXT = ttd.Field(
    sequential=True,
    batch_first=True,
    lower=True,
    tokenize='spacy',
    pad_first=True)
    #include_lengths = True)

LABEL = ttd.LabelField(dtype = torch.float)#, batch_first=True)

#Train dataset
Train_dataset = ttd.TabularDataset(
    path='train2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT),('label', LABEL)]
)

#Test dataset
Test_dataset = ttd.TabularDataset(
    path='test2.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT)]
)

In [321]:
SEED=1234
training_dataset, testing_dataset = Train_dataset.split(split_ratio=0.7,random_state = random.seed(SEED)) # default is 0.7

In [322]:
import random
SEED=1234
training_dataset, valid_dataset = training_dataset.split(random_state = random.seed(SEED)) # default is 0.7

In [323]:
print(f'Number of training examples: {len(training_dataset)}')
print(f'Number of validation examples: {len(valid_dataset)}')
print(f'Number of testing examples: {len(testing_dataset)}')

Number of training examples: 4579
Number of validation examples: 1963
Number of testing examples: 2804


In [324]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(training_dataset, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)


In [325]:
#TEXT.build_vocab(training_dataset)
LABEL.build_vocab(training_dataset)

In [326]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter = ttd.BucketIterator.splits((training_dataset,valid_dataset), 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

In [327]:
test_iter = ttd.BucketIterator(testing_dataset, 
                              sort_key=lambda x: len(x.data),
                              #sort_key=None,
                              sort_within_batch = True,
                              batch_size=64, 
                              device=device)

#### Creating a CNN function

In [328]:
import torch.nn.functional as F
class CNN_model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [329]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,2,2]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model_CNN = CNN_model(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [330]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model_CNN):,} trainable parameters')

The model has 1,176,101 trainable parameters


In [331]:
pretrained_embeddings = TEXT.vocab.vectors

model_CNN.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.1318, -0.3035,  1.0498,  ..., -0.1555, -0.5055,  0.1247],
        [-0.4578,  0.8600,  0.8540,  ...,  3.0938,  0.9062,  0.7151],
        [-2.7232,  0.1180, -0.5694,  ..., -0.5942,  1.4725,  0.3334],
        ...,
        [ 0.6199, -0.3344,  0.4038,  ..., -0.7188, -1.0255,  2.5693],
        [-0.0091,  1.5333,  0.7273,  ...,  2.3677,  0.1274, -0.9346],
        [ 1.0803,  0.3010, -0.9273,  ..., -0.2327,  0.4826,  0.1647]])

In [332]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model_CNN.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model_CNN.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [333]:
import torch.optim as optim

optimizer = optim.Adam(model_CNN.parameters(),lr=0.001)
#optimizer = optim.SGD(model_CNN.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()

model_CNN = model_CNN.to(device)
criterion = criterion.to(device)

In [334]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [335]:
def train_func3(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.data).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [336]:
def evaluate3(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.data).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [337]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#### Training the Model

In [338]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_func3(model_CNN, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate3(model_CNN, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_CNN.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.623 | Train Acc: 66.86%
	 Val. Loss: 0.569 |  Val. Acc: 70.29%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.519 | Train Acc: 75.64%
	 Val. Loss: 0.535 |  Val. Acc: 74.80%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.459 | Train Acc: 79.41%
	 Val. Loss: 0.495 |  Val. Acc: 77.05%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.405 | Train Acc: 82.29%
	 Val. Loss: 0.490 |  Val. Acc: 77.12%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.356 | Train Acc: 84.46%
	 Val. Loss: 0.515 |  Val. Acc: 77.30%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.313 | Train Acc: 87.02%
	 Val. Loss: 0.509 |  Val. Acc: 76.64%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.258 | Train Acc: 89.29%
	 Val. Loss: 0.527 |  Val. Acc: 76.59%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.221 | Train Acc: 91.69%
	 Val. Loss: 0.550 |  Val. Acc: 76.64%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.180 | Train Acc: 93.61%
	 Val. Loss: 0.567 |  Val. Acc: 76.19%
Epoch: 10 | Epoch Time: 0m 0

In [339]:
model_CNN.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate3(model_CNN, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.475 | Test Acc: 78.42%


#### Making final predictions

In [340]:
import spacy
nlp = spacy.load('en')
model_CNN.load_state_dict(torch.load('tut4-model.pt'))

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [341]:
test_predictions = []
for text in test['text']:
  test_predictions.append(predict_sentiment(model_CNN,text))

In [342]:
print(test_predictions)

[0.465151846408844, 0.9686873555183411, 0.09507963806390762, 0.48898977041244507, 0.6569994688034058, 0.09669846296310425, 0.4130823016166687, 0.24770590662956238, 0.09016075730323792, 0.11028182506561279, 0.41365671157836914, 0.15479980409145355, 0.9080560207366943, 0.09659378975629807, 0.21090684831142426, 0.6286602020263672, 0.27938905358314514, 0.9763802289962769, 0.8842927813529968, 0.10443146526813507, 0.14120352268218994, 0.19857291877269745, 0.1382574588060379, 0.26858946681022644, 0.07057629525661469, 0.8856698870658875, 0.1269083172082901, 0.06816764920949936, 0.0811086893081665, 0.9583178162574768, 0.37876102328300476, 0.9146126508712769, 0.11199995130300522, 0.08443864434957504, 0.37352806329727173, 0.326129674911499, 0.5559449195861816, 0.21375715732574463, 0.0673597902059555, 0.5215111374855042, 0.2901114821434021, 0.1271006017923355, 0.11918247491121292, 0.6307768225669861, 0.24005091190338135, 0.44708436727523804, 0.3641147315502167, 0.6722557544708252, 0.05451918020844

In [343]:
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [344]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_CNN.csv', index=False)

#### Freezing weight embeddings and training the model

In [345]:
N_EPOCHS = 10
FREEZING = 10

best_valid_loss = float('inf')
#freeze embeddings
model_CNN.embedding.weight.requires_grad = unfrozen = False

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_func3(model_CNN, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate3(model_CNN, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_CNN.state_dict(), 'tut5-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    if (epoch + 1) >= FREEZING:
        #unfreeze embeddings
        model_CNN.embedding.weight.requires_grad = unfrozen = True

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.363 | Train Acc: 84.13%
	 Val. Loss: 0.506 |  Val. Acc: 76.64%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.331 | Train Acc: 86.00%
	 Val. Loss: 0.510 |  Val. Acc: 76.77%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.325 | Train Acc: 86.09%
	 Val. Loss: 0.520 |  Val. Acc: 76.80%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.314 | Train Acc: 86.74%
	 Val. Loss: 0.519 |  Val. Acc: 76.62%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.285 | Train Acc: 88.26%
	 Val. Loss: 0.524 |  Val. Acc: 76.42%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.270 | Train Acc: 89.09%
	 Val. Loss: 0.536 |  Val. Acc: 75.66%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.255 | Train Acc: 89.06%
	 Val. Loss: 0.539 |  Val. Acc: 75.91%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.242 | Train Acc: 90.39%
	 Val. Loss: 0.552 |  Val. Acc: 76.67%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.231 | Train Acc: 90.64%
	 Val. Loss: 0.559 |  Val. Acc: 76.57%
Epoch: 10 | Epoch Time: 0m 0

In [346]:
model_CNN.load_state_dict(torch.load('tut5-model.pt'))

test_loss, test_acc = evaluate3(model_CNN, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.488 | Test Acc: 78.12%


#### Making final predictions

In [347]:
import spacy
nlp = spacy.load('en')
model_CNN.load_state_dict(torch.load('tut5-model.pt'))

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [348]:
test_predictions = []
for text in test['text']:
  test_predictions.append(predict_sentiment(model_CNN,text))

In [349]:
print(test_predictions)

[0.5283995270729065, 0.9746401906013489, 0.06333830207586288, 0.5737374424934387, 0.7506172060966492, 0.07446950674057007, 0.3572539985179901, 0.24838243424892426, 0.13165640830993652, 0.06532054394483566, 0.4001598358154297, 0.08954495191574097, 0.9209583401679993, 0.06060301885008812, 0.1508021056652069, 0.5595267415046692, 0.2201589196920395, 0.9784345626831055, 0.9113001227378845, 0.09412188082933426, 0.09148187935352325, 0.11780577152967453, 0.07467789202928543, 0.29299986362457275, 0.06749706715345383, 0.8941499590873718, 0.07728300243616104, 0.045582789927721024, 0.04353645071387291, 0.968582034111023, 0.33230239152908325, 0.9319546818733215, 0.08120282739400864, 0.041865747421979904, 0.30544084310531616, 0.2326241433620453, 0.5081981420516968, 0.14489521086215973, 0.07469717413187027, 0.493653804063797, 0.27219727635383606, 0.0871315598487854, 0.08838817477226257, 0.6908309459686279, 0.1719997376203537, 0.348044216632843, 0.28583860397338867, 0.5825009346008301, 0.0335493758320

In [351]:
test_predictions_rounded = [round(pred) for pred in test_predictions]

In [352]:
test = pd.read_csv("test.csv", encoding='ISO-8859-1')
test['Target'] = test_predictions_rounded
test.to_csv('Predictions_CNN_FW.csv', index=False)

## We observe that the initial CNN model gave us the best results on test dataset with 78.42% accuracy and hence we will choose that for making final predictions, which will be implemented in the final file.