# Section 0: Import section

In [1]:
# data science
import numpy as np

# helper functions
from helper_func_and_classes import TwitterDataset_BERT
from helper_func_and_classes import create_data_loader_BERT
from helper_func_and_classes import split_dataset
from helper_func_and_classes import create_dataset_list
from helper_func_and_classes import output_numpy_array_from_model_training


import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import random
from tqdm import tqdm


RANDOM_SEED = 123
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED);


# Section 1: Data preprocessing section
## Section 1.1: Creating lists of sentences

In [2]:
pos_data_full = create_dataset_list("./twitter-datasets/train_pos_full.txt")
pos_labels_full = [1]*len(pos_data_full)

neg_data_full = create_dataset_list("./twitter-datasets/train_neg_full.txt")
neg_labels_full = [0]*len(neg_data_full)

all_data_full = pos_data_full + neg_data_full
all_labels_full = pos_labels_full + neg_labels_full


pos_data_lite = create_dataset_list("./twitter-datasets/train_pos.txt")
pos_labels_lite = [1]*len(pos_data_lite)

neg_data_lite = create_dataset_list("./twitter-datasets/train_neg.txt")
neg_labels_lite = [0]*len(neg_data_lite)

all_data_lite = pos_data_lite + neg_data_lite
all_labels_lite = pos_labels_lite + neg_labels_lite


submission_data = create_dataset_list("./twitter-datasets/test_data.txt")


print("Length of all_data_lite: ", len(all_data_lite))
print("Length of all_labels_lite: ", len(all_labels_lite), "\n")
print("Length of all_data_full: ", len(all_data_full))
print("Length of all_labels_full: ", len(all_labels_full), "\n")


print("Length of submission_data: ",len(submission_data))

Length of all_data_lite:  200000
Length of all_labels_lite:  200000 

Length of all_data_full:  2500000
Length of all_labels_full:  2500000 

Length of submission_data:  10000


## Section 1.2: Sentence embeddings

In [3]:
# setup
max_length = 37
batch_size = 256
DATA_FULL = False

In [4]:
#PRETRAINED_MODEL_BERT = 'bert-base-cased'
PRETRAINED_MODEL_BERT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_BERT)

In [5]:
tokens = tokenizer.tokenize(pos_data_lite[0])
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [6]:
encoding = tokenizer.encode_plus(
    pos_data_lite[0],
    max_length=37,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt'  # Return PyTorch tensors
)

encoding.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['input_ids', 'attention_mask'])

In [7]:
if not DATA_FULL:
    train_samples_lite, test_samples_lite, train_labels_lite, test_labels_lite = train_test_split(
        all_data_lite, 
        all_labels_lite, 
        test_size=0.1, 
        random_state=RANDOM_SEED)
    
if DATA_FULL:
    train_data_samples_full, test_data_samples_full, train_data_labels_full, test_data_labels_full = train_test_split(
        all_data_full, 
        all_labels_full, 
        test_size=0.1, 
        random_state=RANDOM_SEED)

In [8]:
if not DATA_FULL:
    train_loader_lite = create_data_loader_BERT(
        train_samples_lite,
        train_labels_lite,
        tokenizer, 
        max_length, 
        batch_size)

    test_loader_lite = create_data_loader_BERT(
        test_samples_lite,
        test_labels_lite,
        tokenizer, 
        max_length, 
        batch_size)

if DATA_FULL:
    train_loader_full = create_data_loader_BERT(
        train_samples_full,
        train_labels_full,
        tokenizer, 
        max_length, 
        batch_size)

    test_loader_full = create_data_loader_BERT(
        test_samples_full,
        test_labels_full,
        tokenizer, 
        max_length, 
        batch_size)

In [9]:
model_bert = BertModel.from_pretrained(PRETRAINED_MODEL_BERT, return_dict=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
last_hidden_state, pooled_output = model_bert(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask']
)




In [11]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL_BERT, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [12]:
model = SentimentClassifier(2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Parameters to tune:
Batch size: 16, 32  
Learning rate (Adam): 5e-5, 3e-5, 2e-5  
Number of epochs: 2, 3, 4  

In [13]:


EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader_lite) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss()

In [14]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader):
        input_ids = d["input_ids"]
        attention_mask = d["attention_mask"]
        labels = d["labels"]
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, predicted = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(predicted == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

In [15]:
def eval_model(model, data_loader, loss_fn, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"]
            attention_mask = d["attention_mask"]
            labels = d["labels"]
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, predicted = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(predicted == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [16]:
%%time

for epoch in range(EPOCHS):
    print("Epoch: " epoch)
    train_accuracy, loss = train_epoch(
        model,
        train_loader_lite,
        loss_fn,
        optimizer,
        scheduler,
        len(train_samples_lite)
      )

    print(f'Train loss {loss} accuracy {train_accuracy}')
    test_accuracy, test_loss = eval_model(
        model,
        test_loader_lite,
        loss_fn,
        len(test_samples_lite)
      )
    print(f'Test loss {test_loss} accuracy {test_accuracy}')
    print()



Epoch 1/3
----------
1


  1%|▎                          | 9/704 [09:40<12:26:58, 64.49s/it]


KeyboardInterrupt: 

In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);