In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Reference:

Many thanks to Venelin Valkov for the great tutorial ([1], [2], [3]): 

[1] https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

[2] https://www.youtube.com/watch?v=Osj0Z6rwJB4

[3] https://www.youtube.com/watch?v=8N-nM3QW7O0&t=733s

https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm

https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove

https://www.kaggle.com/vishalsiram50/fine-tuning-bert-88-accuracy

https://www.analyticsvidhya.com/blog/2021/05/all-you-need-to-know-about-bert/#:~:text=%5BCLS%5D%20is%20a%20special%20classification,segment%20embeddings%20for%20each%20token.

In [None]:
!pip install transformers

In [None]:
# Load the competition files: the training tweets, the test tweets, and the
# sample submission that is filled in with predictions at the end of the notebook.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'


In [None]:
# Plot styling shared by every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 10, 6

# Seed every random number generator the notebook relies on. NumPy alone is not
# enough: dropout and the classifier-head initialisation draw from PyTorch's RNG.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Class balance of the training labels.
# The series is passed explicitly as `x=`: recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=train.target)
plt.xlabel('train target');

BERT tokenizer:
* Adds special tokens to separate sentences and to mark the sequence for classification
* Pads (or truncates) every sequence to a constant length
* Creates an array of 0s (padding tokens) and 1s (real tokens), called the attention mask

In [None]:
# Checkpoint name used below to load both the tokenizer and the BERT weights.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [None]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Tokenize one example sentence by hand: split it into tokens, then map the tokens to vocabulary ids.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

Special Tokens:

[SEP] - marker for ending of a sentence

[CLS] - we must add this token to the start of each sentence, so BERT knows we’re doing classification

[PAD] - There is also a special token for padding

BERT understands tokens that were in the training set. Everything else can be encoded using the [UNK] (unknown) token

![bert-tokens.PNG](attachment:c30e8f11-f363-4841-8858-bff8efc1d790.PNG)

(https://www.analyticsvidhya.com/blog/2021/05/all-you-need-to-know-about-bert/#:~:text=%5BCLS%5D%20is%20a%20special%20classification,segment%20embeddings%20for%20each%20token)

In [None]:
# Show each special token next to its vocabulary id: separator, classification, padding, unknown.
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)

In [None]:
# All of that work can be done in one call with the encode_plus() method
encoding = tokenizer.encode_plus(
  text=sample_txt,
  max_length=32,           # max length of sentence
  truncation=True,         # enforce max_length explicitly, as the dataset classes below do
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()

In [None]:
# The token ids are now stored in a tensor and padded to the max_length of 32 requested above
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
# The attention mask has the same length as the padded input ids
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
# Invert the tokenization (ids -> tokens) to see where the special tokens were inserted
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))

In [None]:
# Choosing the max sequence length for our data set.
# BERT works with fixed-length sequences, so measure every tweet (train and test) in tokens first.
all_tweets = np.concatenate([train.text.values, test.text.values])

token_lens = []
for txt in all_tweets:
  # truncation=True makes the 512-token cap effective; passing max_length alone
  # does not truncate and emits a warning for every tweet.
  tokens = tokenizer.encode(txt, max_length=512, truncation=True)
  token_lens.append(len(tokens))

In [None]:
# Plot the distribution of token counts per tweet.
# NOTE(review): sns.distplot is flagged as deprecated in recent seaborn releases — confirm
# the installed version still provides it, or switch to histplot/displot.
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count')

In [None]:
# Largest token count observed across the train and test tweets.
MAX_LEN = max(token_lens)
MAX_LEN

Create PyTorch dataset

In [None]:
class TweetDataset(Dataset):
  """Labelled tweets served as tokenized samples.

  Each sample is tokenized on access with ``tokenizer.encode_plus`` and
  padded/truncated to ``max_len`` tokens.
  """

  def __init__(self, text, targets, tokenizer, max_len):
    # text and targets are indexed with the same position in __getitem__.
    self.text, self.targets = text, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of tweets held by the dataset."""
    return len(self.text)

  def __getitem__(self, item):
    """Return the raw text, flattened input ids / attention mask and the label for ``item``."""
    tweet = str(self.text[item])
    label = self.targets[item]

    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(tweet, **tokenizer_options)

    # encode_plus returns tensors with a leading batch axis; flatten to 1-D per sample.
    sample = {'text': tweet}
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
# Hold out 10% of the labelled tweets for validation; the fixed random_state keeps the split repeatable.
df_train, df_val = train_test_split(
  train,
  test_size=0.1,
  random_state=RANDOM_SEED
)

# The competition test set is used as-is (alias, not a copy).
df_test = test
df_train.shape, df_val.shape, df_test.shape

In [None]:
# Number of tweets per batch for every data loader built below.
BATCH_SIZE = 16

In [None]:
def create_data_loader_train(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Build a DataLoader of labelled tweets from a DataFrame.

  Args:
    df: DataFrame with a ``text`` column and a ``target`` column.
    tokenizer: tokenizer handed to ``TweetDataset``.
    max_len: sequence length every tweet is padded/truncated to.
    batch_size: number of tweets per batch.
    shuffle: reshuffle the samples every epoch. Defaults to False, which keeps
      the previous behaviour; pass True for the training split.
    num_workers: number of loader worker processes (previously hard-coded to 2).

  Returns:
    A ``DataLoader`` over a ``TweetDataset``.
  """
  dataset = TweetDataset(
    text=df.text.to_numpy(),
    targets=df.target.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
class TweetDatasetTest(Dataset):
  """Unlabelled tweets served as tokenized samples (no ``targets`` entry)."""

  def __init__(self, text, tokenizer, max_len):
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.text = text

  def __len__(self):
    """Number of tweets held by the dataset."""
    return len(self.text)

  def __getitem__(self, item):
    """Return the raw text plus flattened input ids and attention mask for ``item``."""
    tweet = str(self.text[item])
    encoded = self.tokenizer.encode_plus(
      tweet,
      max_length=self.max_len,
      padding='max_length',
      truncation=True,
      add_special_tokens=True,
      return_attention_mask=True,
      return_token_type_ids=False,
      return_tensors='pt'
    )

    # Drop the leading batch axis added by return_tensors='pt'.
    flat = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
    return {'text': tweet, **flat}

In [None]:
def create_data_loader_test(df, tokenizer, max_len, batch_size):
  """Build a DataLoader of unlabelled tweets from the ``text`` column of ``df``."""
  tweets = df.text.to_numpy()
  unlabelled = TweetDatasetTest(text=tweets, tokenizer=tokenizer, max_len=max_len)
  loader_options = {'batch_size': batch_size, 'num_workers': 2}
  return DataLoader(unlabelled, **loader_options)

In [None]:
# Sequence length actually used for padding/truncation (chosen by hand, not the MAX_LEN measured earlier).
CUSTOM_MAX_LEN = 60
# The validation split has labels, so it goes through the same loader factory as the training split.
train_data_loader = create_data_loader_train(df_train, tokenizer, CUSTOM_MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader_train(df_val, tokenizer, CUSTOM_MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader_test(df_test, tokenizer, CUSTOM_MAX_LEN, BATCH_SIZE)

In [None]:
import warnings
# Blanket filter: silences every warning from here on, deprecation notices included.
warnings.filterwarnings("ignore")
# Pull a single batch from the training loader to inspect its structure.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Shapes of the tensors in the sample batch.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

Bert Model

In [None]:
# Plain BERT encoder, used in the next cells only to inspect its outputs on the sample sentence.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Run the sample encoding through BERT and look at what comes back:
# the last hidden state of the encoder and the pooled output.
# The last_hidden_state is the sequence of hidden states of the last layer of the model.
# The pooled_output is obtained by applying the BertPooler on last_hidden_state.

# We get one hidden state for each of the 32 tokens (the padded length of the example sequence).
# 768 is the hidden size of bert-base, i.e. bert_model.config.hidden_size printed below.

# https://huggingface.co/docs/transformers/model_doc/bert#bertmodel

# https://stackoverflow.com/questions/65132144/bertmodel-transformers-outputs-string-instead-of-tensor
output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)

print(type(output))
print(output.keys())
print(bert_model.config.hidden_size)


In [None]:
# Shape of the per-token hidden states for the sample sentence.
print(output['last_hidden_state'].shape)

In [None]:
# Shape of the pooled output for the sample sentence.
print(output['pooler_output'].shape)

Build Sentiment Classifier

In [None]:
import torch
import torch.nn as nn

# Pick the compute device once; models and batches are moved to it further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes.
    p: dropout probability applied to BERT's pooled output.
  """

  def __init__(self, n_classes, p):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p)
    # Linear head mapping the pooled BERT representation to one score per class.
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Kept so callers can turn logits into probabilities (model.softmax(logits)).
    # It is deliberately NOT applied inside forward(), see below.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return the raw class scores (logits), one row per input sequence.

    The scores are left unnormalised because the notebook trains with
    nn.CrossEntropyLoss, which applies log-softmax itself. Applying Softmax
    here as well squashes the scores into [0, 1] before the loss sees them,
    which flattens the gradients. The arg-max, and therefore the predicted
    class, is the same for logits and probabilities.
    """
    output_dict = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    pooled = self.drop(output_dict['pooler_output'])
    return self.out(pooled)

In [None]:
# Two output classes (matching the 0/1 target column) and dropout probability 0.2.
N_CLASSES = 2
P = 0.2
model = SentimentClassifier(N_CLASSES, P)
# Move the model's weights to the device selected above.
model = model.to(device)

In [None]:
# `data` is the batch fetched earlier with next(iter(train_data_loader))
print(len(data['text']))
# the batch consists of 16 tweets (BATCH_SIZE)

In [None]:
# Move the sample batch to the same device as the model.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

In [None]:
# Shapes of the tensors about to be fed to the classifier.
print(input_ids.shape)
print(attention_mask.shape)

In [None]:
# Feed the input ids and attention mask to the sentiment classifier.
# This returns one score per class (0 or 1) for each tweet in the batch.
model(input_ids, attention_mask)

Train Model 

To reproduce the training procedure from the BERT paper, we’ll use the AdamW optimizer provided by Hugging Face.

It applies weight decay correctly (decoupled from the gradient update), which keeps the setup close to the original paper. We’ll also use a linear learning-rate scheduler with no warmup steps.

In [None]:
# Number of passes over the training set.
# NOTE(review): the BERT authors' recommendation quoted in this notebook is 2-4 epochs — confirm 10 is intentional.
EPOCHS = 10
# NOTE(review): correct_bias=False presumably mirrors the original BERT optimizer — confirm against the transformers AdamW docs.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch (see train_epoch), so the schedule spans batches-per-epoch * epochs steps.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
# nn.CrossEntropyLoss applies log-softmax internally, so it expects raw (unnormalised) scores from the model.
loss_fn = nn.CrossEntropyLoss().to(device)

How do we come up with all hyperparameters? The BERT authors have some recommendations for fine-tuning:

* Batch size: 16, 32
* Learning rate (Adam): 5e-5, 3e-5, 2e-5
* Number of epochs: 2, 3, 4

In [None]:
def train_epoch(
      model,
      data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      n_examples
):
      """Run one training pass over ``data_loader``, updating ``model`` in place.

      Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
          and ``targets`` tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: number of samples used as the accuracy denominator.

      Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor on
        ``device``; mean_loss is the average of the per-batch losses, or NaN
        when ``data_loader`` yields no batches.
      """
      model = model.train()
      losses = []
      # Start from a tensor rather than the int 0, so the final .double() also
      # works when the loader is empty.
      correct_predictions = torch.zeros((), dtype=torch.long, device=device)

      for data in data_loader:
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            targets = data["targets"].to(device)

            # Forward pass.
            outputs = model(
              input_ids=input_ids,
              attention_mask=attention_mask
            )

            # Predicted class = index of the largest score in each row.
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            # Back-propagate, clipping the gradient norm to guard against exploding gradients.
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Update the parameters, advance the schedule, then reset gradients for the next batch.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

      # np.mean([]) would warn and return NaN; return NaN explicitly instead.
      mean_loss = np.mean(losses) if losses else float('nan')
      return correct_predictions.double() / n_examples, mean_loss

Training the model should look familiar, except for two things.

The scheduler is stepped every time a batch is fed to the model, not once per epoch.

We avoid exploding gradients by clipping the gradient norm of the model’s parameters with `clip_grad_norm_`.

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
    device: device the batch tensors are moved to.
    n_examples: number of samples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor on ``device``;
    mean_loss is the average of the per-batch losses, or NaN when
    ``data_loader`` yields no batches.
  """
  model = model.eval()
  losses = []
  # Start from a tensor rather than the int 0, so the final .double() also works
  # when the loader is empty.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  # No gradients are needed for evaluation.
  with torch.no_grad():
    for data in data_loader:
      input_ids = data["input_ids"].to(device)
      attention_mask = data["attention_mask"].to(device)
      targets = data["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      # Predicted class = index of the largest score in each row.
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  # np.mean([]) would warn and return NaN; return NaN explicitly instead.
  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time
# Full training run: one train pass and one validation pass per epoch.
# Metrics are collected in `history`; the weights with the best validation
# accuracy seen so far are written to best_model_state.bin.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    
    train_acc, train_loss = train_epoch(
                            model,
                            train_data_loader,
                            loss_fn,
                            optimizer,
                            device,
                            scheduler,
                            len(df_train)
                        )
    
    print(f'Train loss {train_loss} accuracy {train_acc}')
    
    val_acc, val_loss = eval_model(
                            model,
                            val_data_loader,
                            loss_fn,
                            device,
                            len(df_val)
                        )
    
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # Checkpoint only when validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

In [None]:
# The accuracies in `history` are 0-dim tensors that live on the training device.
# matplotlib cannot convert CUDA tensors to NumPy, so turn them into plain floats first.
plt.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
plt.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

Predictions

In [None]:
# Pull a single batch from the test loader and show its raw tweets.
test_data = next(iter(test_data_loader))
print(test_data['text'])

In [None]:
# Token ids of the sample test batch.
print(test_data['input_ids'])

In [None]:
# Attention mask of the sample test batch.
print(test_data['attention_mask'])

In [None]:
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` in eval mode and collect its outputs.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches with ``text``, ``input_ids`` and
      ``attention_mask`` entries.

  Returns:
    ``(tweets, predictions, prediction_probs)``:
      tweets: list of the raw texts, in loader order.
      predictions: 1-D CPU tensor with the arg-max class index per tweet.
      prediction_probs: 2-D CPU tensor with the per-class outputs exactly as
        returned by ``model``, one row per tweet.
    Empty tensors are returned when the loader yields no batches.
  """
  model = model.eval()

  # Send the inputs to wherever the model's weights live instead of relying on
  # a notebook-global `device`.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  tweets = []
  pred_batches = []
  output_batches = []

  with torch.no_grad():
    for data in data_loader:
      input_ids = data["input_ids"].to(model_device)
      attention_mask = data["attention_mask"].to(model_device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      _, preds = torch.max(outputs, dim=1)
      tweets.extend(data["text"])
      # Move whole batches to the CPU right away so results do not pile up on the GPU.
      pred_batches.append(preds.cpu())
      output_batches.append(outputs.cpu())

  # torch.cat / torch.stack reject an empty list, so handle the no-batch case explicitly.
  if not pred_batches:
    return tweets, torch.empty(0, dtype=torch.long), torch.empty(0, 0)

  return tweets, torch.cat(pred_batches), torch.cat(output_batches)

In [None]:
# Predict with the checkpoint that scored best on the validation set (written by
# the training loop) rather than with whatever weights the final epoch left behind.
if os.path.exists('best_model_state.bin'):
    model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
_, y_pred, _ = get_predictions(model, test_data_loader)

In [None]:
# Number of predictions produced for the test set.
print(len(y_pred.tolist()))

In [None]:
# Write the predicted class of every test tweet into the submission template.
submission['target'] = y_pred.tolist()
submission.head()

In [None]:
# Save the submission file to the working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)