# Download dataset from Kaggle

In [None]:
import os
from google.colab import drive

drive.mount('/content/drive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/kaggle"
%cd /content/drive/My Drive/colab_data/datasets
if not os.path.isdir('sentiment-analysis-on-movie-reviews'):
  os.mkdir('sentiment-analysis-on-movie-reviews')
%cd sentiment-analysis-on-movie-reviews
!kaggle competitions download -c sentiment-analysis-on-movie-reviews
!ls

# Setup

In [None]:
# Install the `transformers` package into the runtime; -qq silences pip's output.
!pip install -qq transformers

In [None]:
# Mount Google Drive at /content/drive; ROOT_DIR below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Folder on the mounted Drive that the training file is read from.
ROOT_DIR = '/content/drive/My Drive/colab_data/datasets/sentiment-analysis-on-movie-reviews'
# Human-readable class names; presumably index i names Sentiment value i — TODO confirm against the dataset.
LABELS = ['negative','somewhat negative', 'neutral', 'somewhat positive', 'positive']

In [None]:
# Read the tab-separated training file from ROOT_DIR and preview the first rows.
train_path = os.path.join(ROOT_DIR, 'train.tsv.zip')
df = pd.read_csv(train_path, sep="\t")
df.head()

# Preprocessing

In [None]:
import transformers
from transformers import BertModel, BertTokenizer

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import pprint

# Checkpoint name passed to the tokenizer and BertModel `from_pretrained` calls.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [None]:
# Adjust plot colors if the notebook background is dark: labels, ticks and
# titles are all drawn in `default_color`.
default_color = 'white'
mpl.rcParams.update({
    'axes.labelcolor': default_color,
    'xtick.color': default_color,
    'ytick.color': default_color,
    'axes.titlecolor': default_color,
})

Reference for the available pretrained models: https://huggingface.co/transformers/pretrained_models.html?highlight=pretrained%20names

In [None]:
# Take the first five phrases as small examples for the tokenizer cells.
sample_reviews = df['Phrase'].head(5).tolist()
for review in sample_reviews:
  print(review)

In [None]:
# Load the tokenizer that belongs to the PRE_TRAINED_MODEL_NAME checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Display the maximum input sizes reported by the tokenizer.
tokenizer.max_model_input_sizes

In [None]:
# Tokenize the sample reviews with default settings and pretty-print the result.
encoded_inputs = tokenizer(sample_reviews)
pprint.pprint(encoded_inputs)

In [None]:
# Decode every id sequence back to text to inspect what the tokenizer produced.
for ids in encoded_inputs['input_ids']:
  print(tokenizer.decode(ids))

In [None]:
# Re-encode the samples with a fixed length: each review is truncated or padded
# to `max_length` tokens and the result is returned as PyTorch tensors.
encoded_inputs = tokenizer(
    sample_reviews,
    max_length=64,
    add_special_tokens=True,
    return_token_type_ids=False,
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True flag.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Decoding shows the padding that was appended to reach the fixed length.
for ids in encoded_inputs['input_ids']:
  print(tokenizer.decode(ids))

In [None]:
# Encoded length of every phrase (truncated at 512 tokens), used to choose MAX_LEN.
token_lens = [
    len(tokenizer.encode(phrase, max_length=512, truncation=True))
    for phrase in df['Phrase']
]

In [None]:
# Report the longest encoded phrase, then plot the length distribution
# restricted to the 0-100 token range.
print(max(token_lens))
ax = sns.distplot(token_lens)
ax.set_xlim([0, 100])
ax.set_xlabel('Token count');

In [None]:
# Length every review is padded/truncated to when building the datasets.
MAX_LEN = 64
# Number of reviews per batch in the DataLoaders.
BATCH_SIZE = 100

In [None]:
class ReviewDataset(Dataset):
  """Map-style dataset that yields tokenized reviews with their labels.

  Args:
    df: DataFrame providing a 'Phrase' column (review text) and a
      'Sentiment' column (class label).
    tokenizer: object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
    max_len: length every encoded review is truncated/padded to.
  """

  def __init__(self, df, tokenizer, max_len):
    self.input = df['Phrase'].to_numpy()
    self.output = df['Sentiment'].to_numpy()
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews in the dataset."""
    return len(self.input)

  def __getitem__(self, idx):
    """Encode review `idx`.

    Returns:
      dict with the raw text ('input_text'), the flattened 'input_ids' and
      'attention_mask' tensors, and the 'label' as a long tensor.
    """
    # str() keeps the tokenizer from failing on non-string cells.
    review = str(self.input[idx])
    label = self.output[idx]
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # padding='max_length' replaces the deprecated pad_to_max_length=True.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt')

    return {
      'input_text': review,
      # The tokenizer output carries a leading batch axis; flatten() drops it
      # so the DataLoader can stack samples itself.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'label': torch.tensor(label, dtype=torch.long)}

In [None]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split identical on every run, so losses are comparable between runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)

In [None]:
# Training split wrapped as a tokenizing dataset.
train_dataset = ReviewDataset(
    train_df,
    tokenizer=tokenizer,
    max_len=MAX_LEN)

# shuffle=True draws the training samples in a new order each epoch instead of
# replaying the same batch sequence.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4)

In [None]:
# Validation split: same tokenizer and length as training, batches served in
# the loader's default order.
val_dataset = ReviewDataset(val_df, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=4)

In [None]:
# Number of batches per loader; used to average the loss accumulated over an epoch.
nb_train_batchs = len(train_dataloader)
nb_val_batchs = len(val_dataloader)

# BERT Model: Fine-Tuning SequenceClassification

Reference: https://huggingface.co/transformers/training.html#pytorch

In [None]:
from torch import nn
import torch.nn.functional as F

from transformers import BertForSequenceClassification

# Number of passes over the training data in the fine-tuning loop.
EPOCHS = 1

In [None]:
# Choose the compute device once; later cells move the model and batches to it.
use_cuda = torch.cuda.is_available()
print(use_cuda)

# Always a torch.device (never a bare CUDA index), so `current_device` has the
# same type on GPU and CPU runtimes.
current_device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
  print(torch.cuda.get_device_name(current_device))

def print_(loss, mode="Training"):
  """Print `loss`, prefixed with the phase name given by `mode`."""
  prefix = mode + ":The loss calculated: "
  print(prefix, loss)

In [None]:
# Reuse the notebook-level constants so the checkpoint stays in sync with the
# tokenizer and the class count stays in sync with LABELS.
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=len(LABELS))
model.to(current_device)

In [None]:
# Fine-tuning a pretrained BERT needs a small step size: a rate such as 1e-2
# overwrites the pretrained weights. AdamW with a rate in the 2e-5 to 5e-5
# range is the commonly recommended setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Cross-entropy over raw logits and integer class labels.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Fine-tune for EPOCHS epochs; after each training pass, report the mean loss
# on the validation loader.
for epoch in range(EPOCHS):
  print("Epoch:", epoch + 1)

  # ---- training pass ----
  model.train()
  running_loss = 0.0
  for sample_batch in train_dataloader:
    input_ids = sample_batch['input_ids'].to(current_device)
    attention_masks = sample_batch['attention_mask'].to(current_device)
    labels = sample_batch['label'].to(current_device)

    optimizer.zero_grad()
    _y = model(input_ids, attention_mask=attention_masks)
    # No labels are passed to the model, so the first output element is the
    # logits tensor that loss_fn expects.
    loss = loss_fn(_y[0], labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
  print_(running_loss / nb_train_batchs)

  # ---- validation pass ----
  model.eval()
  running_loss = 0.0
  # no_grad() skips building the autograd graph: nothing is back-propagated
  # here, so this only saves memory and time.
  with torch.no_grad():
    for sample_batch in val_dataloader:
      input_ids = sample_batch['input_ids'].to(current_device)
      attention_masks = sample_batch['attention_mask'].to(current_device)
      labels = sample_batch['label'].to(current_device)

      _y = model(input_ids, attention_mask=attention_masks)
      loss = loss_fn(_y[0], labels)

      running_loss += loss.item()
  # Average over the number of *validation* batches.
  print_(running_loss / nb_val_batchs, "Evaluation")

# BERT Model: Fine-Tuning Custom Model

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    Args:
      n_classes: number of output classes.
      bert_model: encoder module exposing `config.hidden_size` whose output
        holds the pooled sentence representation at index 1.
    """

    def __init__(self, n_classes, bert_model):
      super().__init__()
      self.bert = bert_model
      self.drop = nn.Dropout(p=0.1)
      self.mlc_1 = nn.Linear(self.bert.config.hidden_size, 100)
      self.mlc_2 = nn.Linear(100, n_classes)

    def forward(self, input_ids, attention_mask):
      """Return unnormalized class scores (logits), one row per input sequence.

      The scores are deliberately not passed through softmax:
      nn.CrossEntropyLoss applies log-softmax itself. Apply
      `softmax(dim=1)` on the result when probabilities are needed.
      """
      outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask)
      # Index rather than tuple-unpack: this works for plain tuple outputs and
      # for dict-style model outputs, where unpacking would yield the keys.
      pooled_output = outputs[1]
      _y = self.drop(pooled_output)
      # Without a non-linearity the two Linear layers collapse into one.
      _y = F.relu(self.mlc_1(_y))
      return self.mlc_2(_y)

In [None]:
# Load the pretrained BERT encoder that is handed to SentimentClassifier.
ber_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Wrap the pretrained encoder in the 5-class head and move it to the selected device.
model = SentimentClassifier(n_classes=5, bert_model=ber_model)
model.to(current_device)

In [None]:
# The optimizer has to be built from the parameters of the model trained in
# this cell. An optimizer created for a different model would step that
# model's weights and leave this one unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# One training pass over the training loader; prints the mean batch loss.
model.train()
running_loss = 0.0
for sample_batch in train_dataloader:
  input_ids = sample_batch['input_ids'].to(current_device)
  attention_masks = sample_batch['attention_mask'].to(current_device)
  labels = sample_batch['label'].to(current_device)

  optimizer.zero_grad()
  # The custom model returns its class scores directly (no output tuple).
  _y = model(input_ids, attention_mask=attention_masks)
  loss = loss_fn(_y, labels)
  loss.backward()
  optimizer.step()

  running_loss += loss.item()
print(running_loss/nb_train_batchs)

# Exercises:


1.   Use the `Trainer` class from the `transformers` package (ref: https://huggingface.co/transformers/training.html#trainer)

