In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# Any results you write to the current directory are saved as output.

### Importing the necessary libraries

In [None]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
# from torchnlp.datasets import imdb_dataset      # --> We are using our own uploaded dataset.
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from torch.nn import functional as F
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

### Initializing seed values to stabilize the outcomes.

In [None]:
# Seed every random number generator used in this notebook with the same value
# (Python's `random`, NumPy, PyTorch on CPU and PyTorch on the current CUDA device).
for _seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(321)

## Prepare the data

In [None]:
path = '../input/imdb-50k-movie-reviews-test-your-bert/'

# Read the train and test CSV files that live in the dataset folder.
train_Data, test_Data = (pd.read_csv(f"{path}{split}.csv") for split in ('train', 'test'))

In [None]:
# Experiment on a small sample of the dataset to avoid running out of memory.
train_data = train_Data[:2000]
test_data = test_Data[:500]
# Hold out the LAST 500 training rows for validation. The previous slice
# `[:-500]` selected everything EXCEPT the last 500 rows, which both overlapped
# the 2000 training rows above and defeated the purpose of sub-sampling.
# NOTE(review): assumes train.csv has at least 2500 rows so the two slices are disjoint.
val_data = train_Data[-500:]

# Convert each frame to a list of {column: value} dicts, one per row.
train_data = train_data.to_dict(orient='records')
test_data = test_data.to_dict(orient='records')
val_data = val_data.to_dict(orient='records')

### Pairing each sentence with its label

In [None]:
# Unzip every list of row dicts into a tuple of texts and a parallel tuple of labels.
train_texts, train_labels = zip(*[(row['text'], row['sentiment']) for row in train_data])
test_texts, test_labels = zip(*[(row['text'], row['sentiment']) for row in test_data])
val_texts, val_labels = zip(*[(row['text'], row['sentiment']) for row in val_data])

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

#### visualizing one of the sentences from train set

In [None]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
def _to_bert_tokens(text):
    """Tokenize one text, keep at most 510 tokens and wrap them in [CLS] ... [SEP]."""
    return ['[CLS]'] + tokenizer.tokenize(text)[:510] + ['[SEP]']


train_tokens = [_to_bert_tokens(text) for text in train_texts]
test_tokens = [_to_bert_tokens(text) for text in test_texts]
val_tokens = [_to_bert_tokens(text) for text in val_texts]

len(train_tokens), len(test_tokens), len(val_tokens)

In [None]:
def _encode_and_pad(token_lists):
    """Map each token list to vocabulary ids, then pad/truncate at the end to length 512."""
    id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    return pad_sequences(id_lists, maxlen=512, truncating="post", padding="post", dtype="int")


train_tokens_ids = _encode_and_pad(train_tokens)
test_tokens_ids = _encode_and_pad(test_tokens)
val_tokens_ids = _encode_and_pad(val_tokens)

train_tokens_ids.shape, test_tokens_ids.shape

In [None]:
# Boolean targets: True where the sentiment label is 'pos', False otherwise.
train_y, test_y, val_y = (
    np.array(split_labels) == 'pos'
    for split_labels in (train_labels, test_labels, val_labels)
)
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

### Now building the attention masks: 1.0 for every real token and 0.0 for every padding position (token id 0), so that the model can ignore the padding in each sentence.

In [None]:
# Attention masks as nested lists of floats: 1.0 where the token id is positive,
# 0.0 where it is the padding id 0.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()
val_masks = (val_tokens_ids > 0).astype(float).tolist()

# Baseline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [None]:
# Bag-of-n-grams (unigrams to trigrams) counts fed into a logistic regression,
# fitted on the sampled training texts.
baseline_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 3)),
    LogisticRegression(),
).fit(train_texts, train_labels)

In [None]:
# Predict a sentiment label for every sampled test text with the fitted baseline pipeline.
baseline_predicted = baseline_model.predict(test_texts)

In [None]:
# Per-class precision / recall / F1 of the baseline on the sampled test set.
print(classification_report(test_labels, baseline_predicted))

#### Our baseline model works just fine and yields a fair enough score. Now it's time to get our hands dirty with BERT.

# BERT Model


### BERT stands for Bidirectional Encoder Representations from Transformers. Each word in that name carries a meaning, and we will go through them one by one in this article. For now, the key takeaway is that **BERT is based on the Transformer architecture**.

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
class BertBinaryClassifier(nn.Module):
    """Pretrained BERT encoder followed by a one-unit head for binary classification.

    The pooled BERT output goes through a fixed dropout (``dropout1``), a linear
    layer producing a single logit (``linear1``), a second dropout whose rate is
    configurable (``dropout2``) and finally a sigmoid, so the model returns a
    probability suitable for ``nn.BCELoss``.

    Args:
        dropout: Initial drop probability of ``dropout2``.
    """

    def __init__(self, dropout):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout1 = nn.Dropout(0.1)
        # 768 input features -> 1 logit.
        self.linear1 = nn.Linear(768, 1)
        self.dropout2 = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the positive-class probability for each sequence in the batch.

        Args:
            tokens: Token ids passed to BERT as its first positional argument.
            masks: Optional attention mask forwarded to BERT as ``attention_mask``.

        Returns:
            The sigmoid of the (dropout-regularised) logit produced by ``linear1``.
        """
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output1 = self.dropout1(pooled_output)
        # No ReLU on the logit: sigmoid(relu(z)) is always >= 0.5, which made it
        # impossible to output a probability below 0.5 and zeroed the gradient
        # for every example whose logit was negative.
        logit = self.linear1(dropout_output1)
        dropout_output2 = self.dropout2(logit)
        proba = self.sigmoid(dropout_output2)
        return proba


In [None]:
!pip install torchviz
from torchviz import make_dot



In [None]:
torch.cuda.empty_cache()
bert_clf = BertBinaryClassifier(0.1)
# Move the model to the selected `device` instead of calling .cuda()
# unconditionally, so this cell agrees with the CPU fallback chosen for `device`
# and with the `.to(device)` calls used for the input tensors.
bert_clf = bert_clf.to(device)

In [None]:
# Push the first three training sequences through the bare BERT encoder.
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)

# Render the autograd graph of the encoder output to the file "bert-class".
_graph = make_dot(y, params=dict(bert_clf.named_parameters()))
_graph.render("bert-class")
x.shape, y.shape, pooled.shape

In [None]:
y = bert_clf(x)
# Detach from the autograd graph and copy to host memory so the outputs display as a NumPy array.
y.detach().cpu().numpy()

In [None]:
# Report the GPU memory currently allocated by tensors, in units of 10^6 bytes,
# to make sure the GPU is not running out of memory.
f"{torch.cuda.memory_allocated(device) / 1000000}M"

In [None]:
# Drop the references to the sanity-check tensors, then clear the cache for a fresh model run.
x = y = pooled = None
torch.cuda.empty_cache()
f"{torch.cuda.memory_allocated(device) / 1000000}M"

# Fine Tune BERT

In [None]:
# Hyper-parameters shared by the cells below.
BATCH_SIZE, EPOCHS = 8, 10

# Look-back period used by the dynamic-dropout policy: 5% of the epochs, rounded up.
lookback = np.ceil(EPOCHS * 0.05)

In [None]:
def _as_label_column(flags):
    """Reshape a label array to a single column and convert it to a float tensor."""
    return torch.tensor(flags.reshape(-1, 1)).float()


# Token-id tensors.
train_tokens_tensor = torch.tensor(train_tokens_ids)
test_tokens_tensor = torch.tensor(test_tokens_ids)
val_tokens_tensor = torch.tensor(val_tokens_ids)

# Target tensors (one column, float, as expected by BCELoss).
train_y_tensor = _as_label_column(train_y)
test_y_tensor = _as_label_column(test_y)
val_y_tensor = _as_label_column(val_y)

# Attention-mask tensors.
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)
val_masks_tensor = torch.tensor(val_masks)

torch.cuda.empty_cache()
f"{torch.cuda.memory_allocated(device) / 1000000}M"

In [None]:
def _build_loader(tokens, masks, targets, sampler_cls):
    """Bundle the tensors into a dataset and return (dataset, sampler, dataloader)."""
    dataset = TensorDataset(tokens, masks, targets)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


# Training batches are drawn in random order; test and validation keep the original order.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_tokens_tensor, train_masks_tensor, train_y_tensor, RandomSampler)
test_dataset, test_sampler, test_dataloader = _build_loader(
    test_tokens_tensor, test_masks_tensor, test_y_tensor, SequentialSampler)
val_dataset, val_sampler, val_dataloader = _build_loader(
    val_tokens_tensor, val_masks_tensor, val_y_tensor, SequentialSampler)

In [None]:
# NOTE(review): nn.Sigmoid has no learnable parameters, so this list is empty, and
# the optimizer cell that follows is built from bert_clf.parameters() rather than
# from this group. Looks like leftover code; confirm before relying on it.
param_optimizer = [*bert_clf.sigmoid.named_parameters()]
optimizer_grouped_parameters = [{"params": [param for _name, param in param_optimizer]}]

In [None]:
# Adam over every parameter of the classifier (BERT encoder and head) with a learning rate of 3e-6.
optimizer = Adam(params=bert_clf.parameters(), lr=3e-6)

In [None]:
torch.cuda.empty_cache()
# Allocate a 300,000,000-element int8 tensor on the GPU and free it again right away.
# NOTE(review): presumably a probe that this much GPU memory is available; confirm intent.
_scratch = torch.zeros(300000000, dtype=torch.int8).cuda()
del _scratch
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated(device) / 1000000}M")

In [None]:
# Clear PyTorch's CUDA cache so the training run starts from a clean slate.
torch.cuda.empty_cache() 

def set_new_dropout(model, training_losses, validation_losses, window=5, max_p=0.5):
    """Set the rate of ``model.dropout2`` from the recent train/validation loss gap.

    The new rate is ``|mean(train) - mean(val)| / |mean(train)|`` computed over
    the last ``window`` recorded losses and capped at ``max_p``.

    Args:
        model: Module whose direct child named ``'dropout2'`` (an ``nn.Dropout``)
            receives the new rate.
        training_losses: Per-epoch training losses recorded so far.
        validation_losses: Per-epoch validation losses recorded so far.
        window: Number of most recent epochs averaged (default 5).
        max_p: Upper bound for the dropout rate (default 0.5).

    Returns:
        The dropout rate that was applied as a float, or ``None`` when the rate
        was left unchanged because no usable loss history was available.
    """
    print(training_losses)
    print(validation_losses)

    # Without any history there is no gap to measure; keep the current rate.
    if len(training_losses) == 0 or len(validation_losses) == 0:
        return None

    avg_val = float(np.mean(validation_losses[-window:]))
    avg_train = float(np.mean(training_losses[-window:]))
    loss_diff = abs(avg_train - avg_val)

    if avg_train == 0:
        # Avoid dividing by zero: no gap means no dropout, any gap means the cap.
        new_p = 0.0 if loss_diff == 0 else float(max_p)
    else:
        new_p = min(loss_diff / abs(avg_train), float(max_p))

    # A NaN loss would otherwise be written into the layer as an invalid probability.
    if np.isnan(new_p):
        return None

    for name, child in model.named_children():
        if isinstance(child, torch.nn.Dropout) and name == 'dropout2':
            child.p = new_p
    return new_p

        
def run_bert_classifier(policy):
    """Fine-tune the global ``bert_clf`` and return the checkpoint with the lowest validation loss.

    Every call starts from the same point: the encoder is restored to the
    pretrained 'bert-base-uncased' weights, layers exposing ``reset_parameters``
    are re-initialised, and a fresh optimizer (same class and defaults as the
    global ``optimizer``) is created. Previously the encoder weights and the Adam
    state of one run carried over into the next, so consecutive runs were not
    comparable.

    Args:
        policy: When truthy, the rate of ``dropout2`` is re-estimated with
            ``set_new_dropout`` on the epochs selected by ``lookback``; when
            falsy the rate stays at its initial value of 0.1.

    Returns:
        ``(best_model, training_losses, validation_losses, dropouts)``: the model
        reloaded from the ``'best_model_sofar'`` checkpoint, the per-epoch mean
        training and validation losses, and the ``dropout2`` rate of each epoch.
    """
    training_losses = []
    validation_losses = []
    dropouts = []
    best_val_loss = float('inf')
    checkpoint_saved = False
    loss_func = nn.BCELoss()  # stateless, so one instance serves every batch

    # Restore the pretrained encoder in place (parameter objects are kept), then
    # re-initialise the remaining layers that know how to reset themselves.
    pretrained_bert = BertModel.from_pretrained('bert-base-uncased')
    bert_clf.bert.load_state_dict(pretrained_bert.state_dict())
    del pretrained_bert
    for layer in bert_clf.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()

    # Fresh optimizer state for this run, configured like the global optimizer.
    run_optimizer = type(optimizer)(bert_clf.parameters(), **optimizer.defaults)

    n_train_batches = len(train_dataloader)
    n_val_batches = len(test_dataloader)

    # The number of epochs comes from the hyper-parameter cell (EPOCHS) rather
    # than from a local copy that silently shadowed it.
    for epoch_num in range(EPOCHS):
        print('Epoch: ', epoch_num + 1)
        bert_clf.train()

        if epoch_num == 0:
            bert_clf.dropout2.p = 0.1
        elif policy and not (epoch_num + 2) % lookback:
            set_new_dropout(bert_clf, training_losses, validation_losses)

        rate = bert_clf.dropout2.p
        print(f"rate {rate}")
        dropouts.append(rate)

        train_loss = 0.0
        for batch_data in train_dataloader:
            token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
            probas = bert_clf(token_ids, masks)

            batch_loss = loss_func(probas, labels)
            train_loss += batch_loss.item()

            bert_clf.zero_grad()
            batch_loss.backward()

            clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
            run_optimizer.step()

        train_loss = train_loss / max(n_train_batches, 1)
        print("\r" + "{0}/{1} loss: {2} ".format(n_train_batches, n_train_batches, train_loss))

        # NOTE(review): the loss tracked as "validation" is computed on
        # test_dataloader, the same data evaluate() reports on, so checkpoint
        # selection sees the test set. val_dataloader is built earlier but not
        # used here; consider switching once that split is confirmed to be
        # disjoint from the training rows.
        bert_clf.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_data in test_dataloader:
                token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
                probas = bert_clf(token_ids, masks)
                val_loss += loss_func(probas, labels).item()

        val_loss = val_loss / max(n_val_batches, 1)
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            torch.save(bert_clf, 'best_model_sofar')
            checkpoint_saved = True
        print("\r" + "{0}/{1} validation loss: {2} ".format(n_val_batches, n_val_batches, val_loss))
        training_losses.append(train_loss)
        validation_losses.append(val_loss)

    # Make sure the file loaded below belongs to THIS run even if no epoch
    # produced a comparable (non-NaN) validation loss.
    if not checkpoint_saved:
        torch.save(bert_clf, 'best_model_sofar')

    best_model = torch.load('best_model_sofar')
    return best_model, training_losses, validation_losses, dropouts

In [None]:
# First run: dynamic-dropout policy enabled.
(policy_model,
 policy_train,
 policy_val,
 varying_dropouts) = run_bert_classifier(policy=True)

# Second run: policy disabled; the recorded dropout rates are not needed.
model, train, val, _ = run_bert_classifier(policy=False)

In [None]:
import matplotlib.pyplot as plt


def _plot_loss_curves(train_losses, val_losses, title, out_file):
    """Plot per-epoch train and validation losses, save the figure and return it."""
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_ylim(0, 1)
    # The x axis is derived from the data, so the plot cannot fall out of sync
    # with the number of epochs that were actually run.
    ax.plot(range(len(train_losses)), train_losses, label='Train loss')
    ax.plot(range(len(val_losses)), val_losses, label='Validation loss')
    ax.legend()
    fig.savefig(out_file)
    return fig


epochs = len(train)

fig1 = _plot_loss_curves(train, val, "Loss vs epochs without dynamic dropout", 'nopolicy')
fig2 = _plot_loss_curves(policy_train, policy_val, "Loss vs epochs with dynamic dropout", 'policy')

# Dropout rate used in each epoch of the dynamic-dropout run.
fig3, ax3 = plt.subplots()
ax3.set_title("Dropout rate vs epochs with dynamic dropout")
ax3.set_xlabel("Epoch")
ax3.set_ylabel("Dropout rate")
ax3.set_ylim(0, 0.5)
ax3.plot(range(len(varying_dropouts)), varying_dropouts, color='r')
fig3.savefig('dropouts')

# plt.show() instead of fig.show(): the latter warns on non-GUI backends.
plt.show()

In [None]:
def evaluate(test_model, dataloader=None, threshold=0.5):
    """Print a classification report for ``test_model`` on a data loader.

    Args:
        test_model: Model called as ``test_model(token_ids, masks)`` that returns
            one probability per example in column 0.
        dataloader: Loader yielding ``(token_ids, masks, labels)`` batches.
            Defaults to the global ``test_dataloader``.
        threshold: An example is predicted positive when its probability is
            strictly greater than this value (default 0.5).

    Returns:
        None. The report is printed.
    """
    if dataloader is None:
        dataloader = test_dataloader

    test_model.eval()
    bert_predicted = []
    true_labels = []
    with torch.no_grad():
        for batch_data in dataloader:
            token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

            probas = test_model(token_ids, masks)
            bert_predicted.extend((probas[:, 0] > threshold).cpu().tolist())
            # Take the ground truth from the same batches as the predictions so
            # the two always line up, whatever loader is passed in. Labels are
            # stored as 0.0 / 1.0 floats, hence the comparison to recover booleans.
            true_labels.extend((labels[:, 0] > 0.5).cpu().tolist())

    print(classification_report(true_labels, bert_predicted))


In [None]:
# Report for the model returned by the run without the dynamic-dropout policy.
evaluate(model)

In [None]:
# Report for the model returned by the run with the dynamic-dropout policy.
evaluate(policy_model)