In [0]:
!pip install transformers

In [0]:
import transformers
import torch.nn as nn
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import os,json
import pandas as pd


In [0]:
token = {"username":"vinaybhupalam","key":"5c27e8b73e30502d96da3d5b41a37fed"}
if not os.path.exists('/root/.kaggle'):

  with open('/root/.kaggle/kaggle.json', 'w+') as file:
    
    json.dump(token, file)

In [0]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
!unzip imdb-dataset-of-50k-movie-reviews.zip 

Archive:  imdb-dataset-of-50k-movie-reviews.zip
replace IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [0]:
import transformers
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 10
MODEL_PATH = "model.bin"
TRAINING_FILE = "IMDB Dataset.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True
)

In [0]:
class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
    
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        x = torch.tensor(self.target[item], dtype=torch.float, device=torch.device("cuda"))

        return {
            'ids': torch.tensor(ids, dtype=torch.long, device=torch.device("cuda")),
            'mask': torch.tensor(mask, dtype=torch.long, device=torch.device("cuda")),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long, device=torch.device("cuda")),
            'targets' : torch.tensor(self.target[item], dtype=torch.float, device=torch.device("cuda"))
        }

In [0]:

class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(os.getcwd())
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
    
    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(
            ids, 
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        bo = self.bert_drop(o2)
        output = self.out(bo)        
        return output

In [0]:
print(torch.cuda.is_available())

True


In [0]:
from tqdm import tqdm

def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))


def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)


        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            print("Eval fuction target datatype")
            print(targets.device)
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

Read the dataset and create train and validation data loaders

In [0]:

dfx = pd.read_csv(TRAINING_FILE).fillna("none")
dfx.sentiment = dfx.sentiment.apply(
    lambda x: 1 if x == "positive" else 0
)

df_train, df_valid = model_selection.train_test_split(
    dfx[0:10000],
    test_size=0.1,
    random_state=42,
    stratify=dfx[0:10000].sentiment.values
)

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)


train_dataset = BERTDataset(
    review=df_train.review.values,
    target=df_train.sentiment.values
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)

valid_dataset = BERTDataset(
    review=df_valid.review.values,
    target=df_valid.sentiment.values
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=0
)

device = torch.device("cuda")
print(device)


In [0]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cd /content/drive/My\ Drive

/content/drive/My Drive


In [0]:
!ls

In [0]:
os.getcwd()

'/content/drive/My Drive'

Train the Model

In [0]:
model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)

#model = nn.DataParallel(model)
import numpy as np
best_accuracy = 0
for epoch in range(EPOCHS):
    print("Epoch {} " .format(epoch))
    train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval_fn(valid_data_loader, model, device)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    print(f"Accuracy Score = {accuracy}")
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), "model.bin")
        best_accuracy = accuracy
   

Evaluate the model with the Test Set of 5000 datapoints

In [0]:
df_test = dfx[10000:15000].reset_index(drop=True)
test_dataset = BERTDataset(
    review=df_test.review.values,
    target=df_test.sentiment.values
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)

In [0]:
MODEL = BERTBaseUncased()    
MODEL.load_state_dict(torch.load("model.bin"))

<All keys matched successfully>

In [0]:
outputs, targets = eval_fn(valid_data_loader, model, device)
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)

In [0]:
print(f"Accuracy Score = {accuracy}")

Accuracy Score = 0.889


In [0]:
from sklearn.metrics import classification_report
print(classification_report(targets, outputs))

              precision    recall  f1-score   support

         0.0       0.97      0.80      0.88       497
         1.0       0.83      0.98      0.90       503

    accuracy                           0.89      1000
   macro avg       0.90      0.89      0.89      1000
weighted avg       0.90      0.89      0.89      1000

