In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 41.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [2]:
import transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics

In [3]:
class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case =True
                                                       )

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(review,
                                            None,
                                            add_special_tokens=True,
                                            max_length=512,
                                            pad_to_max_length=True
                                            )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        padding_length = 512 - len(ids)
        ids = ids +([0]*padding_length)
        mask = mask + ([0]*padding_length)
        token_type_ids = token_type_ids + ([0]* padding_length)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)
            }

In [4]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.model = transformers.BertModel.from_pretrained("bert-base-uncased", return_dict=False)
        self.model_drop = nn.Dropout(0.3)
        self.linear = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        out_1, out_2 = self.model(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids
                                  )
        return self.linear(self.model_drop(out_2))

In [5]:
df = pd.read_csv('/data/IMDB Dataset.csv').fillna("none")
df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)

df_train, df_valid = model_selection.train_test_split(df, test_size = 0.1,\
                                                      random_state = 42,\
                                                      stratify = df.sentiment.values
                                                      )
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)


train_dataset = BERTDataset(review = df_train.review.values,\
                            target = df_train.sentiment.values
                            )
train_dataloader = DataLoader(train_dataset, batch_size=8, num_workers = 2)

valid_dataset = BERTDataset(review = df_valid.review.values,\
                            target = df_valid.sentiment.values
                            )
valid_dataloader = DataLoader(valid_dataset, batch_size=4, num_workers = 1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / 8 * 10)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, 
                                            num_training_steps=num_train_steps
                                            )

best_accuracy = 0
for epoch in range(2):
  model.train()
  for bi, d in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
    ids = d["ids"]
    token_type_ids = d["token_type_ids"]
    mask = d["mask"]
    targets = d["targets"]
    ids = ids.to(device, dtype = torch.long)
    token_type_ids = token_type_ids.to(device, dtype = torch.long)
    mask = mask.to(device, dtype = torch.long)
    targets = targets.to(device, dtype=torch.float)
    optimizer.zero_grad()
    outputs = model(ids = ids,\
                    mask = mask,\
                    token_type_ids = token_type_ids
                    )
    loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
    loss.backward()
    optimizer.step()
    scheduler.step()
  model.eval()
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
    for bi, d in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
      ids = d["ids"]
      token_type_ids = d["token_type_ids"]
      mask = d["mask"]
      targets = d["targets"]
      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.float)
      outputs = model(ids=ids,\
                      mask=mask,\
                      token_type_ids=token_type_ids
                      )
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
  outputs = np.array(fin_outputs) >= 0.5
  accuracy = metrics.accuracy_score(fin_targets, outputs)
  print(f"Accuracy Score = {accuracy}")
  if accuracy > best_accuracy:
    torch.save(model.state_dict(), "model.bin")
    best_accuracy = accuracy

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Accuracy Score = 0.9374


  0%|          | 0/5625 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 5625/5625 [1:13:23<00:00,  1.28it/s]
  0%|          | 0/1250 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to 

Accuracy Score = 0.937



