In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [5]:
import pandas as pd
import torch
import random
import numpy as np
from torch import nn
from tqdm import tqdm
from torch.optim import Adam
from transformers import BertModel
from transformers import BertTokenizer

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [6]:
import pandas as pd
datapath = 'drive/MyDrive/IMDB Dataset.csv'
df = pd.read_csv(datapath)

In [7]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'negative':0,
          'positive':1
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['sentiment']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['review']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']

train_count = round(0.7 * (df.shape[0] / 2)) * 2
train_percentage = train_count / df.shape[0]

X_train, X, y_train, y = train_test_split(X,y, stratify=y, train_size=train_percentage, random_state=SEED)

# train here refers to test set
train_count = round((2/3) * (df.shape[0] / 2)) * 2
train_percentage = train_count / df.shape[0]

X_test, X_val, y_test, y_val = train_test_split(X,y, stratify=y, train_size=train_percentage, random_state=SEED)


In [9]:
df_train = pd.concat([X_train, y_train], axis=1, ignore_index=False)
df_val = pd.concat([X_val, y_val], axis=1, ignore_index=False)
df_test = pd.concat([X_test, y_test], axis=1, ignore_index=False)

In [10]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        out = self.linear_relu_stack(dropout_output)
        out = torch.sigmoid(out)

        return out

In [11]:
loss_fn = nn.BCELoss()

In [12]:
from torch.optim import Adam
from tqdm import tqdm

batch_s = 8

def train(model, train_data, val_data, learning_rate, START_EPOCH, MAX_EPOCHS, loss_fn, acc_diff, path):
    torch.manual_seed(SEED)

    def seed_worker(worker_id):
        np.random.seed(SEED)
        random.seed(SEED)

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_s, shuffle=True, worker_init_fn=seed_worker,)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_s, worker_init_fn=seed_worker,)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = loss_fn
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()
    
    prev_acc = 0
    for epoch_num in range(START_EPOCH, MAX_EPOCHS+1):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                output = torch.flatten(output)

                batch_loss = criterion(output, train_label.to(torch.float32))
                total_loss_train += batch_loss.item()
                
                output = torch.round(output)
                acc = (output == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)
                    output = torch.flatten(output)

                    batch_loss = criterion(output, val_label.to(torch.float32))
                    total_loss_val += batch_loss.item()
                        
                    output = torch.round(output)
                    acc = (output == val_label).sum().item()
                    total_acc_val += acc

            results = f'Epochs: {epoch_num} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}'
            print(results)
            with open(f"{path}{learning_rate}_epoch{epoch_num}.txt", "w") as text_file:
                text_file.write(results)
            
            torch.save(model,f'{path}{learning_rate}_epoch{epoch_num}')

            if (total_acc_val / len(val_data)) - prev_acc <= acc_diff:
                return (total_loss_val / len(val_data), total_acc_val / len(val_data))
            
            prev_acc = total_acc_val / len(val_data)

In [16]:
START_EPOCH = 3 # inclusive
MAX_EPOCHS = 5 # inclusive
acc_diff = 0.00 # 0.5%
path = '/content/drive/MyDrive/models/no_preprocessing/'

model = BertClassifier()


for LR in [1e-5]:
    train(model, df_train, df_val, LR, START_EPOCH, MAX_EPOCHS, loss_fn, acc_diff, path)


100%|██████████| 4375/4375 [51:58<00:00,  1.40it/s]


Epochs: 3 | Train Loss:  0.005                 | Train Accuracy:  0.988                 | Val Loss:  0.025                 | Val Accuracy:  0.938


100%|██████████| 4375/4375 [51:59<00:00,  1.40it/s]


Epochs: 4 | Train Loss:  0.004                 | Train Accuracy:  0.993                 | Val Loss:  0.025                 | Val Accuracy:  0.935
