### Training Process

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import re

In [2]:
def seq_processing(s):
    """Normalize a raw text string before it is tokenized.

    Steps, in order:
      1. Lower-case the text.
      2. Replace every ``@mention`` (an ``@``, the run of non-whitespace
         characters after it, and the whitespace character that follows)
         with a single space.
      3. Collapse each run of whitespace into one space.
      4. Decode the HTML entity ``&amp;`` (with or without the trailing
         semicolon) to a literal ``&``.

    Args:
        s (str): The text to clean.

    Returns:
        str: The cleaned text. Leading/trailing spaces are not stripped.
    """
    s = s.lower()
    # The previous pattern `(@.?)[\s]` matched "@" plus at most ONE character,
    # so ordinary handles such as "@united " were never removed.
    s = re.sub(r'@\S*\s', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    # Consume the optional ';' as well, otherwise "&amp;" became "&;".
    s = re.sub(r'&amp;?', '&', s)
    return s

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report, accuracy_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from datasets import load_dataset

# Define constants
MAX_SEQ_LEN = 512      # every review is padded/truncated to this many tokens
BATCH_SIZE = 16
NUM_CLASSES = 1        # output width of the classifier's linear layer
LEARNING_RATE = 1e-5
NUM_EPOCHS = 3
SEED = 42              # fixed so a re-run starts from the same random state

# Seed PyTorch's global RNG before any model or DataLoader is created: the
# classifier head's initialisation, dropout and batch shuffling draw from it.
torch.manual_seed(SEED)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Load the IMDb dataset
dataset = load_dataset('imdb')

# Split the dataset into train and test sets.
# NOTE: `train_dataset` / `test_dataset` are rebound to IMDbDataset wrappers
# further down; only the text/label columns extracted here are used later.
train_dataset, test_dataset = dataset['train'], dataset['test']
train_texts, train_labels = train_dataset['text'], train_dataset['label']
test_texts, test_labels = test_dataset['text'], test_dataset['label']

# Create the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the IMDbDataset class
class IMDbDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    ``texts`` and ``labels`` are parallel, integer-indexable containers and
    ``tokenizer`` must provide an ``encode_plus`` method.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        """Return how many texts the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            tuple: ``(input_ids, attention_mask, label)``. Both tensors are the
            tokenizer's outputs with size-1 dimensions squeezed away; the label
            is returned exactly as stored.
        """
        review, target = self.texts[idx], self.labels[idx]
        # Pad/truncate to MAX_SEQ_LEN so every sample has the same length.
        encoded = self.tokenizer.encode_plus(
            review,
            max_length=MAX_SEQ_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        token_ids = encoded['input_ids'].squeeze()
        mask = encoded['attention_mask'].squeeze()
        return token_ids, mask, target

# Wrap the text/label columns so that each lookup tokenizes a single review
train_dataset = IMDbDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = IMDbDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Create the data loaders
def collate_fn(batch):
    """Merge a list of dataset samples into one batch of CPU tensors.

    Args:
        batch: Sequence of ``(input_ids, attention_mask, label)`` samples.

    Returns:
        tuple: ``(input_ids, attention_mask, labels)``. The first two are
        stacked batch-first and right-padded with zeros to the longest sample
        in the batch; ``labels`` is a 1-D tensor built from the sample labels.

    The batch is deliberately left on the CPU. The training and evaluation
    loops already move every tensor to ``device`` themselves, and creating
    device tensors inside a collate function breaks multi-worker loading.
    """
    input_ids_list, attention_mask_list, labels = zip(*batch)
    input_ids_padded = pad_sequence(input_ids_list, batch_first=True)
    attention_mask_padded = pad_sequence(attention_mask_list, batch_first=True)
    labels_tensor = torch.tensor(labels)
    return input_ids_padded, attention_mask_padded, labels_tensor

# Both loaders share batch size and collation; only the training one shuffles
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Define the IMDbClassifier model
class IMDbClassifier(nn.Module):
    """BERT encoder followed by a small sigmoid classification head.

    The head applies dropout, batch normalization and a linear layer to BERT's
    ``pooler_output`` and passes the result through a sigmoid. ``criterion``
    holds the ``BCELoss`` instance the training loop applies to that output.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, NUM_CLASSES)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the head's output for a batch of sequences.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
        """
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        features = self.bn(self.dropout(bert_out['pooler_output']))
        return self.sigmoid(self.fc(features))

# Build the classifier and place its parameters on the selected device
model = IMDbClassifier()
model = model.to(device)

# Adam over every parameter (BERT encoder and classification head alike)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Training loop: one optimizer step per batch, mean batch loss printed per epoch
model.train()
for epoch in range(NUM_EPOCHS):
    progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    running_loss = 0.0
    for batch in progress:
        # Move the three batch tensors to the device in their original order.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # BCELoss needs float targets with the same shape as the predictions.
        loss = model.criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    epoch_loss = running_loss / len(train_loader)
    print(f'Training Loss: {epoch_loss:.4f}')

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1/3: 100%|██████████| 1563/1563 [25:23<00:00,  1.03it/s]


Training Loss: 0.2597


Epoch 2/3: 100%|██████████| 1563/1563 [25:21<00:00,  1.03it/s]


Training Loss: 0.1568


Epoch 3/3: 100%|██████████| 1563/1563 [25:19<00:00,  1.03it/s]

Training Loss: 0.0966





In [4]:
# Evaluation loop: collect per-batch labels and rounded predictions on the CPU,
# then score them once with scikit-learn.
model.eval()
real_batches, pred_batches = [], []

with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(
        test_loader, desc="Evaluating"
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask)
        # squeeze(-1) removes only the trailing class dimension. A bare
        # squeeze() would also remove the batch dimension when the final batch
        # holds a single sample, yielding a 0-d tensor that cannot be
        # concatenated with the other batches.
        predicted = torch.round(outputs).squeeze(-1)

        # Labels are only needed for scoring, so they stay on the CPU.
        real_batches.append(labels.detach().cpu())
        pred_batches.append(predicted.detach().cpu())

# Concatenate once instead of re-copying the growing tensors every iteration.
# Labels are cast to float so both tensors share the predictions' dtype.
real = torch.cat(real_batches, dim=0).float()
pred = torch.cat(pred_batches, dim=0)

accuracy = accuracy_score(real, pred)
f1 = f1_score(real, pred)

print(classification_report(real, pred))
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

Evaluating: 100%|██████████| 1563/1563 [10:53<00:00,  2.39it/s]

              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94     12500
         1.0       0.94      0.94      0.94     12500

    accuracy                           0.94     25000
   macro avg       0.94      0.94      0.94     25000
weighted avg       0.94      0.94      0.94     25000

Test Accuracy: 0.9399
Test F1 Score: 0.9398





In [5]:
# Persist only the parameters (state dict); the inference cell below rebuilds
# the architecture and loads this file.
weights = model.state_dict()
torch.save(weights, 'bert-weight.pt')

### Inference Process

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from datasets import load_dataset
import re

# Constants. The code in this cell reads only MAX_SEQ_LEN and NUM_CLASSES.
MAX_SEQ_LEN = 512      # the input sentence is padded/truncated to this length
BATCH_SIZE = 16
NUM_CLASSES = 1        # output width of the classifier's linear layer
LEARNING_RATE = 1e-5
NUM_EPOCHS = 5         # NOTE(review): the training cell above used 3; unused here

# Pick the GPU when one is visible, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
torch.cuda.empty_cache()

# Tokenizer from the same checkpoint name the classifier's BERT encoder uses
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def seq_processing(s):
    """Normalize a raw text string before it is tokenized.

    Steps, in order:
      1. Lower-case the text.
      2. Replace every ``@mention`` (an ``@``, the run of non-whitespace
         characters after it, and the whitespace character that follows)
         with a single space.
      3. Collapse each run of whitespace into one space.
      4. Decode the HTML entity ``&amp;`` (with or without the trailing
         semicolon) to a literal ``&``.

    Args:
        s (str): The text to clean.

    Returns:
        str: The cleaned text. Leading/trailing spaces are not stripped.
    """
    s = s.lower()
    # The previous pattern `(@.?)[\s]` matched "@" plus at most ONE character,
    # so ordinary handles such as "@united " were never removed.
    s = re.sub(r'@\S*\s', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    # Consume the optional ';' as well, otherwise "&amp;" became "&;".
    s = re.sub(r'&amp;?', '&', s)
    return s

# Define the IMDbClassifier model
class IMDbClassifier(nn.Module):
    """BERT encoder followed by a small sigmoid classification head.

    The head applies dropout, batch normalization and a linear layer to BERT's
    ``pooler_output`` and passes the result through a sigmoid. Attribute names
    determine the ``state_dict`` keys, so they must match the saved weights.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, NUM_CLASSES)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the head's output for a batch of sequences.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
        """
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        features = self.bn(self.dropout(bert_out['pooler_output']))
        return self.sigmoid(self.fc(features))

#########    

# Clean and tokenize the single sentence to classify. The encoding mirrors the
# settings used for training samples (fixed-length padding, truncation).
sentence = seq_processing('This movie sucks.')
encoding = tokenizer.encode_plus(
    sentence,
    max_length=MAX_SEQ_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
# return_tensors='pt' already yields a leading batch dimension of size 1.
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# Index = rounded sigmoid output (0 -> Negative, 1 -> Positive)
class_names = ['Negative', 'Positive']

######### 

# Create the model
model = IMDbClassifier().to(device)
# map_location remaps the stored tensors onto the current device, so weights
# saved on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('bert-weight.pt', map_location=device))

# eval() disables dropout and makes batch norm use its running statistics,
# which is required for a batch containing a single sentence.
model.eval()

with torch.no_grad():
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    # One sentence in, so squeeze() reduces the output to a scalar tensor.
    output = model(input_ids, attention_mask).squeeze()
    print(f'The predicted sentence: {sentence}')
    print(f'Output: {class_names[round(float(output))]}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The predicted sentence: this movie sucks.
Output: Negative
