In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch


# Load the IMDB reviews CSV; the 'review' and 'sentiment' columns are used below.
# NOTE(review): this is a machine-specific absolute path - point it at your own
# copy of the dataset before running on another machine.
df = pd.read_csv(r'C:\Users\Rohan\Downloads\NLP MIni\IMDB Dataset.csv')

# Pre-trained tokenizer and classification model loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Binary sentiment task: the labels are mapped to the two class ids 0 / 1
# further down in this cell, so request two output classes explicitly.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Raw review texts and their string sentiment labels as numpy arrays.
X = df['review'].values
y = df['sentiment'].values

# Tokenise every review to a fixed length of 128 tokens: longer reviews are
# truncated, shorter ones are padded. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True` flag, and the tokenizer is called
# directly (the recommended entry point) with a plain list of strings.
encoded_data = tokenizer(
    X.tolist(),
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Tensors handed to the model: token ids and the matching attention masks.
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']


# Split token ids, string labels and attention masks in a single call so the
# three stay row-aligned; 20% is held out for testing and the seed is fixed.
X_train, X_test, y_train, y_test, train_masks, test_masks = train_test_split(
    input_ids, y, attention_masks,
    test_size=0.2, random_state=0
)

# Map the string sentiment to a class id: 'positive' -> 1, anything else -> 0.
y_train = [1 if label == 'positive' else 0 for label in y_train]
y_test = [1 if label == 'positive' else 0 for label in y_test]

# Build each dataset exactly once, with an explicit integer label dtype
# (CrossEntropyLoss expects integer class indices as targets). The earlier
# version rebuilt both datasets a second time without the dtype, silently
# overwriting these.
train_data = TensorDataset(X_train, train_masks, torch.tensor(y_train, dtype=torch.long))
test_data = TensorDataset(X_test, test_masks, torch.tensor(y_test, dtype=torch.long))

batch_size = 32
# Shuffle the training batches; iterate the test set in its stored order.
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)


# Run on the GPU when one is available, otherwise stay on the CPU. The model is
# moved before the optimizer is created, and every batch is moved to the same
# device inside both loops below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()


epochs = 5


# ---- Fine-tuning ----
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for batch in train_dataloader:
        # Each batch is (token ids, attention mask, labels).
        inputs, masks, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # One progress line per epoch so a long run gives some feedback.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f'Epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}')

# ---- Evaluation on the held-out split ----
model.eval()
# NOTE: `accuracy` accumulates the COUNT of correct predictions and `total`
# the number of labels seen; the percentage is only formed in the print.
# Both names are read again by a later cell, so they are kept as-is.
accuracy = 0
total = 0

with torch.no_grad():
    for batch in test_dataloader:
        inputs, masks, labels = (t.to(device) for t in batch)
        outputs = model(inputs, attention_mask=masks)
        # Predicted class = index of the largest logit per row.
        predicted = outputs.logits.argmax(dim=1)
        total += labels.size(0)
        accuracy += (predicted == labels).sum().item()


print(f'Accuracy: {accuracy / total * 100:.2f}%')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [14]:
# Re-print the test accuracy from the counters the evaluation loop left in the
# kernel: `accuracy` is the number of correct predictions and `total` the
# number of labels seen, so this cell only works after that loop has run.
print(f'Accuracy: {accuracy / total * 100:.2f}%')

Accuracy: 93.50%


In [18]:

# Directory that receives the fine-tuned artefacts.
output_dir = "./model_bert" 

# Write the model, then the tokenizer, then the model's config into the same
# directory, in that order.
# NOTE(review): model.save_pretrained presumably already writes the config, so
# the explicit config save looks redundant - confirm before removing it.
for artifact in (model, tokenizer, model.config):
    artifact.save_pretrained(output_dir)
