SENTIMENT ANALYSIS

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import re

In [17]:
df = pd.read_csv('Dataset/IMDB_Dataset.csv')
def preprocess(text):
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text
df['cleaned_text'] = df['review'].apply(preprocess)
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.head()

Unnamed: 0,review,sentiment,cleaned_text
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love in the time of money is a ...


In [18]:
maximum_length = 1500
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x[:maximum_length])

In [19]:
minimum_length = 600
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.ljust(minimum_length))

In [20]:
print(df['cleaned_text'].apply(len).describe())


count    50000.000000
mean      1014.992520
std        356.805802
min        600.000000
25%        667.000000
50%        924.000000
75%       1500.000000
max       1500.000000
Name: cleaned_text, dtype: float64


In [21]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_text'].values, df['sentiment'].values, test_size=0.2, random_state=42)


In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased',num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
class ReviewDataset(Dataset):
    def __init__(self, review, labels):
        self.review = review
        self.labels = labels
    
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, idx):
        review = str(self.review[idx])
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(
            review, 
            add_special_tokens=True,
            max_length= 128,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label,dtype=torch.long)
        }
        

In [24]:
train_dataset = ReviewDataset(train_texts, train_labels)
val_dataset = ReviewDataset(val_texts, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

In [25]:
def train_model(model, train_loader, val_loader,epochs = 3):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    model.train()
    
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/epochs")
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Training loss for epoch {epoch+1}: {total_loss / len(train_loader)}")
        
        model.eval()
        val_loss = 0
        preds, true_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                preds.extend(torch.argmax(outputs.logits, axis = 1).tolist())
                true_labels.extend(labels.tolist())
        accuracy = sum([p == t for p, t in zip(preds, true_labels)]) / len(true_labels)
        print(f"Validation Accuracy after epoch {epoch+1}: {accuracy}")
    return model
        

In [26]:
trained_model = train_model(model, train_dataloader, val_dataloader)
torch.save(trained_model.state_dict(), 'Model/BERT_Model.pt')
print("Model has been saved")



Epoch 1/epochs
Training loss for epoch 1: 0.32617951369807124
Validation Accuracy after epoch 1: 0.8824
Epoch 2/epochs
Training loss for epoch 2: 0.17833268203828484
Validation Accuracy after epoch 2: 0.8899
Epoch 3/epochs
Training loss for epoch 3: 0.06506620869969483
Validation Accuracy after epoch 3: 0.8811
Model has been saved
