# Module 6 Week 4
* Sentiment Analysis using BERT

Step1:

In [1]:
import pandas as pd
import os

# Set the paths to the train and test directories
train_dir = r'C:\Users\User\Desktop\ELU\weekly assignments\module6\week4\aclImdb\train'
test_dir = r'C:\Users\User\Desktop\ELU\weekly assignments\module6\week4\aclImdb\test'

# Load the training data
train_data = []
for label in ['pos', 'neg']:
    path = os.path.join(train_dir, label)
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
            review = file.read()
            train_data.append([review, label])

train_df = pd.DataFrame(train_data, columns=['review', 'label'])

# Load the testing data
test_data = []
for label in ['pos', 'neg']:
    path = os.path.join(test_dir, label)
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
            review = file.read()
            test_data.append([review, label])

test_df = pd.DataFrame(test_data, columns=['review', 'label'])


In [6]:
train_df.head()

Unnamed: 0,review,label
0,Bromwell High is a cartoon comedy. It ran at t...,pos
1,Homelessness (or Houselessness as George Carli...,pos
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos
3,This is easily the most underrated film inn th...,pos
4,This is not the typical Mel Brooks film. It wa...,pos


In [7]:
test_df.head()

Unnamed: 0,review,label
0,I went and saw this movie last night after bei...,pos
1,Actor turned director Bill Paxton follows up h...,pos
2,As a recreational golfer with some knowledge o...,pos
3,"I saw this film in a sneak preview, and it is ...",pos
4,Bill Paxton has taken the true story of the 19...,pos


Step2:

In [8]:
from transformers import BertModel, BertTokenizer

# Load the pre-trained BERT model
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)

# Load the corresponding tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 571kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading model.safetensors: 100%|██████████| 440M/440M [01:29<00:00, 4.91MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a B

* The rest of the code has not runned due to an issue with the 3 step even though i tried to reduce the hyperparameteres,number of epochs and size of the dataset (Tried to use only 0,05% of the dataset) and the code had not executed till 140 minutes.

Step3:

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm

# Define a custom dataset class for IMDb reviews
class IMDbDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        review = self.data.iloc[index]['review']
        label = self.data.iloc[index]['label']
        
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }

# Define hyperparameters and training settings
batch_size = 16
max_length = 512
learning_rate = 2e-5
epochs = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create data loaders for training and testing
train_dataset = IMDbDataset(train_df, tokenizer, max_length)
test_dataset = IMDbDataset(test_df, tokenizer, max_length)

# Move the model and data loaders to the appropriate device
model.to(device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)


# Set the pre-trained model to training mode
model.train()

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune the model on the IMDb dataset
for epoch in range(epochs):
    total_loss = 0
    total_correct = 0
    
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        
        loss = criterion(outputs.logits, labels)
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        total_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
        
        progress_bar.set_postfix({'loss': total_loss / len(train_loader), 'accuracy': total_correct / len(train_dataset)})
    
    # Evaluate the model on the test set after each epoch
    model.eval()
    test_loss = 0
    test_correct = 0
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            
            loss = criterion(outputs.logits, labels)
            
            test_loss += loss.item()
            test_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
    
    average_test_loss = test_loss / len(test_loader)
    test_accuracy = test_correct / len(test_dataset)
    
    print(f'Epoch {epoch + 1}/{epochs} - Test Loss: {average_test_loss:.4f} - Test Accuracy: {test_accuracy:.4f}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')


Epoch 1/3:   0%|          | 0/1250 [00:00<?, ?it/s]

* I couldn't also run the 4th step because i did not executed the 3rd step, but this is how i would do it if i could execute the 3rd step.

Step 4:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Set the fine-tuned model to evaluation mode
model.eval()

# Initialize lists to store the predictions and labels
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_labels = outputs.logits.argmax(dim=1)
        
        all_predictions.extend(predicted_labels.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert the lists to NumPy arrays
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate the evaluation metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
f1 = f1_score(all_labels, all_predictions, average='weighted')

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
