In [1]:
pip install transformers torch nltk


Note: you may need to restart the kernel to use updated packages.


In [13]:
# Import necessary libraries
# Step 2: Import Libraries and Load the Dataset
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import accuracy_score

# Source CSVs: the model is trained on one file and tested on the other.
train_file_path = 'csv files/Kitchen1.csv'  # Replace with the path to your training CSV file
test_file_path = 'csv files/DVD11.csv'    # Replace with the path to your testing CSV file

# Read both files in one pass, training frame first.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Preview the first rows of each frame to check that the columns loaded as expected.
for caption, frame in (("Training Data:\n", train_data), ("Test Data:\n", test_data)):
    print(caption, frame.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rupam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training Data:
    star_rating                                        review_body
0            1  As a family allergic to wheat, dairy, eggs, nu...
1            1  My favorite nut.  Creamy, crunchy, salty, and ...
2            1  This green tea tastes so good! My girlfriend l...
3            1  I love Melissa's brand but this is a great sec...
4            1                                               good
Test Data:
    star_rating                                        review_body
0            1  I loved it and I wish there was a season 3... ...
1            1  As always it seems that the best shows come fr...
2            1  This movie isn't perfect, but it gets a lot of...
3            1                excellant this is what tv should be
4            1  Brilliant film from beginning to end. All of t...


In [14]:
# Step 3: Data Preprocessing (Text Cleaning, Tokenization, Padding)
import torch
import re
from transformers import BertTokenizer

# Initialize the BERT tokenizer from the 'bert-base-uncased' checkpoint.
# encode_review (defined below) reads this module-level object, and the final
# cell saves it alongside the fine-tuned model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define text cleaning function
def clean_text(text):
    """Normalize one raw review before tokenization.

    Lowercases the text, deletes every character that is neither a word
    character nor whitespace, then collapses each whitespace run to a single
    space and trims both ends.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example the
            float NaN pandas stores for an empty cell) is treated as missing.

    Returns:
        The cleaned string, or ``None`` for non-string input so the caller can
        drop the row with ``dropna``.
    """
    if not isinstance(text, str):
        return None
    # Strip punctuation first: collapsing whitespace before this step would
    # leave a double space wherever a free-standing symbol is deleted
    # (e.g. "good - bad" -> "good  bad").
    without_symbols = re.sub(r'[^\w\s]', '', text.lower())
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Add a 'cleaned_review' column to the training frame and discard rows whose
# review_body was not a string (clean_text yields None for those, and dropna
# removes them).
train_data = (
    train_data
    .assign(cleaned_review=lambda frame: frame['review_body'].apply(clean_text))
    .dropna(subset=['cleaned_review'])
)

# Same treatment for the test frame.
test_data = (
    test_data
    .assign(cleaned_review=lambda frame: frame['review_body'].apply(clean_text))
    .dropna(subset=['cleaned_review'])
)

# Tokenize the reviews using BERT tokenizer

def encode_review(text, max_length=128):
    """Tokenize one cleaned review into fixed-length BERT inputs.

    Args:
        text: The review string to encode.
        max_length: Number of tokens every sequence is padded or truncated
            to, special tokens included. This used to be read from a global
            that the notebook never defines, which raises NameError on a
            fresh kernel. The default of 128 is an assumption consistent with
            the recorded sample output (padded beyond 120 tokens); adjust it
            if the original run used another value.

    Returns:
        The tokenizer's encoding, holding PyTorch tensors under the
        ``'input_ids'`` and ``'attention_mask'`` keys. Each tensor carries a
        leading batch dimension of 1, which is why callers concatenate the
        per-review results along dim 0.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated encode_plus and accepts the same arguments.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

def _encode_all(reviews):
    """Encode every review and stack the per-review tensors along dim 0.

    Returns a ``(input_ids, attention_masks)`` pair of tensors with one row
    per review, in the order the reviews were given.
    """
    encodings = [encode_review(review) for review in reviews]
    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks


# Tokenize the training reviews, then the test reviews.
train_input_ids, train_attention_masks = _encode_all(train_data['cleaned_review'])
test_input_ids, test_attention_masks = _encode_all(test_data['cleaned_review'])

# The 'star_rating' column already holds the sentiment label (0 or 1), so it is
# converted to a tensor as-is for both splits.
train_labels, test_labels = (
    torch.tensor(frame['star_rating'].values) for frame in (train_data, test_data)
)

# Show the first encoded example of each split as a quick sanity check.
for caption, sample in (
    ("Sample train input IDs:", train_input_ids[0]),
    ("Sample train attention mask:", train_attention_masks[0]),
    ("Sample train label:", train_labels[0]),
    ("Sample test input IDs:", test_input_ids[0]),
    ("Sample test attention mask:", test_attention_masks[0]),
    ("Sample test label:", test_labels[0]),
):
    print(caption, sample)




Sample train input IDs: tensor([  101,  2004,  1037,  2155, 27395,  2000, 10500, 11825,  6763, 12264,
         1998,  2195,  2060,  2477,  2057,  2293,  1996,  2972, 26369,  2015,
         2173,  2240,  1997,  3688,  2004,  2009,  4473,  2149,  2000,  8670,
         3489, 18452,  2007, 10124,  3947,  1998, 12760,  2087,  2035, 24395,
        23301,  1998,  1043,  7630,  6528, 23301, 21109,  2788,  2074, 18168,
         4183,  2028,  2030,  2048,  2035,  2121, 21230,  2012,  2087,  2061,
         2049,  2307,  2000,  2156,  1037,  4666,  2580,  2302,  2116,  1997,
         1996,  2087,  2691,  2035,  2121, 21230,  3602,  2122,  2145,  2031,
        25176,  1998,  9781,  2057, 16678,  2122,  2006,  1037,  3180,  3978,
         1998,  2031,  2042,  2725,  2061,  2005,  2086,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     

In [15]:
# Step 4: Prepare Data for Training
from torch.utils.data import TensorDataset, DataLoader

# Create PyTorch DataLoaders for the training and test sets.

batch_size = 16

# The datasets get their own names instead of rebinding train_data/test_data:
# those names hold the pandas DataFrames the preprocessing cell reads, and
# overwriting them with TensorDatasets made that cell fail when re-run.

# Training set: reshuffled every epoch.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test set: kept in file order.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Print to confirm successful creation of dataloaders
print(f"Training data size: {len(train_dataloader.dataset)}")
print(f"Test data size: {len(test_dataloader.dataset)}")




Training data size: 204687
Test data size: 12446


In [16]:
# Step 5: Load BERT Model and Set Up Optimizer
# AdamW comes from torch.optim: the copy that transformers used to export is
# deprecated and no longer present in recent releases, and importing it here
# also shadowed the torch.optim.AdamW imported at the top of the notebook.
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_scheduler
import torch

# Load pre-trained BERT with a 2-class classification head
# (the head weights are newly initialized and trained below).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Print whether the program is executing on CPU or GPU
if device.type == 'cuda':
    print("Model is running on GPU")
else:
    print("Model is running on CPU")

# Define optimizer.
# NOTE: torch's AdamW defaults to weight_decay=0.01 and eps=1e-8, whereas the
# removed transformers class defaulted to weight_decay=0.0 and eps=1e-6; pass
# those values explicitly if an older run has to be reproduced exactly.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define learning rate scheduler: linear decay over every optimizer step of
# the whole run, without warm-up.
epochs = 3
num_training_steps = epochs * len(train_dataloader)  # Make sure train_dataloader is defined earlier
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is running on CPU




In [12]:
# Step 6: Training the BERT Model
from tqdm.auto import tqdm

# Progress bar with one tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()  # Put the model in training mode before the first batch
for epoch in range(epochs):
    epoch_loss = 0  # Running sum of the per-batch losses for this epoch
    for step_batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, targets = (t.to(device) for t in step_batch)

        # Forward pass; passing labels makes the model return the loss.
        loss = model(input_ids, attention_mask=attention_mask, labels=targets).loss
        epoch_loss += loss.item()
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Report the mean batch loss for the epoch
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} completed. Average Loss: {avg_epoch_loss:.4f}")



  0%|          | 0/2103 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Step 7: Evaluate the Model on the Held-Out Test Set
from sklearn.metrics import accuracy_score
import torch

# Set model to evaluation mode
model.eval()

all_preds = []
all_labels = []

# The notebook never creates a validation split or a `val_dataloader`; the only
# held-out loader built earlier is `test_dataloader`, so iterate over that one.
# Gradients are not needed for scoring, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_masks, batch_labels = [b.to(device) for b in batch]

        outputs = model(batch_input_ids, attention_mask=batch_masks)

        # Predicted class = index of the largest logit for each example.
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())

# Calculate accuracy over every test example
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Step 8: Save the Model (Optional)
# Write the fine-tuned model and its tokenizer into the same directory so the
# two can be reloaded together later.
save_dir = './fine_tuned_bert_sentiment_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
