In [1]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, AlbertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [2]:
train_file_path = '/kaggle/input/reviews/DVD11.csv'  # Replace with your train file path
train_data = pd.read_csv(train_file_path, nrows=20000)

# Check the shape to confirm
print(train_data.shape)

(12450, 2)


In [3]:
# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



In [4]:
# Define text cleaning function
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text
    return None

# Apply text cleaning
train_data['cleaned_review'] = train_data['review_body'].apply(clean_text)
train_data = train_data.dropna(subset=['cleaned_review'])

In [5]:
# Tokenize and encode data using BERT tokenizer
max_length = 128

def encode_review_bert(text):
    return bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

input_ids_bert = []
attention_masks_bert = []

for review in train_data['cleaned_review']:
    encoded_review = encode_review_bert(review)
    input_ids_bert.append(encoded_review['input_ids'])
    attention_masks_bert.append(encoded_review['attention_mask'])

# Convert lists to tensors
input_ids_bert = torch.cat(input_ids_bert, dim=0)
attention_masks_bert = torch.cat(attention_masks_bert, dim=0)

In [6]:
# Use existing sentiment labels
labels = torch.tensor(train_data['star_rating'].values)

# Split data into training and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids_bert, attention_masks_bert, labels, test_size=0.1, random_state=42
)

In [7]:
# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

In [8]:
# Initialize ALBERT model for sequence classification
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import AlbertForSequenceClassification, AdamW, get_scheduler
import torch

# Load pre-trained BERT model for sequence classification
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
albert_model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(albert_model.parameters(), lr=2e-5)
epochs = 5
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
albert_model.train()

for epoch in range(epochs):
    for batch in train_dataloader:
        batch_input_ids, batch_masks, batch_labels = [b.to(device) for b in batch]
        outputs = albert_model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print(f"Epoch {epoch+1}/{epochs} completed. Loss: {loss.item()}")



  0%|          | 0/3505 [00:00<?, ?it/s]

Epoch 1/5 completed. Loss: 1.3170685768127441
Epoch 2/5 completed. Loss: 0.013247331604361534
Epoch 3/5 completed. Loss: 0.004870930220931768
Epoch 4/5 completed. Loss: 0.0038811846170574427
Epoch 5/5 completed. Loss: 0.0025908732786774635


In [11]:
# Save the model
albert_model.save_pretrained('./fine_tuned_bert_sentiment_model')
bert_tokenizer.save_pretrained('./fine_tuned_bert_sentiment_model')

('./fine_tuned_bert_sentiment_model/tokenizer_config.json',
 './fine_tuned_bert_sentiment_model/special_tokens_map.json',
 './fine_tuned_bert_sentiment_model/vocab.txt',
 './fine_tuned_bert_sentiment_model/added_tokens.json')

In [12]:
# Load test data
test_file_path = '/kaggle/input/reviews/Books11.csv'  # Replace with your test file path
test_data = pd.read_csv(test_file_path, nrows= 20000) #nrows is to shrink no. of rows to 20k

# Apply text cleaning
test_data['cleaned_review'] = test_data['review_body'].apply(clean_text)
test_data = test_data.dropna(subset=['cleaned_review'])

# Tokenize and encode data
input_ids = []
attention_masks = []

for review in test_data['cleaned_review']:
    encoded_review = encode_review_bert(review)
    input_ids.append(encoded_review['input_ids'])
    attention_masks.append(encoded_review['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Use existing sentiment labels
labels = torch.tensor(test_data['star_rating'].values)

# Create DataLoader
test_data = TensorDataset(input_ids, attention_masks, labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the saved model and tokenizer
albert_model = AlbertForSequenceClassification.from_pretrained('./fine_tuned_bert_sentiment_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert_sentiment_model')

# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
albert_model.to(device)

# Evaluation on test set
albert_model.eval()
all_preds = []
all_labels = []

for batch in test_dataloader:
    batch_input_ids, batch_masks, batch_labels = [b.to(device) for b in batch]
    with torch.no_grad():
        outputs = albert_model(batch_input_ids, attention_mask=batch_masks)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    labels = batch_labels.cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
# Calculate metrics for binary classification
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='binary')
recall = recall_score(all_labels, all_preds, average='binary')
f1 = f1_score(all_labels, all_preds, average='binary')

# Print the metrics in one line
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")


Accuracy: 0.85, Precision: 0.90, Recall: 0.94, F1-score: 0.92
