In [None]:
# this is for small testing
import pandas as pd
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split



In [None]:
# Location of the training CSV; point this at your own file when re-using the notebook.
train_file_path = '/content/DVD11.csv'

# Read at most the first 20000 rows to keep the experiment small.
train_data = pd.read_csv(filepath_or_buffer=train_file_path, nrows=20000)

# Print (rows, columns) as a quick check on what was loaded.
print(train_data.shape)

In [None]:
from transformers import RobertaTokenizer
# Tokenizer for the 'roberta-base' checkpoint; encode_review uses it to turn text into model inputs.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define text cleaning function
def clean_text(text):
    """Normalise a raw review string before tokenization.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            float NaN from an empty CSV cell) is treated as missing.

    Returns:
        The cleaned string, or ``None`` when ``text`` is not a string so that
        the row can later be removed with ``dropna``.
    """
    if not isinstance(text, str):
        return None
    text = text.lower()
    # Remove punctuation *before* collapsing whitespace: deleting a
    # free-standing symbol (e.g. " - ") would otherwise leave a double space.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every review body, then keep only the rows for which cleaning produced a string
train_data['cleaned_review'] = train_data['review_body'].map(clean_text)
train_data = train_data[train_data['cleaned_review'].notna()]

# Fixed sequence length (in tokens) that every encoded review is padded/truncated to
max_length = 128

def encode_review(text):
    """Tokenize one cleaned review into fixed-length model inputs.

    Args:
        text: The review text to encode.

    Returns:
        The tokenizer's encoding with PyTorch tensors; callers read the
        ``'input_ids'`` and ``'attention_mask'`` entries. Sequences are padded
        or truncated to the module-level ``max_length``.
    """
    # Call the tokenizer directly: ``encode_plus`` is deprecated in favour of
    # ``__call__``, which produces the same encoding for a single string.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode every cleaned review once and keep the per-review encodings
encoded_reviews = [encode_review(review) for review in train_data['cleaned_review']]

# Concatenate the per-review tensors along dimension 0
input_ids = torch.cat([enc['input_ids'] for enc in encoded_reviews], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_reviews], dim=0)

# Labels are taken directly from the star_rating column.
# NOTE(review): the evaluation cell computes metrics with average='binary' —
# confirm star_rating only holds the values 0 and 1.
labels = torch.tensor(train_data['star_rating'].to_numpy())

# Hold out 10% of the examples for validation; the fixed seed keeps the split repeatable
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.1,
    random_state=42,
)

# Create DataLoaders
batch_size = 16

# Use a dedicated name for the TensorDataset: rebinding `train_data` here would
# replace the DataFrame that the cleaning step indexes by column, so this cell
# could not be re-run without first reloading the CSV.
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
# Training batches are reshuffled on every pass over the data
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches keep their original order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

In [None]:
from transformers import get_scheduler
import torch
# `transformers.AdamW` was deprecated and is no longer exported by recent
# transformers releases; PyTorch's implementation is the documented replacement.
# Importing it under the same name keeps `AdamW` available to later cells.
from torch.optim import AdamW

# Load the pre-trained RoBERTa model for sequence classification.
# No num_labels is passed, so the size of the classification head comes from
# the checkpoint's configuration defaults.
from transformers import RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained('roberta-base')


# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Define optimizer and scheduler.
# eps and weight_decay are pinned to the defaults of the former transformers
# optimizer (torch's own defaults differ), so the update rule is unchanged.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 3
# The training loop steps the scheduler once per batch, so the schedule spans
# every batch of every epoch.
num_training_steps = epochs * len(train_dataloader)
# Linear learning-rate schedule with no warm-up steps
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
# Fine-tuning loop
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step
progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(epochs):
    for batch in train_dataloader:
        # Each batch is (input ids, attention mask, labels); move all three to the model's device
        batch_input_ids, batch_masks, batch_labels = (t.to(device) for t in batch)
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_masks,
            labels=batch_labels,
        )
        loss = outputs.loss

        # Backpropagate, update the weights, advance the LR schedule, then clear gradients
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # The value reported here is the loss of the last batch of the epoch
    print(f"Epoch {epoch+1}/{epochs} completed. Loss: {loss.item()}")

# Persist the fine-tuned weights and the tokenizer into the same directory
model.save_pretrained('./fine_tuned_bert_sentiment_model')
tokenizer.save_pretrained('./fine_tuned_bert_sentiment_model')

In [None]:
# Write the model and tokenizer to a local folder
model.save_pretrained('./fine_tuned_bert_sentiment_model')
tokenizer.save_pretrained('./fine_tuned_bert_sentiment_model')

# Bundle that folder into bert_uncased_DVD.zip
import shutil
shutil.make_archive(
    base_name='bert_uncased_DVD',
    format='zip',
    root_dir='./fine_tuned_bert_sentiment_model',
)

# Hand the archive to the browser for download (relies on the google.colab package)
from google.colab import files
files.download('bert_uncased_DVD.zip')


In [None]:
# Load test data
# NOTE(review): this is the same path and row limit as the training load —
# confirm whether a separate held-out file is intended here.
test_file_path = '/content/DVD11.csv'  # Replace with your test file path
test_data = pd.read_csv(filepath_or_buffer=test_file_path, nrows=20000)

# Clean the review text and keep only the rows for which cleaning produced a string
test_data['cleaned_review'] = test_data['review_body'].map(clean_text)
test_data = test_data[test_data['cleaned_review'].notna()]

# Encode every cleaned test review once and keep the per-review encodings
encoded_reviews = [encode_review(review) for review in test_data['cleaned_review']]

# Concatenate the per-review tensors along dimension 0
input_ids = torch.cat([enc['input_ids'] for enc in encoded_reviews], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_reviews], dim=0)

# Labels are taken directly from the star_rating column
labels = torch.tensor(test_data['star_rating'].to_numpy())

# Wrap the tensors for batched iteration; from here on `test_data` refers to
# the TensorDataset rather than the DataFrame.
test_data = TensorDataset(input_ids, attention_masks, labels)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the saved RoBERTa model and tokenizer.
# The directory has to be the one the training cell wrote with save_pretrained
# ('./fine_tuned_bert_sentiment_model'); the previously used
# './fine_tuned_roberta_sentiment_model' is not created by any visible cell.
model = RobertaForSequenceClassification.from_pretrained('./fine_tuned_bert_sentiment_model')
tokenizer = RobertaTokenizer.from_pretrained('./fine_tuned_bert_sentiment_model')


# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Run the model over the test set, collecting predictions next to the true labels
model.eval()
all_preds = []
all_labels = []

for batch in test_dataloader:
    # Each batch is (input ids, attention mask, labels); move all three to the model's device
    batch_input_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

    # Inference only: no gradients are tracked for the forward pass
    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_masks)

    # Predicted class = index of the largest logit for each example
    logits = outputs.logits
    preds = logits.argmax(dim=1).cpu().numpy()
    labels = batch_labels.cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

# Calculate metrics for binary classification (accuracy is computed once; the
# earlier duplicate call was redundant).
# NOTE(review): average='binary' presumably requires exactly two classes with
# the positive class labelled 1 — confirm the labels match.
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='binary')
recall = recall_score(all_labels, all_preds, average='binary')
f1 = f1_score(all_labels, all_preds, average='binary')

# Print the metrics in one line
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")
