In [None]:
import pandas as pd

# Read the merged sentiment dataset from its CSV file
data_path = "../data/clean/merged-labeled/merged_labeled_dataset_61.csv"
merged_data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first rows and report how many rows were loaded
print(merged_data.head())
print("Number of samples:", merged_data.shape[0])


In [None]:
# Map sentiments to numerical labels
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
sentiment_encoded = merged_data['sentiment'].map(label_mapping)

# Series.map yields NaN for every value that is not a key of label_mapping
# (missing values, different casing, stray whitespace). NaN would silently turn
# the column into floats and corrupt the label tensor built from it, so stop
# here and report the offending values instead.
unmapped_mask = sentiment_encoded.isna()
if unmapped_mask.any():
    unmapped_values = merged_data.loc[unmapped_mask, 'sentiment'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have a sentiment outside "
        f"{list(label_mapping)}: {unmapped_values}"
    )

# Store as integers so the column holds plain class indices
merged_data['sentiment_encoded'] = sentiment_encoded.astype("int64")

# Check the mapping
print(merged_data.head())


In [None]:
from transformers import BertTokenizer
import torch

# Load the tokenizer that matches the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every review with one tokenizer call
review_texts = list(merged_data['reviews'])
tokenizer_options = {
    "truncation": True,
    "padding": True,
    "max_length": 128,  # Limit the sequence length
    "return_tensors": "pt",  # Return PyTorch tensors
}
encodings = tokenizer(review_texts, **tokenizer_options)

# Wrap the encoded sentiment column in a tensor
labels = torch.tensor(merged_data['sentiment_encoded'].values)

print("Tokenization completed!")
print("Input IDs shape:", encodings['input_ids'].shape)
print("Labels shape:", labels.shape)


In [None]:
from torch.utils.data import Dataset, DataLoader

# Dataset pairing tokenizer output with one label per sample
class SentimentDataset(Dataset):
    """Index-aligned view over a mapping of encoded features and their labels.

    Args:
        encodings: Mapping from feature name to an indexable container; must
            provide ``.items()``.
        labels: Indexable, sized container holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for position ``idx``.

        ``features`` maps every key of ``encodings`` to that entry's ``idx``-th
        element.
        """
        features = {}
        for name, values in self.encodings.items():
            features[name] = values[idx]
        return features, self.labels[idx]

# Wrap the encodings and labels, then serve them in shuffled batches of 4
dataset = SentimentDataset(encodings=encodings, labels=labels)
train_loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

print("DataLoader ready!")


In [None]:
from transformers import BertForSequenceClassification

# Load the pre-trained "bert-base-uncased" checkpoint for sequence
# classification with 3 output labels (the 3 sentiment classes)
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=3,
)


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over all model parameters
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Linear schedule covering every optimizer step of the run
num_training_steps = 2 * len(train_loader)  # 2 epochs
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=num_training_steps,
)

print("Optimizer and scheduler set up!")


In [None]:
import torch

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: 2 passes over train_loader, one optimizer and scheduler step
# per batch. Batch tensors get their own names (batch_inputs / batch_labels) so
# the loop does not rebind the notebook-level `labels` tensor; re-running the
# cell that builds SentimentDataset(encodings, labels) after training would
# otherwise wrap only the last batch of labels.
model.train()
for epoch in range(2):  # Train for 2 epochs
    print(f"Starting epoch {epoch + 1}...")
    for batch_idx, (batch_inputs, batch_labels) in enumerate(train_loader):
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass; the loss is read from the model output because labels are passed in
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        # Print loss for each batch
        print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {loss.item()}")

        # Backward pass: clear old gradients, backpropagate, then step the
        # optimizer followed by the learning-rate scheduler
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch + 1} completed!")

print("Training completed!")


In [None]:
# Write the model and the tokenizer to their own directories
model.save_pretrained(save_directory="../models/sentiment_bert_model")
tokenizer.save_pretrained(save_directory="../models/sentiment_bert_model_tokeniser")

print("Model saved!")


In [None]:
from sklearn.metrics import classification_report

# Put model in evaluation mode
model.eval()

# Collect predictions and true labels
predictions, true_labels = [], []

# NOTE: this loop scores the model on train_loader, i.e. the same batches it
# was trained on.
with torch.no_grad():
    # Batch tensors use their own names so the loop does not rebind the
    # notebook-level `labels` tensor.
    for batch_inputs, batch_labels in train_loader:  # Replace with test_loader for test data
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}

        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Print classification report.
# `labels` pins the three class indices so the three target_names always line
# up; without it the call raises when a class is missing from both the true
# labels and the predictions. zero_division=0 reports 0 for undefined metrics.
print("Classification Report:")
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
    zero_division=0,
))


In [5]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model. from_pretrained reads config.json and the weights from the
# directory, so the config does not need to be passed separately.
model = BertForSequenceClassification.from_pretrained("../models/sentiment_bert_model")
model.to(device)

# Load tokenizer. The directory written by tokenizer.save_pretrained is all
# from_pretrained needs: it resolves the vocabulary, special-tokens map and
# tokenizer config from there. `config` is not a BertTokenizer argument, so it
# is no longer passed.
tokenizer = BertTokenizer.from_pretrained("../models/sentiment_bert_model_tokeniser")


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from transformers import AdamW, get_scheduler
from sklearn.metrics import classification_report
import torch
import numpy as np

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Load the Model and Tokenizer
print("Loading model and tokenizer...")
# from_pretrained already restores config.json and the saved weights from the
# directory. The former extra model.load_state_dict(torch.load("model.safetensors"))
# call was removed: torch.load cannot read the safetensors format, and the bare
# file name was resolved against the working directory rather than the model directory.
model = BertForSequenceClassification.from_pretrained("../models/sentiment_bert_model")
model.to(device)

# The tokenizer directory alone is sufficient; from_pretrained resolves the
# vocabulary, special-tokens map and tokenizer config from it. `config` is not
# a BertTokenizer argument, so it is no longer passed.
tokenizer = BertTokenizer.from_pretrained("../models/sentiment_bert_model_tokeniser")

# Step 2: Prepare Dataset
print("Preparing dataset...")
class SentimentDataset(Dataset):
    """Pairs a mapping of encoded features with one label per sample.

    Args:
        encodings: Mapping from feature name to an indexable container; must
            provide ``.items()``.
        labels: Indexable, sized container of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return how many labels (and therefore samples) are held."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th slice of every feature plus the ``idx``-th label."""
        sample = dict((name, values[idx]) for name, values in self.encodings.items())
        target = self.labels[idx]
        return sample, target

# Test Data (Update as needed): each review is paired with its expected class,
# Positive=2, Neutral=1, Negative=0
labeled_examples = [
    ("I love this product!", 2),
    ("This is the worst thing ever.", 0),
    ("It's okay, not great.", 1),
    ("Amazing quality, very satisfied!", 2),
    ("Terrible, I hate it.", 0),
]
test_reviews = [review for review, _sentiment in labeled_examples]
test_labels = [sentiment for _review, sentiment in labeled_examples]

# Tokenize the reviews and wrap the labels in a tensor
encodings = tokenizer(
    test_reviews,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
labels = torch.tensor(test_labels)

# Dataset and DataLoader (batches of 4, original order kept)
test_dataset = SentimentDataset(encodings=encodings, labels=labels)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)

# Step 3: Adjust Class Weights for Imbalanced Data
print("Calculating class weights...")
# Count every class index the model outputs, not only the ones present in
# test_labels: CrossEntropyLoss needs one weight per logit column, and a count
# over the observed labels alone is too short as soon as a class is missing.
num_classes = model.config.num_labels
counts = np.bincount(test_labels, minlength=num_classes)
# Inverse frequency; a class with no samples gets weight 0 instead of inf
inverse_frequency = np.divide(
    1.0,
    counts,
    out=np.zeros(len(counts), dtype=np.float64),
    where=counts > 0,
)
class_weights = torch.tensor(inverse_frequency, dtype=torch.float32)
class_weights = class_weights.to(device)
print("Class weights:", class_weights)

# Use custom CrossEntropyLoss
loss_fn = CrossEntropyLoss(weight=class_weights)

# Step 4: Evaluate the Model
print("Evaluating the model...")
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    # Batch tensors use their own names so the loop does not rebind the
    # notebook-level `labels` tensor that test_dataset was built from.
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass
        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        # Calculate loss (for monitoring purposes)
        loss = loss_fn(logits, batch_labels)
        print(f"Batch Loss: {loss.item()}")

        # Store predictions and true labels
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Step 5: Generate Classification Report
# `labels` pins the three class indices so the three target_names always line
# up, even when a class is missing from both the true labels and the predictions.
print("Classification Report:")
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
    zero_division=0,
))


In [7]:
from torch.utils.data import Dataset, DataLoader

# Encode the test reviews as padded PyTorch tensors, truncated to at most 128 tokens
encodings = tokenizer(
    test_reviews, max_length=128, truncation=True, padding=True, return_tensors="pt"
)

# Dataset that serves (features, label) pairs by position
class SentimentDataset(Dataset):
    """Position-indexed pairs of encoded features and labels.

    Args:
        encodings: Mapping from feature name to an indexable container; must
            provide ``.items()``.
        labels: Indexable, sized container with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels held."""
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        """Return a dict with the ``idx``-th entry of each feature, and the ``idx``-th label."""
        item = {}
        for feature_name, feature_values in self.encodings.items():
            item[feature_name] = feature_values[idx]
        return item, self.labels[idx]

# Rebuild the test dataset from the fresh encodings and serve it in order, 4 samples per batch
test_label_tensor = torch.tensor(test_labels)
test_dataset = SentimentDataset(encodings=encodings, labels=test_label_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)


In [8]:
from sklearn.metrics import classification_report

# Put model in evaluation mode
model.eval()

# Collect predictions and true labels
predictions, true_labels = [], []

with torch.no_grad():
    # Batch tensors use their own names so the loop does not overwrite the
    # notebook-level `labels` name with the last batch.
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}

        # Forward pass
        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        # Store predictions and true labels
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Print classification report.
# `labels` pins the three class indices so the three target_names always line
# up; without it the call raises when a class is missing from both the true
# labels and the predictions. zero_division=0 reports 0 for undefined metrics.
print("Classification Report:")
print(classification_report(
    true_labels,
    predictions,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
    zero_division=0,
))


Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      1.00      0.67         1
     Neutral       0.00      0.00      0.00         1
    Positive       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Show the collected ground-truth labels next to the model's predictions
print(f"True Labels: {true_labels}")
print(f"Predictions: {predictions}")


True Labels: [2, 0, 1]
Predictions: [2, 0, 0]


In [10]:
import numpy as np
# Count how often each class index occurs among the collected true labels
unique, counts = np.unique(true_labels, return_counts=True)
label_distribution = dict(zip(unique, counts))
print("Label Distribution in True Labels:", label_distribution)


Label Distribution in True Labels: {0: 1, 1: 1, 2: 1}


In [11]:
# `logits` still holds the raw model outputs of the most recently evaluated batch
print(f"Logits for the last batch: {logits}")

Logits for the last batch: tensor([[-2.3976, -2.1570,  4.1787],
        [ 2.6082, -1.3848, -1.3932],
        [ 0.8787, -1.3405,  0.6044]])
