In [None]:
import pandas as pd

# Read the labelled review dataset from its CSV file
data_path = "../data/clean/merged-labeled/merged_labeled_dataset_61.csv"
merged_data = pd.read_csv(data_path)

# Preview the first rows and report how many rows were loaded
print(merged_data.head(n=5))
print(f"Number of samples: {len(merged_data)}")


In [None]:
# Map sentiment strings to the integer class ids used by the classifier
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
merged_data['sentiment_encoded'] = merged_data['sentiment'].map(label_mapping)

# Series.map leaves NaN for every value that is not a key of label_mapping
# (typos, different casing, empty cells). NaN turns the column into floats,
# which later produces a float label tensor and an opaque loss error, so stop
# here with a message that names the offending values instead.
unmapped_rows = merged_data['sentiment_encoded'].isna()
if unmapped_rows.any():
    unexpected_values = merged_data.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a sentiment outside "
        f"{list(label_mapping)}: {unexpected_values}"
    )

# No NaN remains, so the class ids can be stored as integers
merged_data['sentiment_encoded'] = merged_data['sentiment_encoded'].astype("int64")

# Check the mapping
print(merged_data.head())


In [None]:
from transformers import BertTokenizer
import torch

# Fetch the pre-trained tokenizer that matches the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every review: truncate to at most 128 tokens, pad the batch to a
# common length and return PyTorch tensors
encodings = tokenizer(
    merged_data['reviews'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Build the label tensor from the encoded sentiment column
labels = torch.tensor(merged_data['sentiment_encoded'].to_numpy())

print("Tokenization completed!")
print(f"Input IDs shape: {encodings['input_ids'].shape}")
print(f"Labels shape: {labels.shape}")


In [None]:
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset
class SentimentDataset(Dataset):
    """Pairs tokenizer output with labels so both can be indexed together.

    `encodings` must provide `.items()` yielding (name, indexable) pairs and
    `labels` must be an indexable sequence; sample `i` is read from position
    `i` of every entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence determines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the same position out of every encoded field.
        features = {}
        for name, values in self.encodings.items():
            features[name] = values[idx]
        return features, self.labels[idx]

# Wrap the encodings/labels in a Dataset and serve them in shuffled batches of 4
dataset = SentimentDataset(encodings=encodings, labels=labels)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)

print("DataLoader ready!")


In [None]:
from transformers import BertForSequenceClassification

# Start from the pre-trained bert-base-uncased checkpoint with a 3-way
# classification head (one output per sentiment class)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over all model parameters
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Linear schedule with 10 warm-up steps, spanning every optimizer step of the
# 2 training epochs (one step per batch)
num_training_steps = 2 * len(train_loader)
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=num_training_steps,
)

print("Optimizer and scheduler set up!")


In [None]:
import torch

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
for epoch in range(2):  # Train for 2 epochs
    print(f"Starting epoch {epoch + 1}...")
    # The per-batch tensors get their own names so the notebook-level `labels`
    # tensor (used to build `dataset`) is not overwritten by the last batch;
    # otherwise re-running the dataset cell after training would silently
    # build a dataset containing a single batch of labels.
    for batch_idx, (batch_inputs, batch_labels) in enumerate(train_loader):
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass; passing labels makes the model return the loss itself
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss

        # Print loss for each batch
        print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {loss.item()}")

        # Backward pass: clear old gradients, backpropagate, then advance the
        # optimizer and the learning-rate schedule by one step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch + 1} completed!")

print("Training completed!")


In [None]:
# Persist the fine-tuned weights and the tokenizer files; each goes to its own
# directory under ../models
model.save_pretrained(save_directory="../models/sentiment_bert_model")
tokenizer.save_pretrained(save_directory="../models/sentiment_bert_model_tokeniser")

print("Model saved!")


In [None]:
from sklearn.metrics import classification_report

# Put model in evaluation mode
model.eval()

# Class names in class-id order (0, 1, 2)
class_names = ["Negative", "Neutral", "Positive"]

# Collect predictions and true labels as plain Python ints
predictions, true_labels = [], []

# NOTE: this iterates train_loader, so the report below measures fit on the
# training data, not generalisation. Replace with test_loader for test data.
with torch.no_grad():
    # Dedicated batch names keep the notebook-level `labels` tensor intact.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}

        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        predictions.extend(preds.cpu().tolist())
        # The labels are only compared on the host, so they never need to be
        # moved to the device.
        true_labels.extend(batch_labels.tolist())

# Print classification report
# `labels=` pins the three class ids explicitly: without it, a class that is
# absent from both the true labels and the predictions makes
# classification_report raise because target_names no longer matches.
print("Classification Report:")
print(
    classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Set device (CPU or GPU): use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model from the directory written by the earlier
# model.save_pretrained(...) call, then move it to the selected device
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="../models/sentiment_bert_model",  # Path to the directory containing your model files
    config="../models/sentiment_bert_model/config.json"  # Path to config.json
)
model.to(device)

# Load the tokenizer from the directory written by the earlier
# tokenizer.save_pretrained(...) call
# NOTE(review): config, vocab_file and special_tokens_map_file all point inside
# that same directory, so they are presumably redundant with
# pretrained_model_name_or_path -- confirm against the installed transformers
# version before removing them.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="../models/sentiment_bert_model_tokeniser",  # Path to the directory containing tokenizer files
    config="../models/sentiment_bert_model_tokeniser/tokenizer_config.json",
    vocab_file="../models/sentiment_bert_model_tokeniser/vocab.txt",
    special_tokens_map_file="../models/sentiment_bert_model_tokeniser/special_tokens_map.json"
)


In [None]:
from torch.utils.data import Dataset, DataLoader

# Encode the test reviews with the same settings used for the training data
# (truncate to 128 tokens, pad to a common length, PyTorch tensors).
# NOTE(review): `test_reviews` is not defined in any cell visible in this
# notebook -- it presumably comes from a removed cell; confirm where the test
# split is created before running this from a fresh kernel.
encodings = tokenizer(
    test_reviews,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Create a Dataset
class SentimentDataset(Dataset):
    """Pairs tokenizer output with labels so both can be indexed together.

    `encodings` must provide `.items()` yielding (name, indexable) pairs and
    `labels` must be an indexable sequence; sample `i` is read from position
    `i` of every entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence determines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the same position out of every encoded field.
        features = {}
        for name, values in self.encodings.items():
            features[name] = values[idx]
        return features, self.labels[idx]

# Serve the test set in its original order, 4 samples per batch.
# NOTE(review): `test_labels` is not defined in any cell visible in this
# notebook -- confirm where it is created.
test_dataset = SentimentDataset(encodings=encodings, labels=torch.tensor(test_labels))
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=4)


In [None]:
from sklearn.metrics import classification_report

# Put model in evaluation mode
model.eval()

# Class names in class-id order (0, 1, 2)
class_names = ["Negative", "Neutral", "Positive"]

# Collect predictions and true labels as plain Python ints
predictions, true_labels = [], []

with torch.no_grad():
    # Dedicated batch names keep the notebook-level `labels` tensor intact.
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}

        # Forward pass
        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        # Store predictions and true labels; the labels are only compared on
        # the host, so they never need to be moved to the device.
        predictions.extend(preds.cpu().tolist())
        true_labels.extend(batch_labels.tolist())

# Print classification report
# `labels=` pins the three class ids explicitly: without it, a class that is
# absent from both the true labels and the predictions makes
# classification_report raise because target_names no longer matches.
print("Classification Report:")
print(
    classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)


In [None]:
# Dump the collected ground-truth labels and model predictions for inspection
print(f"True Labels: {true_labels}")
print(f"Predictions: {predictions}")


In [None]:
import numpy as np
# Count how often each class id occurs among the true labels
unique, counts = np.unique(true_labels, return_counts=True)
print(
    "Label Distribution in True Labels:",
    {label: count for label, count in zip(unique, counts)},
)


In [None]:
# Show the raw logits left over from the most recently evaluated batch
print("Logits for the last batch:", logits, sep=" ")

### **New Train: Retraining on an Oversampled (Class-Balanced) Dataset**

In [None]:
# AdamW is imported from torch.optim, as in the first training section: the
# transformers re-export is deprecated and is no longer available in recent
# transformers releases, where importing it raises ImportError.
import numpy as np
import pandas as pd
import torch
from sklearn.utils import resample
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, get_scheduler

In [None]:
# Step 1: Load and Preprocess the Dataset
# The CSV is read again from disk; the 'sentiment_encoded' column is added in
# the next cell.
print("Loading dataset...")
data_path = "../data/clean/merged-labeled/merged_labeled_dataset_61.csv"
merged_data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Map sentiment strings to the integer class ids used by the classifier
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
merged_data['sentiment_encoded'] = merged_data['sentiment'].map(label_mapping)

# Series.map leaves NaN for every value that is not a key of label_mapping
# (typos, different casing, empty cells). NaN rows would be dropped silently
# by the class filters below and turn the label tensor into floats, so stop
# here with a message that names the offending values instead.
unmapped_rows = merged_data['sentiment_encoded'].isna()
if unmapped_rows.any():
    unexpected_values = merged_data.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a sentiment outside "
        f"{list(label_mapping)}: {unexpected_values}"
    )

# No NaN remains, so the class ids can be stored as integers
merged_data['sentiment_encoded'] = merged_data['sentiment_encoded'].astype("int64")

# Check the mapping
print(merged_data.head())


In [None]:
# Row count per class id before any rebalancing
print(
    "Original Class Distribution:",
    merged_data['sentiment_encoded'].value_counts(),
    sep="\n",
)

In [None]:
# One sub-frame per class id (2 = Positive, 0 = Negative, 1 = Neutral)
positive_class = merged_data.loc[merged_data['sentiment_encoded'].eq(2)]
negative_class = merged_data.loc[merged_data['sentiment_encoded'].eq(0)]
neutral_class = merged_data.loc[merged_data['sentiment_encoded'].eq(1)]

In [None]:
# Draw Negative rows with replacement until there are as many as Positive rows.
# NOTE(review): this treats Positive as the largest class -- confirm against
# the class distribution printed above.
oversampled_negative = resample(
    negative_class,
    n_samples=len(positive_class),
    replace=True,
    random_state=42,
)

In [None]:
# Draw Neutral rows with replacement until there are as many as Positive rows.
# NOTE(review): this treats Positive as the largest class -- confirm against
# the class distribution printed above.
oversampled_neutral = resample(
    neutral_class,
    n_samples=len(positive_class),
    replace=True,
    random_state=42,
)

In [None]:
# Stack the untouched Positive rows with the two resampled classes
balanced_data = pd.concat(
    objs=[positive_class, oversampled_negative, oversampled_neutral],
    axis=0,
)

In [None]:
# Row count per class id after rebalancing
print(
    "Balanced Class Distribution:",
    balanced_data['sentiment_encoded'].value_counts(),
    sep="\n",
)

In [None]:
# Encode the balanced reviews (truncate to 128 tokens, pad to a common length,
# PyTorch tensors) and build the matching label tensor
encodings = tokenizer(
    balanced_data['reviews'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
labels = torch.tensor(balanced_data['sentiment_encoded'].to_numpy())


In [None]:
from torch.utils.data import Dataset, DataLoader

# Define Dataset
class SentimentDataset(Dataset):
    """Pairs tokenizer output with labels so both can be indexed together.

    `encodings` must provide `.items()` yielding (name, indexable) pairs and
    `labels` must be an indexable sequence; sample `i` is read from position
    `i` of every entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence determines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the same position out of every encoded field.
        features = {}
        for name, values in self.encodings.items():
            features[name] = values[idx]
        return features, self.labels[idx]

# Wrap the balanced encodings/labels and serve them in shuffled batches of 4
dataset = SentimentDataset(encodings=encodings, labels=labels)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)


In [None]:
from transformers import BertForSequenceClassification
from torch.nn import CrossEntropyLoss

# Start again from the pre-trained checkpoint with a fresh 3-way head and
# place it on the device selected earlier in the notebook
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)
model.to(device)

# Cross-entropy over the three class logits
loss_fn = CrossEntropyLoss()


In [None]:
# AdamW is taken from torch.optim, the same optimizer class the first training
# run uses: the transformers re-export is deprecated and is no longer
# available in recent transformers releases, where importing it raises
# ImportError. Note that torch's defaults (weight_decay=0.01, eps=1e-8) differ
# from the old transformers implementation (weight_decay=0.0, eps=1e-6).
from torch.optim import AdamW
from transformers import get_scheduler

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps)

# Training loop
model.train()
for epoch in range(3):
    total_loss = 0
    # The per-batch tensors get their own names so the notebook-level `labels`
    # tensor (used to build `dataset`) is not overwritten by the last batch;
    # otherwise re-running the dataset cell after training would silently
    # build a dataset containing a single batch of labels.
    for batch_idx, (batch_inputs, batch_labels) in enumerate(train_loader):
        batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass; the loss is computed explicitly with loss_fn rather
        # than by the model
        outputs = model(**batch_inputs)
        logits = outputs.logits
        loss = loss_fn(logits, batch_labels)

        # Backward pass: clear old gradients, backpropagate, then advance the
        # optimizer and the learning-rate schedule by one step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        # Report every 10th batch to keep the cell output readable
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {loss.item()}")

    print(f"Epoch {epoch + 1} completed. Average Loss: {total_loss / len(train_loader):.4f}")

# Write the retrained weights and the tokenizer files into one directory,
# "sentiment_bert_model", relative to the current working directory
model.save_pretrained(save_directory="sentiment_bert_model")
tokenizer.save_pretrained(save_directory="sentiment_bert_model")
