# BERT Model Training for Sentiment Classification
This notebook fine-tunes a BERT model to classify YouTube transcripts into sentiment categories.

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np


In [None]:
# Load the labeled transcripts from the Excel workbook and extract the text
# and label columns as plain Python lists (one entry per row).
# NOTE(review): assumes 'Label' already holds integer class ids — confirm.
df = pd.read_excel("labeled_transcripts.xlsx")
transcripts, labels = (df[column].tolist() for column in ('Transcript', 'Label'))


In [None]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 30% of the examples for validation; the fixed random_state keeps
# the split reproducible across runs.
# NOTE(review): the split is not stratified by label.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    transcripts, labels, test_size=0.3, random_state=42
)

# Encode both splits with identical settings: truncate anything longer than
# 512 tokens and pad within each call (train first, then validation).
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (train_texts, val_texts)
)


In [None]:
# PyTorch dataset class
class TranscriptDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: Per-example labels, indexable by position.

    Indexing returns a dict holding one tensor per encoding key plus a
    ``'labels'`` tensor for that example. The dataset length is
    ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for feature_name, values in self.encodings.items():
            sample[feature_name] = torch.tensor(values[idx])
        # Added last so the key order matches: features first, then labels.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = TranscriptDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TranscriptDataset(encodings=val_encodings, labels=val_labels)


In [None]:
# Load BERT model with 3 sentiment classes (positive, neutral, negative)
# Starts from the pretrained 'bert-base-uncased' checkpoint (the same one the
# tokenizer was loaded from) and sizes the classification output to 3 labels.
# NOTE(review): the mapping from label id to positive/neutral/negative is not
# defined in this notebook — confirm against the labeling scheme.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


In [None]:
# Training configuration
import inspect

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`, and passing a keyword the installed version does not know
# raises TypeError. Pick whichever name this version's TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',            # where training outputs are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=50,                   # learning-rate warmup steps
    logging_dir='./logs',
    **{_eval_strategy_key: "epoch"},   # evaluate at the end of every epoch
)


In [None]:
# Training and evaluation
# NOTE(review): no compute_metrics callback is passed, so the evaluation
# results presumably contain loss/runtime figures only — confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune first, then run one more evaluation pass and print its results.
trainer.train()
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


In [None]:
# Final evaluation: F1 Score
# Read the prediction output by field name instead of unpacking it into
# `(predictions, true_labels, _)`: assigning `_` at notebook top level shadows
# IPython's last-result variable, and named fields do not depend on tuple order.
prediction_output = trainer.predict(val_dataset)
predictions = prediction_output.predictions
true_labels = prediction_output.label_ids

# Pick the highest-scoring class per example.
# NOTE(review): assumes predictions is a 2-D (examples, classes) array — confirm.
predicted_labels = np.argmax(predictions, axis=1)

# 'weighted' averages the per-class F1 scores weighted by class support.
f1 = f1_score(true_labels, predicted_labels, average='weighted')
print("F1 Score:", f1)
