In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your dataset
df = pd.read_csv("processed_chatbot_data.csv")

# Map sentiment labels to numeric values
# The class ids follow the list order below, starting at 0.
sentiment_map = {
    name: class_id
    for class_id, name in enumerate(
        ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
    )
}
# Any sentiment string that is not a key of sentiment_map maps to NaN here
# and is removed by the dropna below.
df = df.assign(label=df['sentiment'].map(sentiment_map))

# Filter out rows with NaN in clean_text or label
df = df.dropna(subset=['clean_text', 'label'])
print("Rows after filtering:", len(df))

# Load BERT tokenizer and model
# Both are loaded from the same pretrained checkpoint name; the classification
# model is created with 5 output labels (one per sentiment class).
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=5)

# Preprocessing function
def preprocess(texts):
    """Tokenize an iterable of texts with the module-level ``tokenizer``.

    Entries for which ``pd.notnull`` is false are skipped and every remaining
    entry is converted with ``str``. Because null entries are dropped, the
    result can contain fewer rows than ``texts``.

    Args:
        texts: Iterable of raw text entries.

    Returns:
        The tokenizer output for the kept texts, requested as PyTorch tensors
        with padding, truncation and ``max_length=128``.

    Raises:
        ValueError: If no entry is left after skipping nulls.
    """
    cleaned_texts = []
    for entry in texts:
        if pd.notnull(entry):
            cleaned_texts.append(str(entry))

    if len(cleaned_texts) == 0:
        raise ValueError("No valid text entries found after cleaning.")

    return tokenizer(
        cleaned_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

# Prepare dataset
inputs = preprocess(df['clean_text'].tolist())
# Class indices must be integer (long) tensors. `Series.map` produces float64
# whenever any sentiment was unmapped (NaN), even after those rows are dropped,
# and per the Hugging Face docs float labels make BertForSequenceClassification
# select the multi-label BCE loss instead of cross-entropy.
labels = torch.tensor(df['label'].tolist(), dtype=torch.long)

# Verify lengths match
print("Inputs length:", inputs['input_ids'].shape[0])
print("Labels length:", labels.shape[0])
assert inputs['input_ids'].shape[0] == labels.shape[0], "Mismatch between inputs and labels!"

# Split into train and test sets
# The tokenizer's own attention mask is split together with the ids and labels,
# so all three stay row-aligned and the mask no longer relies on the pad token
# id being 0. The row partition depends only on the sample count, test_size and
# random_state, so it is the same as when splitting ids and labels alone.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    inputs['input_ids'], inputs['attention_mask'], labels, test_size=0.2, random_state=42
)

# Convert to torch Dataset
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-sample encodings with labels.

    Args:
        encodings: Mapping from field name to an indexable holding one entry
            per sample.
        labels: Indexable of labels; its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Bundle the token ids and attention masks of each split under the keyword
# names the model's forward pass expects, then wrap them with their labels.
train_encodings = {'input_ids': train_inputs, 'attention_mask': train_masks}
test_encodings = {'input_ids': test_inputs, 'attention_mask': test_masks}

train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

# Training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation settings
    warmup_steps=100,
    weight_decay=0.01,
    # Evaluation and checkpointing use the same "epoch" strategy, alongside
    # load_best_model_at_end=True
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer
# The held-out test split is passed as the evaluation dataset.
trainer = Trainer(
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
    model=model,
)

# Train the model
trainer.train()

# Save the model
# The model and tokenizer go to the same directory, the model first.
save_dir = "./sentiment_bert_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model training complete!")

Rows after filtering: 283240


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Inputs length: 283240
Labels length: 283240




Epoch,Training Loss,Validation Loss


## Testing

In [None]:
# Reload the fine-tuned model and tokenizer from the directory written by the
# training cell, then switch the model to evaluation mode.
saved_model_dir = "./sentiment_bert_model"
model = BertForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
model.eval()

def predict_sentiment(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenized (truncated to at most 128 tokens), run through the
    model without gradient tracking, and the highest-scoring class index is
    translated to its sentiment name.

    Args:
        text: The text to classify. ``.item()`` is called on the argmax result,
            so it must hold exactly one prediction.

    Returns:
        One of "Extremely Negative", "Negative", "Neutral", "Positive",
        "Extremely Positive".
    """
    id_to_sentiment = {
        0: "Extremely Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Extremely Positive",
    }
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = model(**encoded).logits
    class_id = logits.argmax(dim=1).item()
    return id_to_sentiment[class_id]

# Smoke test: print the predicted sentiment for a few short sample texts.
test_texts = [
    "game hurt",
    "man love reddit",
    "right dont care fuck em",
]
for sample_text in test_texts:
    print(f"Input: {sample_text}")
    print(f"Predicted Sentiment: {predict_sentiment(sample_text)}")