In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sklearn

In [3]:
!SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install sklearn



In [4]:
# Step 1: Install required libraries
!pip install transformers datasets accelerate pandas sklearn torch



In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # Updated here
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import f1_score, classification_report
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
# Step 3: Load and prepare dataset
# Reads the merged CSV from the current working directory. Later cells rely on
# its "sentiment" column (label mapping, stratification) and "text" column
# (tokenizer input).
df = pd.read_csv("merged_dataset.csv")

In [7]:
# Map sentiments to numerical labels
sentiment_map = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["sentiment"].map(sentiment_map)

# Series.map yields NaN for any value that is not a key of the mapping, which
# would silently turn the label column into floats. Fail fast with the
# offending values instead of letting that surface later during training.
unmapped_sentiments = df.loc[df["label"].isna(), "sentiment"].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(
        f"Unmapped sentiment values in 'sentiment' column: {unmapped_sentiments.tolist()}"
    )
df["label"] = df["label"].astype("int64")

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from datasets import Dataset


In [9]:
# Hold out 20% of the rows for evaluation. Stratifying on the sentiment column
# keeps the class proportions equal in both parts, and the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(
    df, stratify=df["sentiment"], test_size=0.2, random_state=42
)

In [10]:
# Convert to Hugging Face Dataset format
# Only the model input ("text") and target ("label") columns are kept.
# preserve_index=False: after the split the frames carry their original,
# shuffled row index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column in every example.
train_dataset = Dataset.from_pandas(train_df[["text", "label"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["text", "label"]], preserve_index=False)

In [11]:
# Step 4: Initialize BioBERT tokenizer
# model_name is reused when the classification model is loaded, so the
# tokenizer and the model come from the same checkpoint.
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def tokenize_function(examples):
    """Tokenize a batch of examples with the notebook's global ``tokenizer``.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output, with every text padded or truncated to exactly
        128 tokens.
    """
    # return_tensors is intentionally not set: Dataset.map stores the result
    # in Arrow, so building torch tensors here is wasted work. The conversion
    # to tensors is done afterwards by set_format("torch").
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize both splits in batches, training split first, then test split.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/805 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [13]:
# Step 6: Format datasets for PyTorch
# The model's forward pass takes the target under the name "labels". Rename
# only while the original "label" column is still present, so that re-running
# this cell does not raise on the already-renamed datasets.
if "label" in tokenized_train.column_names:
    tokenized_train = tokenized_train.rename_column("label", "labels")
if "label" in tokenized_test.column_names:
    tokenized_test = tokenized_test.rename_column("label", "labels")

# Have the datasets hand out torch tensors when indexed by the DataLoaders.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

In [14]:
# Step 7: Create DataLoaders
# Both loaders yield batches of 16 examples. Only the training loader shuffles;
# the evaluation loader keeps the dataset order.
train_loader = DataLoader(tokenized_train, shuffle=True, batch_size=16)
test_loader = DataLoader(tokenized_test, batch_size=16)

In [15]:
# Load the BioBERT checkpoint with a three-way classification head. The two
# label dictionaries are stored on the model config so predictions can be
# reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id={"negative": 0, "neutral": 1, "positive": 2},
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    num_labels=3,  # one output per sentiment class
    ignore_mismatched_sizes=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Step 9: Set up training parameters
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# The scheduler is stepped once per batch by train(), so its horizon is the
# number of batches per epoch times the number of epochs.
epochs = 20
total_steps = epochs * len(train_loader)

optimizer = AdamW(model.parameters(), weight_decay=0.01, lr=2e-5)
# No warmup phase is configured (num_warmup_steps=0).
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_training_steps=total_steps, num_warmup_steps=0
)

In [25]:
# Step 10: Training loop
def train(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` keywords and returns an object
            with a ``loss`` attribute.
        data_loader: Iterable of dict batches that also supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the selected batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    wanted_keys = ("input_ids", "attention_mask", "labels")
    batch_losses = []

    model.train()
    for batch in data_loader:
        optimizer.zero_grad()

        # Only the fields the model consumes are moved to the device; any other
        # entry in the batch is ignored untouched.
        model_inputs = {}
        for key, value in batch.items():
            if key in wanted_keys:
                model_inputs[key] = value.to(device)

        loss = model(**model_inputs).loss
        batch_losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(data_loader)

In [26]:
# Step 11: Evaluation function
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Module whose forward call accepts ``input_ids`` and
            ``attention_mask`` keywords and returns an object with ``logits``.
        data_loader: Iterable of dict batches containing ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(report, macro_f1)`` tuple: the text produced by
        ``classification_report`` and the macro-averaged F1 score.
    """
    class_names = ["negative", "neutral", "positive"]
    # Explicit label ids keep the report aligned with class_names even when a
    # class is missing from the evaluated data. Without them,
    # classification_report raises because the number of observed classes no
    # longer matches the number of target names.
    class_ids = list(range(len(class_names)))

    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items()
                      if k in ['input_ids', 'attention_mask']}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            # Predicted class is the index of the largest logit per example.
            preds = torch.max(outputs.logits, dim=1).indices

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    report = classification_report(
        true_labels, predictions, labels=class_ids, target_names=class_names
    )
    macro_f1 = f1_score(true_labels, predictions, labels=class_ids, average="macro")
    return report, macro_f1

In [27]:
# Step 12: Train and evaluate
# NOTE(review): the metric printed as "Validation F1" is computed on
# test_loader, i.e. the held-out test split; the visible cells create no
# separate validation split.
for epoch in range(epochs):
    # One full pass over the training batches; returns the mean batch loss.
    train_loss = train(model, train_loader, optimizer, scheduler, device)
    # Text classification report plus macro-averaged F1 for the test batches.
    report, f1 = evaluate(model, test_loader, device)
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation F1 (Macro): {f1:.4f}")
    print("Classification Report:")
    print(report)
    # Visual separator between epochs.
    print("\n" + "="*50 + "\n")

Epoch 1/20
Train Loss: 0.0403
Validation F1 (Macro): 0.9802
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.97      0.98        61
     neutral       1.00      0.97      0.99        74
    positive       0.94      1.00      0.97        67

    accuracy                           0.98       202
   macro avg       0.98      0.98      0.98       202
weighted avg       0.98      0.98      0.98       202



Epoch 2/20
Train Loss: 0.0313
Validation F1 (Macro): 0.9800
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.95      0.97        61
     neutral       0.97      0.99      0.98        74
    positive       0.97      1.00      0.99        67

    accuracy                           0.98       202
   macro avg       0.98      0.98      0.98       202
weighted avg       0.98      0.98      0.98       202



Epoch 3/20
Train Loss: 0.0141
Validation F1 (Macro): 0.9807
Classifi

In [28]:
# Step 14: Inference function
def predict_sentiment(text, model, tokenizer, device):
    """Classify a single text and report the class probabilities.

    Args:
        text: Input passed to ``tokenizer``.
        model: Classifier returning ``logits``; its ``config.id2label`` maps
            the winning class index to a label name.
        tokenizer: Callable producing model inputs that support ``.to(device)``.
        device: Device the encoded inputs are moved to.

    Returns:
        A dict with the predicted label (``"prediction"``), its probability
        (``"confidence"``) and the probabilities of the first three classes
        keyed as negative / neutral / positive (``"probabilities"``).
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only: switch to eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    distribution = torch.softmax(logits, dim=-1)
    winner = distribution.argmax(dim=1).item()
    first_row = distribution[0]

    per_class = {
        name: first_row[index].item()
        for index, name in enumerate(("negative", "neutral", "positive"))
    }
    return {
        "prediction": model.config.id2label[winner],
        "confidence": first_row[winner].item(),
        "probabilities": per_class,
    }

In [81]:
# Example usage:
# Classify one sentence with the model currently in memory and print the
# predicted label together with its softmax probability.
sample_text = "this is terrible"
result = predict_sentiment(sample_text, model, tokenizer, device)
print(f"Text: {sample_text}")
print(f"Predicted sentiment: {result['prediction']}")
print(f"Confidence: {result['confidence']:.4f}")

Text: this is terrible
Predicted sentiment: negative
Confidence: 0.9999
