In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

2023-10-11 19:01:21.761257: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define a custom sentiment classifier model
class SentimentClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper around ``BertForSequenceClassification``.

    Loads the ``bert-base-uncased`` checkpoint with ``num_labels`` set to
    ``num_classes`` and exposes only the logits from ``forward``.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``.logits`` attribute."""
        return self.bert(input_ids, attention_mask=attention_mask).logits

In [3]:
class SentimentDataset(Dataset):
    """Text-classification dataset backed by an Excel sheet.

    The sheet is read with ``pandas.read_excel``. It must contain a ``Label``
    column; the text of each example is taken from the second column
    (positional index 1).

    Each item is a dict with:
        ``input_ids``      -- token ids, padded/truncated to ``max_length``
        ``attention_mask`` -- mask returned by the tokenizer for ``input_ids``
        ``label``          -- one-hot ``float32`` vector of length ``num_classes``

    Args:
        csv_file: Path (or file-like object) handed to ``pandas.read_excel``.
            The parameter name is kept for compatibility; the file is read as
            an Excel workbook, not as CSV.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_length: Fixed length every encoded sequence is padded or
            truncated to.
        label_encoder: Optional, already fitted ``LabelEncoder``. Supply the
            encoder of another split (available as ``.label_encoder``) so that
            both splits map every label to the same class index. When omitted,
            a new encoder is fitted on this file's ``Label`` column.

    Raises:
        ValueError: from ``LabelEncoder.transform`` when a supplied
            ``label_encoder`` meets a label it was not fitted on.
    """

    def __init__(self, csv_file, tokenizer, max_length=150, label_encoder=None):
        self.data = pd.read_excel(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

        if label_encoder is None:
            label_encoder = LabelEncoder()
            label_encoder.fit(self.data['Label'])
        self.label_encoder = label_encoder

        self.labels = self.data['Label']
        self.labels_encoded = self.label_encoder.transform(self.labels)
        # Size the one-hot vectors from the encoder, not from this file, so a
        # shared encoder yields the same width for every split.
        self.num_classes = len(self.label_encoder.classes_)
        self.labels_onehot = np.eye(self.num_classes)[self.labels_encoded]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.iloc[idx, 1])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            # Without truncation, texts longer than max_length produce longer
            # tensors and the DataLoader can no longer stack a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # squeeze(0) drops only the leading batch dimension added by
            # return_tensors="pt".
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels_onehot[idx], dtype=torch.float32),
        }


In [4]:
# Paths of the Excel workbooks used for the training and validation datasets
# (the variable names say "csv", but both files are .xlsx).
train_csv_file, val_csv_file = (
    "ClassificationDataset-train0.xlsx",
    "ClassificationDataset-valid0.xlsx",
)

In [5]:
# One tokenizer shared by both splits, then one dataset per Excel file.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = SentimentDataset(train_csv_file, tokenizer)
val_dataset = SentimentDataset(val_csv_file, tokenizer)

# Each dataset derives its label -> index mapping from its own file, so the two
# mappings only agree when both files contain exactly the same set of labels.
# Fail loudly here instead of silently scoring against shifted class indices.
if not np.array_equal(np.unique(train_dataset.labels), np.unique(val_dataset.labels)):
    raise ValueError(
        "Training and validation files do not contain the same set of labels; "
        "their encoded class indices would not be comparable."
    )

In [6]:
# Wrap both datasets in DataLoaders: training batches are reshuffled every
# epoch, validation batches are served in dataset order.
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [7]:
# Number of distinct labels found in the training file; sizes the classifier head.
num_classes = train_dataset.num_classes

In [8]:
# Display the integer-encoded training labels as a quick sanity check.
train_dataset.labels_encoded

array([1, 2, 1, ..., 1, 1, 1])

In [9]:
# Build the classifier, its optimizer and the loss.
model = SentimentClassifier(num_classes)
# The AdamW shipped with transformers is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are spelled out to keep the values
# the transformers implementation used by default (1e-6 and 0.0); torch's own
# defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training loop: fine-tune for a fixed number of epochs and run a full
# validation pass after each one.
num_epochs = 3

# Run every batch on whatever device the model's weights already live on, so
# the loop keeps working if the model is moved to an accelerator.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids, attention_mask)
        # `labels` are one-hot float vectors, which CrossEntropyLoss accepts
        # as class-probability targets.
        loss = criterion(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"]

            logits = model(input_ids, attention_mask)
            predicted_labels = logits.argmax(dim=1)
            val_preds.extend(predicted_labels.cpu().numpy())
            # Recover class indices from the one-hot rows with the tensor's
            # own argmax instead of handing a torch.Tensor to NumPy.
            val_true_labels.extend(labels.argmax(dim=1).cpu().numpy())

    val_accuracy = accuracy_score(val_true_labels, val_preds)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/3: 100%|████████████████████████████████████████████████████████████████████████| 88/88 [05:42<00:00,  3.89s/it]


Epoch 1/3, Loss: 0.5851, Validation Accuracy: 0.8346


Epoch 2/3: 100%|████████████████████████████████████████████████████████████████████████| 88/88 [04:08<00:00,  2.82s/it]


Epoch 2/3, Loss: 0.3404, Validation Accuracy: 0.8238


Epoch 3/3: 100%|████████████████████████████████████████████████████████████████████████| 88/88 [04:09<00:00,  2.84s/it]


Epoch 3/3, Loss: 0.1983, Validation Accuracy: 0.8479


In [11]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over the predictions gathered in the last validation pass;
# `epoch` and `avg_train_loss` are the values left behind by the training loop.
val_f1_score = f1_score(y_true=val_true_labels, y_pred=val_preds, average="micro")
print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Validation F1-Score: {val_f1_score:.4f}")


Epoch 3/3, Loss: 0.1983, Validation F1-Score: 0.8479
