In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['selected_text'])
        sentiment_str = self.data.iloc[idx]['sentiment']

        # Convert sentiment string to numeric label
        sentiment = self.sentiment_mapping[sentiment_str]

        # Encode text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(sentiment, dtype=torch.long)
        }


In [30]:
df = pd.read_csv('train.csv')

In [40]:
MAX_LEN = 128  # Max length of input tokens
BATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 2e-5

In [41]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [43]:
for param in bert_model.parameters():
    param.requires_grad = False

In [44]:
# Define sentiment classifier model
class SentimentClassifier(nn.Module):
    def __init__(self, bert_model):
        super(SentimentClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 3)  # BERT 'hidden_size' is 768 for 'bert-base-uncased', 3 classes (negative, neutral, positive)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

In [45]:
# Create datasets and data loaders
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [46]:
model = SentimentClassifier(bert_model)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [47]:
criterion = nn.CrossEntropyLoss()

In [48]:
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate average loss over all batches
    avg_train_loss = train_loss / len(train_loader)

    # Evaluation on validation set
    model.eval()
    val_preds = []
    val_true = []
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_true, val_preds)
    val_classification_report = classification_report(val_true, val_preds, target_names=['negative', 'neutral', 'positive'])

    print(f'Epoch {epoch + 1}/{EPOCHS}, '
          f'Train Loss: {avg_train_loss:.4f}, '
          f'Val Loss: {avg_val_loss:.4f}, '
          f'Val Accuracy: {val_accuracy:.4f}')
    print(f'Classification Report:\n{val_classification_report}')


Epoch 1/1, Train Loss: 0.9794, Val Loss: 0.9493, Val Accuracy: 0.5399
Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.03      0.05      1562
     neutral       0.56      0.90      0.69      2230
    positive       0.49      0.54      0.52      1705

    accuracy                           0.54      5497
   macro avg       0.56      0.49      0.42      5497
weighted avg       0.56      0.54      0.46      5497

