In [2]:
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import os

In [3]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [4]:
train_df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv',encoding='unicode_escape')
test_df = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv',encoding='latin1')

In [5]:
# Define sentiment mapping and apply it to the DataFrame
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
train_df['sentiment'] = train_df['sentiment'].map(sentiment_mapping)
test_df.dropna(inplace=True)
test_df['sentiment'] = test_df['sentiment'].map(sentiment_mapping)


In [6]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            None,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        target = self.data.sentiment[index]
        # Handle NaN values in the target column
        if pd.isnull(target):
            # If sentiment is NaN, return -1 as a placeholder or handle it according to your use case
            target = -1
        else:
            target = int(target)
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


In [8]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32

In [9]:
train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

test_dataset = SentimentDataset(test_df, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)


In [10]:
# Model and optimizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Checkpoint path
CHECKPOINT_PATH = "model_checkpoint.pt"

if os.path.isfile(CHECKPOINT_PATH):
    checkpoint = torch.load(CHECKPOINT_PATH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f"Resuming training from epoch {start_epoch+1}")
else:
    start_epoch = 0

In [12]:
def save_checkpoint(epoch, model, optimizer, filename=CHECKPOINT_PATH):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, filename)
    print(f"Checkpoint saved at epoch {epoch+1}")


In [15]:
# Training
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

for epoch in tqdm(range(start_epoch, 3)):
    model.train()
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=targets)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    save_checkpoint(epoch, model, optimizer)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/859 [00:00<?, ?it/s][A
  0%|          | 1/859 [00:01<18:19,  1.28s/it][A
  0%|          | 2/859 [00:02<17:42,  1.24s/it][A
  0%|          | 3/859 [00:03<17:28,  1.22s/it][A
  0%|          | 4/859 [00:04<17:21,  1.22s/it][A
  1%|          | 5/859 [00:06<17:18,  1.22s/it][A
  1%|          | 6/859 [00:07<17:14,  1.21s/it][A
  1%|          | 7/859 [00:08<17:11,  1.21s/it][A
  1%|          | 8/859 [00:09<17:10,  1.21s/it][A
  1%|          | 9/859 [00:10<17:10,  1.21s/it][A
  1%|          | 10/859 [00:12<17:08,  1.21s/it][A
  1%|▏         | 11/859 [00:13<17:08,  1.21s/it][A
  1%|▏         | 12/859 [00:14<17:08,  1.21s/it][A
  2%|▏         | 13/859 [00:15<17:07,  1.21s/it][A
  2%|▏         | 14/859 [00:17<17:06,  1.21s/it][A
  2%|▏         | 15/859 [00:18<17:04,  1.21s/it][A
  2%|▏         | 16/859 [00:19<17:04,  1.22s/it][A
  2%|▏         | 17/859 [00:20<17:03,  1.22s/it][A
  2%|▏         | 18/859 [00:21<17:02,  1.22s

Checkpoint saved at epoch 1



  0%|          | 0/859 [00:00<?, ?it/s][A
  0%|          | 1/859 [00:01<19:02,  1.33s/it][A
  0%|          | 2/859 [00:02<18:04,  1.27s/it][A
  0%|          | 3/859 [00:03<17:51,  1.25s/it][A
  0%|          | 4/859 [00:05<17:38,  1.24s/it][A
  1%|          | 5/859 [00:06<17:33,  1.23s/it][A
  1%|          | 6/859 [00:07<17:29,  1.23s/it][A
  1%|          | 7/859 [00:08<17:30,  1.23s/it][A
  1%|          | 8/859 [00:09<17:30,  1.23s/it][A
  1%|          | 9/859 [00:11<17:27,  1.23s/it][A
  1%|          | 10/859 [00:12<17:23,  1.23s/it][A
  1%|▏         | 11/859 [00:13<17:21,  1.23s/it][A
  1%|▏         | 12/859 [00:14<17:19,  1.23s/it][A
  2%|▏         | 13/859 [00:16<17:15,  1.22s/it][A
  2%|▏         | 14/859 [00:17<17:13,  1.22s/it][A
  2%|▏         | 15/859 [00:18<17:14,  1.23s/it][A
  2%|▏         | 16/859 [00:19<17:14,  1.23s/it][A
  2%|▏         | 17/859 [00:20<17:11,  1.23s/it][A
  2%|▏         | 18/859 [00:22<17:09,  1.22s/it][A
  2%|▏         | 19/859 [00:2

Checkpoint saved at epoch 2



  0%|          | 0/859 [00:00<?, ?it/s][A
  0%|          | 1/859 [00:01<18:51,  1.32s/it][A
  0%|          | 2/859 [00:02<17:58,  1.26s/it][A
  0%|          | 3/859 [00:03<17:40,  1.24s/it][A
  0%|          | 4/859 [00:04<17:37,  1.24s/it][A
  1%|          | 5/859 [00:06<17:33,  1.23s/it][A
  1%|          | 6/859 [00:07<17:29,  1.23s/it][A
  1%|          | 7/859 [00:08<17:24,  1.23s/it][A
  1%|          | 8/859 [00:09<17:23,  1.23s/it][A
  1%|          | 9/859 [00:11<17:20,  1.22s/it][A
  1%|          | 10/859 [00:12<17:18,  1.22s/it][A
  1%|▏         | 11/859 [00:13<17:18,  1.22s/it][A
  1%|▏         | 12/859 [00:14<17:19,  1.23s/it][A
  2%|▏         | 13/859 [00:15<17:15,  1.22s/it][A
  2%|▏         | 14/859 [00:17<17:14,  1.22s/it][A
  2%|▏         | 15/859 [00:18<17:13,  1.22s/it][A
  2%|▏         | 16/859 [00:19<17:12,  1.22s/it][A
  2%|▏         | 17/859 [00:20<17:10,  1.22s/it][A
  2%|▏         | 18/859 [00:22<17:07,  1.22s/it][A
  2%|▏         | 19/859 [00:2

Checkpoint saved at epoch 3





In [17]:
# Evaluation
model.eval()
all_targets = []
all_outputs = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits

        all_targets.extend(targets.cpu().numpy())
        all_outputs.extend(logits.argmax(dim=1).cpu().numpy())

100%|██████████| 111/111 [00:50<00:00,  2.20it/s]


In [18]:
# Print classification report
print(classification_report(all_targets, all_outputs, target_names=['negative', 'neutral', 'positive']))

              precision    recall  f1-score   support

    negative       0.77      0.81      0.79      1001
     neutral       0.78      0.72      0.75      1430
    positive       0.82      0.85      0.84      1103

    accuracy                           0.79      3534
   macro avg       0.79      0.80      0.79      3534
weighted avg       0.79      0.79      0.79      3534

