## Name: Soumya Dasgupta
## Roll no: MDS202348

## Part 2

In [None]:
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import os

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f'Using device: {device}')

Using device: cuda


In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame.

    Each item pairs the tokenized text with the value of the ``sentiment``
    column of the same row.

    Args:
        dataframe: pandas DataFrame with ``text`` and ``sentiment`` columns.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``,
            all ``torch.long`` tensors.
        """
        # Positional (.iloc) lookup: a DataLoader hands out positions in
        # range(len(self)), which only coincide with index *labels* when the
        # frame has a pristine 0..n-1 index. Label lookup breaks (KeyError or
        # wrong row) as soon as rows were filtered out without a reset.
        raw_text = self.data["text"].iloc[index]
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(raw_text).split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        target = self.data["sentiment"].iloc[index]

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            # Targets are cast to long as well (the test labels are stored as floats).
            'targets': torch.tensor(target, dtype=torch.long)
        }

    def __len__(self):
        return self.len


In [None]:
# Load the train and test splits from the Kaggle input directory.
# NOTE(review): the two files are decoded with different encodings
# ('unicode_escape' vs 'latin1'); confirm this asymmetry is intentional.
train_df = pd.read_csv(
    '/kaggle/input/sentiment-analysis-dataset/train.csv',
    encoding='unicode_escape',
)
test_df = pd.read_csv(
    '/kaggle/input/sentiment-analysis-dataset/test.csv',
    encoding='latin1',
)

In [None]:
# Integer class ids used as model targets
sentiment_mapping = dict(negative=0, neutral=1, positive=2)

# Replace the string labels in both splits with their class ids
for split_df in (train_df, test_df):
    split_df['sentiment'] = split_df['sentiment'].map(sentiment_mapping)


In [None]:
# Sanity check: list the distinct encoded labels present in each split
print("Unique targets in train dataset:", train_df['sentiment'].unique())
print("Unique targets in test dataset:", test_df['sentiment'].unique())

Unique targets in train dataset: [1 0 2]
Unique targets in test dataset: [ 1.  2.  0. nan]


In [None]:
# Count the missing entries in every column of the test split
nan_counts = test_df.isnull().sum()
print("Number of NaN values in each column:\n", nan_counts)

Number of NaN values in each column:
 textID              1281
text                1281
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
dtype: int64


In [None]:
# Peek at the first five rows of the test split
test_df.head(n=5)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,1.0,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,2.0,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",0.0,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,2.0,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,2.0,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [None]:
# Directly modify the original DataFrame: drop every row containing a NaN.
test_df.dropna(inplace=True)
# Renumber the surviving rows 0..n-1 so that code addressing rows by number
# keeps working even when rows were removed from the middle of the frame.
test_df.reset_index(drop=True, inplace=True)


In [None]:
# Re-count the missing entries per column to confirm the cleanup
nan_counts = test_df.isnull().sum()
print("Number of NaN values in each column:\n", nan_counts)

Number of NaN values in each column:
 textID              0
text                0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64


In [None]:
# Peek at the first five rows of the cleaned test split
test_df.head(n=5)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,1.0,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,2.0,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",0.0,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,2.0,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,2.0,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [None]:
# Tokenization and batching hyper-parameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32

## Using BERT

In [None]:
# Fetch the pretrained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Create data loaders
train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions in the same order as the rows of test_df and makes runs repeatable.
test_dataset = SentimentDataset(test_df, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [None]:
# Pretrained BERT encoder with a 3-way classification head, moved to the selected device
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:

# Optimizer
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are spelled
# out to match the defaults of the transformers implementation (1e-6 / 0.0),
# since torch's defaults differ (1e-8 / 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [None]:
# Checkpoint path
CHECKPOINT_PATH = "model_checkpoint.pt"

# Load model and optimizer state if a checkpoint exists, otherwise start fresh
if os.path.isfile(CHECKPOINT_PATH):
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can be
    # restored anywhere; load_state_dict then copies the values onto whatever
    # device the model parameters already live on.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The checkpoint records the index of the last *completed* epoch (it is
    # written at the end of train(epoch)), so training resumes with the next one.
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming training from epoch {start_epoch+1}")
else:
    start_epoch = 0


In [None]:
# Persist the training state so an interrupted run can be resumed
def save_checkpoint(epoch, model, optimizer, filename=CHECKPOINT_PATH):
    """Write the epoch index plus model and optimizer state dicts to ``filename``.

    Args:
        epoch: zero-based index of the epoch being recorded.
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        filename: destination path, defaults to ``CHECKPOINT_PATH``.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, filename)
    print(f"Checkpoint saved at epoch {epoch+1}")

In [None]:
# Mean training loss of every epoch run so far (appended to by train()).
train_losses = []


# Training Function
def train(epoch, log_every=5000):
    """Run one full pass over ``train_loader`` and checkpoint the result.

    Args:
        epoch: zero-based epoch index; used for logging and stored in the
            checkpoint.
        log_every: print the batch loss every ``log_every`` steps (step 0
            is always printed).

    Side effects:
        Updates the global ``model`` through ``optimizer``, appends the mean
        batch loss to ``train_losses`` and calls ``save_checkpoint``.
    """
    model.train()
    running_loss = 0.0
    # Wrap the loader itself (not enumerate) so tqdm can read its length and
    # show a real progress bar with a total and an ETA.
    for step, data in enumerate(tqdm(train_loader)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            labels=targets,
        )
        loss = outputs.loss

        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    average_loss = running_loss / len(train_loader)
    train_losses.append(average_loss)
    save_checkpoint(epoch, model, optimizer)

In [None]:
# Train every epoch from the resume point up to (but excluding) epoch 3
epochs_to_run = range(start_epoch, 3)
for epoch in tqdm(epochs_to_run):
    train(epoch)


  0%|          | 0/3 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

Epoch: 0, Loss: 1.115517497062683



1it [00:00,  1.30it/s][A
2it [00:01,  1.31it/s][A
3it [00:02,  1.31it/s][A
4it [00:03,  1.31it/s][A
5it [00:03,  1.32it/s][A
6it [00:04,  1.32it/s][A
7it [00:05,  1.32it/s][A
8it [00:06,  1.32it/s][A
9it [00:06,  1.32it/s][A
10it [00:07,  1.32it/s][A
11it [00:08,  1.32it/s][A
12it [00:09,  1.32it/s][A
13it [00:09,  1.31it/s][A
14it [00:10,  1.31it/s][A
15it [00:11,  1.31it/s][A
16it [00:12,  1.31it/s][A
17it [00:12,  1.31it/s][A
18it [00:13,  1.31it/s][A
19it [00:14,  1.31it/s][A
20it [00:15,  1.31it/s][A
21it [00:15,  1.31it/s][A
22it [00:16,  1.31it/s][A
23it [00:17,  1.31it/s][A
24it [00:18,  1.31it/s][A
25it [00:19,  1.31it/s][A
26it [00:19,  1.31it/s][A
27it [00:20,  1.30it/s][A
28it [00:21,  1.30it/s][A
29it [00:22,  1.31it/s][A
30it [00:22,  1.30it/s][A
31it [00:23,  1.30it/s][A
32it [00:24,  1.30it/s][A
33it [00:25,  1.30it/s][A
34it [00:25,  1.31it/s][A
35it [00:26,  1.31it/s][A
36it [00:27,  1.31it/s][A
37it [00:28,  1.31it/s][A
38it [00:

Checkpoint saved at epoch 1



0it [00:00, ?it/s][A

Epoch: 1, Loss: 0.6142678260803223



1it [00:00,  1.29it/s][A
2it [00:01,  1.28it/s][A
3it [00:02,  1.28it/s][A
4it [00:03,  1.28it/s][A
5it [00:03,  1.29it/s][A
6it [00:04,  1.29it/s][A
7it [00:05,  1.28it/s][A
8it [00:06,  1.29it/s][A
9it [00:07,  1.28it/s][A
10it [00:07,  1.28it/s][A
11it [00:08,  1.27it/s][A
12it [00:09,  1.27it/s][A
13it [00:10,  1.27it/s][A
14it [00:10,  1.27it/s][A
15it [00:11,  1.27it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.27it/s][A
19it [00:14,  1.27it/s][A
20it [00:15,  1.27it/s][A
21it [00:16,  1.28it/s][A
22it [00:17,  1.28it/s][A
23it [00:18,  1.28it/s][A
24it [00:18,  1.28it/s][A
25it [00:19,  1.28it/s][A
26it [00:20,  1.29it/s][A
27it [00:21,  1.28it/s][A
28it [00:21,  1.28it/s][A
29it [00:22,  1.28it/s][A
30it [00:23,  1.28it/s][A
31it [00:24,  1.28it/s][A
32it [00:25,  1.28it/s][A
33it [00:25,  1.28it/s][A
34it [00:26,  1.28it/s][A
35it [00:27,  1.28it/s][A
36it [00:28,  1.28it/s][A
37it [00:28,  1.28it/s][A
38it [00:

Checkpoint saved at epoch 2



0it [00:00, ?it/s][A

Epoch: 2, Loss: 0.35694393515586853



1it [00:00,  1.29it/s][A
2it [00:01,  1.27it/s][A
3it [00:02,  1.28it/s][A
4it [00:03,  1.28it/s][A
5it [00:03,  1.28it/s][A
6it [00:04,  1.28it/s][A
7it [00:05,  1.28it/s][A
8it [00:06,  1.28it/s][A
9it [00:07,  1.28it/s][A
10it [00:07,  1.28it/s][A
11it [00:08,  1.27it/s][A
12it [00:09,  1.28it/s][A
13it [00:10,  1.28it/s][A
14it [00:10,  1.28it/s][A
15it [00:11,  1.28it/s][A
16it [00:12,  1.28it/s][A
17it [00:13,  1.28it/s][A
18it [00:14,  1.28it/s][A
19it [00:14,  1.28it/s][A
20it [00:15,  1.28it/s][A
21it [00:16,  1.28it/s][A
22it [00:17,  1.28it/s][A
23it [00:17,  1.28it/s][A
24it [00:18,  1.28it/s][A
25it [00:19,  1.28it/s][A
26it [00:20,  1.28it/s][A
27it [00:21,  1.28it/s][A
28it [00:21,  1.28it/s][A
29it [00:22,  1.28it/s][A
30it [00:23,  1.28it/s][A
31it [00:24,  1.28it/s][A
32it [00:25,  1.28it/s][A
33it [00:25,  1.28it/s][A
34it [00:26,  1.28it/s][A
35it [00:27,  1.28it/s][A
36it [00:28,  1.27it/s][A
37it [00:28,  1.28it/s][A
38it [00:

Checkpoint saved at epoch 3





In [None]:
# Mean loss / accuracy of every evaluation pass run so far (appended to by evaluate()).
val_losses = []
val_accuracies = []


def evaluate():
    """Score the global ``model`` on ``test_loader`` without gradient tracking.

    Returns:
        ``(fin_targets, fin_outputs)``: two flat lists of ints holding the
        true labels and the predicted class ids, in loader order.

    Raises:
        ValueError: if ``test_loader`` yields no samples.

    Side effects:
        Puts the model in eval mode and appends the mean batch loss and the
        accuracy to ``val_losses`` / ``val_accuracies``.
    """
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        # Wrap the loader itself so tqdm knows the number of batches.
        for data in tqdm(test_loader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
                labels=targets,
            )

            running_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            # Keep the counters as plain Python ints instead of device tensors.
            correct_predictions += (preds == targets).sum().item()
            total_predictions += targets.size(0)

            fin_targets.extend(targets.cpu().tolist())
            fin_outputs.extend(preds.cpu().tolist())

    if total_predictions == 0:
        # Fail with a clear message rather than an opaque ZeroDivisionError.
        raise ValueError("test_loader produced no samples; nothing to evaluate")

    average_loss = running_loss / len(test_loader)
    accuracy = correct_predictions / total_predictions

    val_losses.append(average_loss)
    val_accuracies.append(accuracy)
    return fin_targets, fin_outputs



In [None]:
# Get predictions and targets, and print classification report.
# labels is passed explicitly so the three names always line up with class ids
# 0/1/2, even if one class happens to be absent from both targets and predictions
# (without it, classification_report raises on the name/label count mismatch).
targets, outputs = evaluate()
print(classification_report(
    targets,
    outputs,
    labels=[0, 1, 2],
    target_names=['negative','neutral','positive'],
))

111it [00:30,  3.63it/s]

              precision    recall  f1-score   support

    negative       0.78      0.80      0.79      1001
     neutral       0.76      0.77      0.76      1430
    positive       0.86      0.82      0.84      1103

    accuracy                           0.79      3534
   macro avg       0.80      0.80      0.80      3534
weighted avg       0.80      0.79      0.79      3534






----------------------------------------------------------------------------------