In [None]:
!pip install kagglehub transformers

import kagglehub
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AutoModel, BertTokenizerFast, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re




In [None]:
# Fetch the Kaggle dataset; `path` is the directory returned by kagglehub.
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")


def _load_split(csv_path):
    """Read one split CSV and keep only the `text` and `sentiment` columns."""
    frame = pd.read_csv(csv_path, encoding='iso-8859-1')
    return frame[['text', 'sentiment']]


train_df = _load_split(f'{path}/train.csv')
test_df = _load_split(f'{path}/test.csv')


Downloading from https://www.kaggle.com/api/v1/datasets/download/abhi8923shriv/sentiment-analysis-dataset?dataset_version_number=9...


100%|██████████| 54.4M/54.4M [00:03<00:00, 16.5MB/s]

Extracting files...





In [None]:
# Anything that is neither a word character (letter, digit, underscore) nor
# whitespace. This already covers punctuation such as . , / - and emoji, so no
# separate punctuation or emoji pattern is needed.
_NON_WORD_RE = re.compile(r'[^\w\s]')
# One or more consecutive whitespace characters (spaces, tabs, newlines).
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def preprocessing(text):
    """Normalise raw text before tokenisation.

    Every character that is not a word character or whitespace is replaced by
    a single space, then each run of whitespace is collapsed to one space.
    Leading/trailing whitespace is collapsed but not stripped.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    text = _NON_WORD_RE.sub(" ", text)
    return _WHITESPACE_RUN_RE.sub(" ", text)

# Drop rows with a missing text or label *before* casting to str: astype(str)
# turns NaN into the literal string "nan", after which dropna() can no longer
# detect the missing value.
train_df = train_df.dropna(subset=['text', 'sentiment']).copy()
train_df["text"] = train_df["text"].astype(str).apply(preprocessing)

# Integer class ids used as classification targets.
label_to_id = {"neutral": 0, "positive": 1, "negative": 2}
# .get(label, label) leaves already-encoded ids untouched, so re-running the
# cell is harmless; mapping also avoids the downcasting FutureWarning that
# Series.replace emits. astype(int) fails loudly on an unexpected label.
train_df["sentiment"] = (
    train_df["sentiment"]
    .map(lambda label: label_to_id.get(label, label))
    .astype(int)
)


  train_df["sentiment"] = train_df["sentiment"].replace({"neutral": 0, "positive": 1, "negative": 2})


In [None]:
# Stratified 85/15 train/validation split with a fixed seed for reproducibility.
train_text, val_text, train_labels, val_labels = train_test_split(
    train_df['text'], train_df['sentiment'], test_size=0.15, random_state=6969, stratify=train_df['sentiment']
)

# Padding/truncation length: the longest training text measured in
# whitespace-separated words, capped at 64.
# NOTE(review): a word count is only a proxy for the tokenizer's sub-word
# length and does not include special tokens, so the longest texts may be
# truncated -- confirm this is acceptable.
max_seq_len = min(64, max(len(text.split()) for text in train_text))

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def _encode(texts):
    """Tokenise a Series of strings, padded/truncated to `max_seq_len`.

    Calls the tokenizer directly with `padding='max_length'`, which replaces
    the deprecated `batch_encode_plus(..., pad_to_max_length=True)` form.
    """
    return tokenizer(
        texts.tolist(),
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
    )


tokens_train = _encode(train_text)
tokens_val = _encode(val_text)

# Each dataset item is (input_ids, attention_mask, label).
train_tensorData = TensorDataset(torch.tensor(tokens_train['input_ids']),
                                 torch.tensor(tokens_train['attention_mask']),
                                 torch.tensor(train_labels.tolist()))
val_tensorData = TensorDataset(torch.tensor(tokens_val['input_ids']),
                               torch.tensor(tokens_val['attention_mask']),
                               torch.tensor(val_labels.tolist()))

batch_size = 32
# Shuffle training batches; keep validation order fixed.
train_dfloader = DataLoader(train_tensorData, sampler=RandomSampler(train_tensorData), batch_size=batch_size)
val_dataloader = DataLoader(val_tensorData, sampler=SequentialSampler(val_tensorData), batch_size=batch_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model = model.to(device)
# Freeze the BERT encoder so that only the classification head is trained.
for param in model.bert.parameters():
    param.requires_grad = False

# Give the optimizer only the parameters that can actually receive gradients.
# NOTE(review): lr=1e-5 is small when only the head is trainable -- confirm
# this is the intended setting.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=1e-5)

# "balanced" weights are inversely proportional to class frequency in the
# training labels.
class_wts = compute_class_weight(class_weight="balanced", classes=np.unique(train_labels), y=train_labels)
weights = torch.tensor(class_wts, dtype=torch.float).to(device)
# The model returns raw logits, so the weighted criterion must be
# CrossEntropyLoss (log-softmax + NLL); NLLLoss alone expects log-probabilities.
loss_fn = nn.CrossEntropyLoss(weight=weights)
epochs = 5


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def _forward_batch(batch):
    """Run one batch through the model and return `(loss, logits)`.

    The loss is cross-entropy on the raw logits, weighted by the module-level
    `weights` tensor of balanced class weights. The loss the model computes
    itself when given `labels=` does not use those weights, so it is computed
    here instead.
    """
    sent_id, mask, labels = (t.to(device) for t in batch)
    logits = model(sent_id, attention_mask=mask).logits
    loss = nn.functional.cross_entropy(logits, labels, weight=weights)
    return loss, logits


def train():
    """Train the model for one pass over `train_dfloader`.

    Returns:
        A tuple `(mean_loss, logits)`: the class-weighted loss averaged over
        batches, and the logits of every training batch concatenated along
        axis 0 as a NumPy array.
    """
    model.train()
    total_loss, total_preds = 0, []
    for batch in train_dfloader:
        model.zero_grad()
        loss, logits = _forward_batch(batch)
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_preds.append(logits.detach().cpu().numpy())
    return total_loss / len(train_dfloader), np.concatenate(total_preds, axis=0)


def evaluate():
    """Evaluate the model on `val_dataloader` without updating it.

    Returns:
        A tuple `(mean_loss, logits)` with the same layout as `train()`,
        computed with the same class-weighted loss so the two are comparable.
    """
    model.eval()
    total_loss, total_preds = 0, []
    # Gradients are never needed here; disable them once for the whole pass.
    with torch.no_grad():
        for batch in val_dataloader:
            loss, logits = _forward_batch(batch)
            total_loss += loss.item()
            total_preds.append(logits.cpu().numpy())
    return total_loss / len(val_dataloader), np.concatenate(total_preds, axis=0)


In [None]:
# Lowest validation loss seen so far, plus the per-epoch loss history.
best_valid_loss = float('inf')
train_losses = []
valid_losses = []

for epoch in range(epochs):
    print(f'\nEpoch {epoch+1}/{epochs}')

    # Only the mean losses are needed here; the returned logits are ignored.
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    # Checkpoint the weights whenever validation loss strictly improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'Train Loss: {train_loss:.3f} | Val Loss: {valid_loss:.3f}')



Epoch 1/5
Train Loss: 1.090 | Val Loss: 1.085

Epoch 2/5
Train Loss: 1.083 | Val Loss: 1.080

Epoch 3/5
Train Loss: 1.079 | Val Loss: 1.073

Epoch 4/5
Train Loss: 1.074 | Val Loss: 1.068

Epoch 5/5
Train Loss: 1.069 | Val Loss: 1.063


In [None]:
# Restore the checkpoint with the lowest validation loss and make sure dropout
# is disabled for inference.
model.load_state_dict(torch.load('saved_weights.pt', map_location=device))
model.eval()

# Drop rows with a missing text or label *before* casting to str. Otherwise
# NaN rows survive as the text "nan", and filling their label with 0 would
# score them as extra "neutral" samples.
test_df = test_df.dropna(subset=['text', 'sentiment']).copy()
test_df["text"] = test_df["text"].astype(str).apply(preprocessing)
label_to_id = {"neutral": 0, "positive": 1, "negative": 2}
# .get(label, label) keeps already-encoded ids, so the cell can be re-run.
test_df["sentiment"] = (
    test_df["sentiment"]
    .map(lambda label: label_to_id.get(label, label))
    .astype(int)
)
test_text = test_df['text']
test_labels = test_df['sentiment'].astype(np.int64)

# Encode with the same sequence length that was used for training/validation.
tokens_test = tokenizer(test_text.tolist(), max_length=max_seq_len, padding='max_length', truncation=True)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist(), dtype=torch.long)

# Predict in mini-batches instead of pushing the whole test set through the
# model in a single forward pass.
test_loader = DataLoader(TensorDataset(test_seq, test_mask), batch_size=batch_size)
batch_logits = []
with torch.no_grad():
    for seq_batch, mask_batch in test_loader:
        output = model(seq_batch.to(device), attention_mask=mask_batch.to(device))
        batch_logits.append(output.logits.cpu().numpy())

preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)
# target_names must follow the label ids 0, 1, 2 = neutral, positive, negative.
print(classification_report(
    test_y.numpy(), preds, labels=[0, 1, 2], target_names=['neutral', 'positive', 'negative']
))


  test_df["sentiment"] = test_df["sentiment"].replace({"neutral": 0, "positive": 1, "negative": 2})


              precision    recall  f1-score   support

    positive       0.58      0.97      0.73      2711
    negative       0.51      0.14      0.22      1103
     neutral       0.40      0.00      0.00      1001

    accuracy                           0.58      4815
   macro avg       0.50      0.37      0.32      4815
weighted avg       0.53      0.58      0.46      4815

