#  BERT-based Sentiment Classification using PyTorch

In [None]:
!pip install transformers
!pip install nltk

In [None]:
import pandas as pd
import numpy as np
import re
import torch
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

nltk.download('punkt')

## Load Dataset

In [None]:
# Read the three competition CSVs in one pass; the unpacking order matches
# the order of the paths below (train, test, sample submission).
# NOTE(review): the paths sit under /content/drive/MyDrive, presumably a
# Colab Google Drive mount -- confirm the drive is mounted before running.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/NLP/train_2kmZucJ.csv",
        "/content/drive/MyDrive/NLP/test_oJQbWVk.csv",
        "/content/drive/MyDrive/NLP/sample_submission_LnhVWA4.csv",
    )
)

##  Data Cleaning

In [None]:
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    The text is lower-cased, then URLs, @mentions, and a small set of
    symbol characters are removed, and surrounding whitespace is stripped.
    Hashtag words are kept; only the ``#`` sign itself is dropped.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, which is what pandas
            yields for empty CSV cells) are treated as an empty tweet.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Without this guard str() would turn missing values into the literal
    # words "none"/"nan", which would then be tokenized as real content.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lower-case first so the URL pattern also catches "HTTP://..." forms.
    text = str(text).lower()
    # "http\S+" already covers "https..." links, so no separate branch.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Drop whole @mentions, but only the '#' of a hashtag.
    text = re.sub(r"@\w+|#", '', text)
    # Remove leftover symbols, including any '@' not followed by a word.
    text = re.sub(r"[$&@*]", '', text)
    return text.strip()

# Add a cleaned copy of the raw 'tweet' column to both frames so the
# original text stays available alongside the normalized version.
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(clean_text)

##  Train-Validation Split

In [None]:
# Hold out 20% of the labelled rows for validation. The split is stratified
# on the label column and seeded so that it is reproducible across runs.
features = train['clean_tweet']
targets = train['label']

X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

##  Tokenization using BERT Tokenizer

In [None]:
# Module-level tokenizer for the 'bert-base-uncased' checkpoint (the same
# checkpoint name is used when the classification model is loaded below).
# TweetDataset.__getitem__ reads this global, so it must be defined before
# any dataset item is accessed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

##  Custom Dataset Class

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per ``__getitem__`` call.

    Args:
        texts: Indexable sequence of tweet strings.
        labels: Optional sequence of integer class labels aligned with
            ``texts``. Omit it for unlabelled (inference) data, in which
            case items carry no ``'labels'`` entry.
        max_length: Length every example is padded/truncated to.
            Defaults to 128.
        text_tokenizer: Callable accepting the text plus the ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning a mapping of tensors with a leading
            batch dimension of 1. When ``None`` the module-level
            ``tokenizer`` is looked up each time an item is accessed.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            the length of ``texts``.
    """

    def __init__(self, texts, labels=None, *, max_length=128,
                 text_tokenizer=None):
        # Fail fast here rather than with an IndexError in the middle of
        # an epoch when the two sequences are misaligned.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self.text_tokenizer = text_tokenizer

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors.

        The dict holds whatever the tokenizer returns (batch dimension
        removed) plus a ``'labels'`` long tensor when labels were supplied.
        """
        # Resolve the fallback lazily so the global only has to exist by
        # the time items are actually read.
        encode = (
            self.text_tokenizer
            if self.text_tokenizer is not None
            else tokenizer
        )
        encoding = encode(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch dimension of 1; drop
        # it so the DataLoader can stack examples into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

##  Create Datasets & DataLoaders

In [None]:
# One batch size shared by all three loaders.
BATCH_SIZE = 16

# Convert the pandas objects to plain lists so the dataset can index them
# positionally (0..n-1) regardless of the original DataFrame index.
train_texts, train_labels = X_train.tolist(), y_train.tolist()
val_texts, val_labels = X_val.tolist(), y_val.tolist()
test_texts = test['clean_tweet'].tolist()

train_dataset = TweetDataset(train_texts, train_labels)
val_dataset = TweetDataset(val_texts, val_labels)
# The test split has no labels, so its items carry inputs only.
test_dataset = TweetDataset(test_texts)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

##  Load BERT Model

In [None]:
# Pick the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT encoder with a 2-way classification head on top.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model.to(device)

##  Optimizer & Scheduler

In [None]:
# The scheduler is stepped once per batch, so the total number of steps is
# the number of batches per epoch times the number of epochs.
epochs = 3
num_training_steps = epochs * len(train_loader)

optimizer = AdamW(model.parameters(), lr=2e-5)

# "linear" schedule with no warmup steps over the whole training run.
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

##  Model Training

In [None]:
# Fine-tune for `epochs` passes over the training loader, showing the
# current epoch and the latest batch loss on a progress bar.
model.train()
for epoch_idx in range(epochs):
    progress = tqdm(train_loader, leave=True)
    for raw_batch in progress:
        # Move every tensor of the batch onto the model's device.
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

        # The batch includes 'labels', so the model output carries a loss.
        loss = model(**inputs).loss
        loss.backward()

        # Update weights, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next batch.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        progress.set_description(f"Epoch {epoch_idx+1}")
        progress.set_postfix(loss=loss.item())

##  Validation & Evaluation

In [None]:
# Score the fine-tuned model on the held-out validation split.
model.eval()
preds, true = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for val_batch in val_loader:
        val_batch = {name: tensor.to(device) for name, tensor in val_batch.items()}
        logits = model(**val_batch).logits
        # The predicted class is the index of the largest logit per row.
        preds.extend(logits.argmax(dim=1).cpu().numpy())
        true.extend(val_batch['labels'].cpu().numpy())

weighted_f1 = f1_score(true, preds, average='weighted')
print("Validation Weighted F1 Score:", weighted_f1)

##  Predict on Test Set

In [None]:
# Predict a class for every test tweet, in the loader's (unshuffled) order.
model.eval()
test_preds = []

with torch.no_grad():
    for test_batch in test_loader:
        # Test batches carry inputs only; the test dataset has no labels.
        test_batch = {name: tensor.to(device) for name, tensor in test_batch.items()}
        logits = model(**test_batch).logits
        test_preds.extend(logits.argmax(dim=1).cpu().numpy())

##  Create Submission File

In [None]:
# Pair each test id with its predicted label and write the result without
# the DataFrame index, giving a two-column CSV (id, label).
submission = pd.DataFrame({'id': test['id']}).assign(label=test_preds)
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' created.")