# Imports

In [17]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Dataset Preparation

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenize text and return tensors
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }

In [4]:
df = pd.read_csv("twitter_validation.csv", header=None)

In [10]:
df = df.drop([0,1], axis="columns")

In [12]:
df = df.dropna()

In [13]:
df.columns = ['labels', 'text']

In [14]:
df

Unnamed: 0,labels,text
0,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Negative,@Microsoft Why do I pay for WORD when it funct...
3,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Neutral,Now the President is slapping Americans in the...
...,...,...
995,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,Positive,Today sucked so it’s time to drink wine n play...
998,Positive,Bought a fraction of Microsoft today. Small wins.


In [25]:
label_dict = {'Positive': 0, 'Negative': 1, 'Neutral': 2, 'Irrelevant': 3}
df['labels'] = df['labels'].map(label_dict)

In [26]:
df_train, df_val = train_test_split(df, test_size=0.1, random_state=42)

In [27]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [28]:
train_dataset = TextDataset(df_train['text'].tolist(), df_train['labels'].tolist(), tokenizer)
val_dataset = TextDataset(df_val['text'].tolist(), df_val['labels'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Loading Model

In [16]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Optionally, set up a learning rate scheduler
scheduler = StepLR(optimizer, step_size=1000, gamma=0.95)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [29]:
# Define the training and validation functions
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(data_loader)

In [30]:
# Training the model
for epoch in range(3):  # number of epochs
    print(f'Epoch {epoch + 1}')
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = eval_model(model, val_loader, device)
    print(f'Train Loss: {train_loss}')
    print(f'Validation Loss: {val_loss}')
    scheduler.step()  # Update the learning rate

Epoch 1


100%|███████████████████████████████████████████████████████████████████████████████████| 57/57 [00:12<00:00,  4.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.50it/s]


Train Loss: 1.3273325091914128
Validation Loss: 1.1927730526242937
Epoch 2


100%|███████████████████████████████████████████████████████████████████████████████████| 57/57 [00:13<00:00,  4.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.15it/s]


Train Loss: 1.0794106115374649
Validation Loss: 1.183968416282109
Epoch 3


100%|███████████████████████████████████████████████████████████████████████████████████| 57/57 [00:13<00:00,  4.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.79it/s]

Train Loss: 0.7206382903090695
Validation Loss: 1.2321644680840629





# Evaluation

In [31]:
# Assuming model is loaded and set to evaluation mode
model.eval()
model.to(device)

predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.57
Precision: 0.5599780219780219
Recall: 0.57
F1 Score: 0.5555990004997501


# Predict Single Sample

In [33]:
# Sample text
text = "Here is the text I want to classify"

# Tokenize the text
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Move the tensor to the same device as the model
inputs = {k: v.to(device) for k, v in inputs.items()}

In [36]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

predicted_class_idx = probabilities.argmax().item()

In [37]:
classes = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

In [38]:
classes[predicted_class_idx]

'Positive'