In [None]:
# %pip install -q transformers datasets scikit-learn torch

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import pandas as pd

In [3]:
# Load your separated CSV files
train_df = pd.read_csv('../../../data/train_data.csv')
val_df = pd.read_csv('../../../data/val_data.csv')
test_df = pd.read_csv('../../../data/test_data.csv')

# Example columns: 'text' (input), 'label' (0=Negative, 1=Neutral, 2=Positive)
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

train_df['label'] = train_df['Analysis'].map(label_mapping)
val_df['label'] = val_df['Analysis'].map(label_mapping)
test_df['label'] = test_df['Analysis'].map(label_mapping)


In [4]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class SentimentDataset(Dataset):
    def __init__(self, dataframe):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Dataset instances
train_dataset = SentimentDataset(train_df)
val_dataset = SentimentDataset(val_df)
test_dataset = SentimentDataset(test_df)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
DistilBert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DistilBert_model.to(device)
optimizer = AdamW(DistilBert_model.parameters(), lr=2e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
epochs = 3
for epoch in range(epochs):
    DistilBert_model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        print(type(batch))
        print(batch)
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = DistilBert_model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader)}")

  0%|          | 0/283 [00:00<?, ?it/s]

<class 'dict'>
{'input_ids': tensor([[ 101, 2064, 2017,  ...,    0,    0,    0],
        [ 101, 2748, 1996,  ..., 2002, 1005,  102],
        [ 101, 1022, 2420,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2036,  ...,    0,    0,    0],
        [ 101, 2001, 2016,  ...,    0,    0,    0],
        [ 101, 1045, 2123,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 2, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 2, 2, 2, 1, 1, 2, 2, 0, 1,
        2, 2, 1, 2, 2, 2, 2, 2])}


Epoch 0:   0%|          | 1/283 [00:13<1:02:46, 13.36s/it, loss=1.09]

<class 'dict'>
{'input_ids': tensor([[ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 1008, 1008,  ...,    0,    0,    0],
        [ 101, 4283, 1045,  ...,    0,    0,    0],
        ...,
        [ 101, 2004, 2619,  ...,    0,    0,    0],
        [ 101, 2021, 2498,  ...,    0,    0,    0],
        [ 101, 4392, 7167,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 2, 2, 1, 0, 2, 2, 2, 1, 1, 2, 1, 2, 0, 2, 2, 1, 0, 1, 0, 0, 2, 0, 1,
        1, 2, 1, 2, 1, 0, 2, 2])}


Epoch 0:   1%|          | 2/283 [00:25<59:27, 12.70s/it, loss=1.09]  


ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [None]:
# model.eval()
# test_preds = []
# test_labels = []

# with torch.no_grad():
#     for batch in test_loader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         logits = outputs.logits
#         preds = torch.argmax(logits, dim=1).cpu().numpy()
#         labels = batch['labels'].cpu().numpy()

#         test_preds.extend(preds)
#         test_labels.extend(labels)

# print("\nTest Accuracy:", accuracy_score(test_labels, test_preds))
# print("\nTest Report:\n", classification_report(test_labels, test_preds, target_names=['Negative', 'Neutral', 'Positive']))
