In [1]:
!pip install datasets



In [2]:
# from huggingface_hub import list_datasets

# all_datasets = list(list_datasets())
# print(f"Number of datasets on Hub: {len(all_datasets)}")

In [3]:
!pip install -U datasets



In [4]:
from datasets import load_dataset

emotions = load_dataset('emotion')
print(emotions)

README.md: 0.00B [00:00, ?B/s]

split/train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

split/validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

split/test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [5]:
train_ds = emotions['train']
print(train_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})


In [6]:
train_ds['text']

Column(['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy'])

In [7]:

emotions.set_format(type='pandas')
df = emotions['train'][:]
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [8]:
def label_int2str(row):
    return emotions['train'].features['label'].int2str(row)

df['label_name'] = df['label'].apply(label_int2str)
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [9]:
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader

# --------------------------
# 1. Load Hugging Face Dataset
# --------------------------
emotions = load_dataset("emotion")
train_ds = emotions["train"]
val_ds = emotions["validation"]
test_ds = emotions["test"]

# --------------------------
# 2. Build Vocabulary
# --------------------------
def tokenize(text):
    return text.lower().split()

counter = Counter()
for example in train_ds:
    counter.update(tokenize(example["text"]))

vocab = {w: i+2 for i, (w, _) in enumerate(counter.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

max_len = 20

def encode(text):
    tokens = tokenize(text)
    ids = [vocab.get(t, vocab["<UNK>"]) for t in tokens]
    if len(ids) < max_len:
        ids += [vocab["<PAD>"]] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

# --------------------------
# 3. Preprocess with datasets.map
# --------------------------
def preprocess(batch):
    batch["input_ids"] = [encode(x) for x in batch["text"]]
    return batch


train_ds = train_ds.map(preprocess, batched=True)
val_ds = val_ds.map(preprocess, batched= True)
test_ds = test_ds.map(preprocess, batched=True)


train_ds.set_format(type='torch', columns=['input_ids', 'label'])
val_ds.set_format(type='torch', columns=['input_ids', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'label'])


train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

# Define LSTM model

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab['<PAD>'])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(vocab_size=len(vocab),
                       embed_dim = 100,
                       hidden_dim = 128,
                       output_dim=len(emotions['train'].features['label'].names)).to(device)



#Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3)

epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch['input_ids'].to(device), batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_loader:
        inputs, labels = batch['input_ids'].to(device), batch['label'].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim = 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

print(f"Validation Accuracy: {correct/total}")


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Epoch 1/5, Loss: 1.5538402740955353
Epoch 2/5, Loss: 1.053005464553833
Epoch 3/5, Loss: 0.5714228547811508
Epoch 4/5, Loss: 0.37129204761981965
Epoch 5/5, Loss: 0.2777424653917551
Validation Accuracy: 0.8215


In [None]:
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler

#Load dataset
emotions = load_dataset('emotion')
train_ds = emotions['train']
val_ds = emotions['validation']
test_ds = emotions['test']

#Tokenizer
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

max_len = 64

def tokenize(batch):
    return tokenizer(batch['text'], padding = 'max_length', truncation = True, max_length = max_len)

train_ds = train_ds.map(tokenize, batched = True)
val_ds = val_ds.map(tokenize, batched = True)
test_ds = test_ds.map(tokenize, batched = True)


cols = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns = cols)
val_ds.set_format(type = 'torch', columns = cols)
test_ds.set_format(type = 'torch', columns = cols)

train_dl = DataLoader(train_ds, shuffle = True, batch_size = 32)
val_dl = DataLoader(val_ds, batch_size = 32)
test_dl = DataLoader(test_ds, batch_size = 32)


#Load pretrained Bert
num_labels = len(emotions['train'].features['label'].names)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels).to(device)


#Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr = 2e-5)

num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler(
    'linear', optimizer = optimizer, num_warmup_steps = 0, num_training_steps = num_training_steps
)


#Training loop
from torch.nn import CrossEntropyLoss

epochs = 3
criterion = CrossEntropyLoss()


for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dl:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
        labels = batch['label'].to(device)

        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dl)}")



model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in val_dl:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
        labels = batch['label'].to(device)

        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim = 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

print(f"Validation Accuracy: {correct/total}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.5952516011670231
