In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset

from Architectures.Basic_Sequence_classification import Sequence_Classification

In [2]:
from transformers import AutoTokenizer
# summary
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

## Loading Tweet Dataset

In [None]:
dataset = load_dataset("tweet_eval", "emotion", token="")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", token="")

NUM_LABELS = 6  # anger, joy, optimism, sadness, fear, love
MAX_LEN = 128

In [4]:
dataset['train'][10]

{'text': '@user @user USA was embarrassing to watch. When was the last time you guys won a game..? #horrible #joke',
 'label': 0}

## Preprocess

In [5]:
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=MAX_LEN)

encoded_dataset = dataset.map(encode)

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

In [6]:
def encode_message(string):
    return tokenizer(string, truncation=True, padding='max_length', max_length=MAX_LEN)

In [7]:
encode_message("Hello world").keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [8]:
class TweetDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.dataset[idx]['input_ids']),
            'attention_mask': torch.tensor(self.dataset[idx]['attention_mask']),
        }
        item['labels'] = torch.tensor(self.dataset[idx]['label'])
        return item

train_dataset = TweetDataset(encoded_dataset['train'])
val_dataset = TweetDataset(encoded_dataset['validation'])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

## Model Definition and Training

In [22]:
from transformers import PretrainedConfig
class Config(PretrainedConfig):
    model_type = "transformer"
    def __init__(self,
                vocab_size = 30522,
                embed_dim=128,
                num_layers=10,
                num_heads=8,
                ff_dim=512,
                pre_normalization=True,
                max_position_embeddings=128,
                dropout_prob=0.1,
                num_labels=6,
                device = "cuda"):
        self.device = device
        self.vocab_size = vocab_size  # Tokenizer vocab size
        self.embed_dim = embed_dim  # Embedding & input to attention
        self.num_layers = num_layers  # Number of encoder layers
        self.num_heads = num_heads  # Number of heads in Multi-Head Attention
        self.ff_dim = ff_dim  # Feed Forward hidden dimension
        self.pre_normalization = pre_normalization  # LayerNorm before or after attention/FFN
        self.max_length = max_position_embeddings  # Max sequence length
        self.dropout_prob = dropout_prob  # Dropout probability
        self.num_classess = num_labels  # Output classes (for classification)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
class Config:
    num_layers = 10
    embed_dim = 128
    vocab_size = tokenizer.vocab_size
    max_length = 128
    device = device # cuda or cpu
    n_heads = 8
    ff_dim = 512
    dropout = 0.1
    num_classess = 6

In [23]:
from transformers import PreTrainedModel
class Classification_model(PreTrainedModel):
    config_class = Config

    def __init__(self, config):
        super(Classification_model, self).__init__(config)
        self.model = Sequence_Classification(config)
    
    def forward(self, input_ids, attention_mask=None):
        return self.model(input_ids, attention_mask=attention_mask)

In [21]:
tokenizer.vocab_size

30522

In [24]:
model = Classification_model(config=Config(vocab_size=tokenizer.vocab_size)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
loss_fn = nn.CrossEntropyLoss()

In [18]:
print(model)

Classification_model(
  (model): Sequence_Classification(
    (position_embedding): SinusoidalEmbeddingLayer(
      (embedding): Embedding(30522, 128)
      (layer_norm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-9): 10 x TransformerEncoderLayer(
          (attn): MultiHeadAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (output): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (ff): FeedForward2(
            (fc1): Linear(in_features=128, out_features=512, bias=True)
            (fc2): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (gelu): GELU(approximate

In [25]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Get batch data
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        
        # Calculate loss, backward pass, and update weights
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")
    
    # Validation
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=1)
            
            preds.extend(predictions.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, average='macro')
    print(f"Validation Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")


Epoch 1, Loss: 1.3207
Validation Accuracy: 0.4385, F1 Score: 0.2120
Epoch 2, Loss: 1.2289
Validation Accuracy: 0.4813, F1 Score: 0.3544
Epoch 3, Loss: 1.0924
Validation Accuracy: 0.5428, F1 Score: 0.3800
Epoch 4, Loss: 0.9219
Validation Accuracy: 0.5802, F1 Score: 0.5239
Epoch 5, Loss: 0.7787
Validation Accuracy: 0.5668, F1 Score: 0.4953
Epoch 6, Loss: 0.6604
Validation Accuracy: 0.5829, F1 Score: 0.5297
Epoch 7, Loss: 0.5271
Validation Accuracy: 0.6176, F1 Score: 0.5561
Epoch 8, Loss: 0.4027
Validation Accuracy: 0.6070, F1 Score: 0.5296
Epoch 9, Loss: 0.3157
Validation Accuracy: 0.5401, F1 Score: 0.4864
Epoch 10, Loss: 0.2627
Validation Accuracy: 0.6283, F1 Score: 0.5453


In [26]:
model.save_pretrained("tweet_emotion_model")

In [28]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv(""))
api.upload_folder(
    folder_path="tweet_emotion_model",
    repo_id="Se00n00/Sequence_classifier",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/23.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Se00n00/Sequence_classifier/commit/7bcc5949e4e13371349598e9939d1b1172f73fbd', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7bcc5949e4e13371349598e9939d1b1172f73fbd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Se00n00/Sequence_classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='Se00n00/Sequence_classifier'), pr_revision=None, pr_num=None)

In [29]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(name="Sequence-classifier", private=False)

model.push_to_hub("Se00n00/Sequence-classifier")

TypeError: HfApi.create_repo() got an unexpected keyword argument 'name'

In [20]:
model.predict = lambda x: torch.argmax(model(x), dim=1)

In [38]:
def predict(string):
    encoded_message = encode_message(string)

    input_ids = torch.tensor(encoded_message['input_ids'], device=device).unsqueeze(0)
    attention_mask = torch.tensor(encoded_message['attention_mask'], device=device).unsqueeze(0)

    label = model(input_ids, attention_mask)
    
    return label

In [48]:
predict("Everything is falling apart and I can’t stop it.")

tensor([[-0.4254,  5.8552, -2.4696, -1.3665, -9.9205, -8.3114]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [49]:
torch.save(model.state_dict(), 'best_model.pt')
