In [74]:
from warnings import filterwarnings

import mlflow
import numpy as np
import torch
from datasets import load_dataset
from sklearnex import patch_sklearn
from transformers import BertTokenizer, BertModel

# Patch scikit-learn BEFORE importing anything from it. Names bound with
# `from sklearn... import X` prior to patching keep pointing at the stock
# implementations, so the sklearn imports below must come after this call.
patch_sklearn()

from sklearn.metrics import f1_score
from sklearn.svm import SVC

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Silence every warning category for the rest of the session.
filterwarnings(action="ignore")

In [4]:
# Identifier of the dataset passed to `datasets.load_dataset`.
dataset = "ag_news"
data = load_dataset(path=dataset)

In [5]:
# Shuffle the training split with a fixed seed so the row order is reproducible.
shuffled_train = data["train"].shuffle(seed=42)

# Pull the raw texts and their labels out of each split.
x_train, y_train = (shuffled_train[column] for column in ('text', 'label'))
x_test, y_test = (data['test'][column] for column in ('text', 'label'))

In [6]:
# Short aliases for the train/test texts used by the embedding cells.
xtr, xte = x_train, x_test

In [7]:
# Print the first training text as a quick sanity check of the loaded data.
print(xtr[0])

Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.


In [8]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name.
model_name = 'bert-base-uncased'
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

In [9]:
# Move the encoder to the selected device. As the last expression of the cell,
# the returned module is also displayed, which gives the architecture summary.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
def process_texts(texts, tokenizer, model, device, batch_size=32):
    """Embed every text as the first-position vector of the model's last hidden state.

    The texts are tokenized and encoded in consecutive batches. For each text,
    the hidden state at sequence position 0 (the ``[CLS]`` token for BERT-style
    tokenizers) is kept and moved to the CPU.

    Args:
        texts: Sliceable sequence of strings; ``texts[i:j]`` must produce a
            batch that ``tokenizer`` accepts.
        tokenizer: Callable invoked as ``tokenizer(batch, return_tensors='pt',
            padding=True, truncation=True)``. Its result must support
            ``.to(device)`` and ``**`` unpacking into ``model``.
        model: Object with an ``eval()`` method that is called with the
            tokenized inputs as keyword arguments and returns an object
            exposing ``last_hidden_state``. It is switched to eval mode and
            left that way. This function does not move the model to ``device``.
        device: Device the tokenized inputs are moved to before the forward pass.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one CPU tensor per input text, in input order. An empty
        ``texts`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative value would skip every batch and return placeholders.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    model.eval()
    all_embeddings = []

    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            inputs = tokenizer(
                batch,
                return_tensors='pt',
                padding=True,
                truncation=True,
            ).to(device)

            outputs = model(**inputs)

            # Keep only sequence position 0 of every item, then bring it back to the CPU.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()

            # One row per text, appended in order so that index k matches texts[k].
            all_embeddings.extend(cls_embeddings.unbind(0))

    return all_embeddings

In [16]:
# Encode the training texts with the default batch size.
embeddings = process_texts(texts=xtr, tokenizer=tokenizer, model=model, device=device)

In [104]:
# Reuse the cached training embeddings when the file exists; otherwise cache
# the ones computed in the previous cell so this cell also works on a fresh
# checkout.
# NOTE: torch.load unpickles the file -- only load caches you produced yourself.
try:
    emb = torch.load('embeddings.pt')
except FileNotFoundError:
    emb = embeddings
    torch.save(emb, 'embeddings.pt')

### SVM

In [19]:
# Support-vector classifier with default hyperparameters, trained on the cached embeddings.
clf = SVC()
clf.fit(X=emb, y=y_train)

In [20]:
# Encode the test texts with the same tokenizer/model as the training texts.
test_embeddings = process_texts(texts=xte, tokenizer=tokenizer, model=model, device=device)

In [21]:
# Predicted labels of the SVM for the test embeddings.
y_pred = clf.predict(X=test_embeddings)

In [22]:
# Macro-averaged F1 of the SVM predictions on the test split.
score = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(score)

0.9130979398047535


### NN

In [23]:
from sklearn.neural_network import MLPClassifier

In [75]:
# Point the MLflow client at the tracking server on localhost, then select the
# experiment the MLP runs are recorded under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("BERT+MLP")

2025/05/18 16:08:26 INFO mlflow.tracking.fluent: Experiment with name 'BERT+MLP' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/193229799560925795', creation_time=1747573706523, experiment_id='193229799560925795', last_update_time=1747573706523, lifecycle_stage='active', name='BERT+MLP', tags={}>

In [103]:
# Hyperparameters of this run; the same values are logged to MLflow.
hls = (100,10)
activation = 'logistic'
max_iter = 20

# The context manager closes the run even when training or evaluation raises,
# so a failed cell cannot leave an active run that blocks the next start_run().
with mlflow.start_run():
    # Log the parameters first so they are recorded even if fitting fails.
    mlflow.log_params({
        'hidden_layer_sizes': hls,
        'activation': activation,
        'max_iter': max_iter,
    })

    NN = MLPClassifier(hidden_layer_sizes=hls, activation=activation, solver='adam', max_iter=max_iter, random_state=42)
    NN.fit(emb, y_train)

    nn_y_pred = NN.predict(test_embeddings)

    # Macro-averaged F1 on the test split, stored as the run's metric.
    nn_score = f1_score(y_test, nn_y_pred, average='macro')
    mlflow.log_metric('macro_score', nn_score)

🏃 View run hilarious-panda-444 at: http://127.0.0.1:5000/#/experiments/193229799560925795/runs/6fac7296464a433da1dd91925e500e9d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/193229799560925795
