In [1]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.svm import SVC

import torch
from transformers import DistilBertTokenizer, DistilBertModel

from datasets import load_dataset
from sklearnex import patch_sklearn
from warnings import filterwarnings
import mlflow
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Install a blanket "ignore" filter: from this point on every Python warning
# is suppressed, regardless of category or originating library.
# NOTE(review): this also hides deprecation and convergence warnings from the
# cells below; consider narrowing the filter.
filterwarnings("ignore")

In [4]:
# Dataset identifier handed to `load_dataset`.
dataset = "imdb"
# `data` is indexed by split name in the following cells ("train", "test").
data = load_dataset(dataset)

In [5]:
# Shuffle the training split with a fixed seed so the row order is reproducible.
shuffled_train = data["train"].shuffle(seed=42)
x_train, y_train = shuffled_train['text'], shuffled_train['label']

# The test split is consumed in its stored order (no shuffling).
test_split = data['test']
x_test, y_test = test_split['text'], test_split['label']

In [6]:
# Short aliases for the raw texts that are fed to the embedding function later.
xtr, xte = x_train, x_test

In [7]:
# Sanity check: show the first (shuffled) training review.
print(xtr[0])

There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...


In [9]:
# The tokenizer is loaded by name (`model_name`), while the encoder weights
# are read from the local directory ./distilbert_imdb_mlm.
# NOTE(review): presumably a checkpoint adapted to IMDB that still shares the
# distilbert-base-uncased vocabulary -- confirm the two stay in sync.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained("./distilbert_imdb_mlm")

In [10]:
# Move the model to the selected device. Being the last expression of the
# cell, the returned module is also rendered as the cell output.
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [11]:
# Smoke test: tokenize one placeholder sentence and request PyTorch tensors
# (return_tensors="pt") so the result can be fed to the model directly.
text = "Your input text goes here."
tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

In [12]:
# Put the tokenized inputs on the same device as the model.
tokens = tokens.to(device)

# Inference only: no gradient bookkeeping is needed for the forward pass.
with torch.no_grad():
    outputs = model(**tokens)

# The hidden state at sequence position 0 is the [CLS] token's embedding.
cls_embedding = outputs.last_hidden_state[:, 0, :]

In [13]:
# Show the shape of the [CLS] embedding produced for the single test sentence.
print(cls_embedding.shape)

torch.Size([1, 768])


In [14]:
# Embed a collection of texts batch by batch
def process_texts(texts, tokenizer, model, device, batch_size=32):
    """Compute one [CLS]-position embedding per input text.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.
        tokenizer: Callable invoked as ``tokenizer(batch, return_tensors='pt',
            padding=True, truncation=True)``. Its result must offer
            ``.to(device)`` and be unpackable with ``**`` into the model call.
        model: Callable with an ``eval()`` method whose output exposes
            ``last_hidden_state``. It is switched to eval mode here but is NOT
            moved to ``device``; the caller is expected to have done that.
        device: Target device for the tokenized inputs.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A list with one CPU tensor per text, in the same order as ``texts``.
        An empty ``texts`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the loop a no-op and silently return
        # placeholders instead of embeddings.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    model.eval()  # evaluation mode
    all_embeddings = []

    with torch.no_grad():  # inference only, so skip gradient tracking
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Tokenize and move the inputs to the target device
            inputs = tokenizer(
                batch,
                return_tensors='pt',
                padding=True,
                truncation=True,
            ).to(device)

            outputs = model(**inputs)

            # Keep only the hidden state at position 0 (the [CLS] token).
            # copy=True matters when the model already runs on the CPU: a plain
            # .cpu() would be a no-op there and every stored row would remain a
            # view that keeps the whole hidden-state tensor of the batch alive
            # (and torch.save would serialize that whole storage).
            cls_embeddings = outputs.last_hidden_state[:, 0, :].to('cpu', copy=True)

            # One tensor per text, appended in input order
            all_embeddings.extend(cls_embeddings.unbind(0))

    return all_embeddings

In [15]:
# Обрабатываем данные
# Embed every training text (process_texts runs with its default batch_size).
embeddings = process_texts(xtr, tokenizer, model, device)

In [17]:
# Persist the training embeddings, then read them straight back into `emb`;
# the classifiers below are trained on `emb`.
torch.save(embeddings, 'embeddings.pt')
emb = torch.load('embeddings.pt')

### SVM

In [20]:
# Fit a support-vector classifier with default hyperparameters on the
# training embeddings.
# NOTE(review): patch_sklearn() ran in the import cell, so this is presumably
# the accelerated SVC implementation -- confirm.
clf = SVC()
clf.fit(emb, y_train)

In [21]:
# Embed every test text with the same tokenizer/model as the training set.
test_embeddings = process_texts(xte, tokenizer, model, device)

In [22]:
# Persist the test embeddings; the following cells keep using the in-memory
# `test_embeddings` rather than reloading this file.
torch.save(test_embeddings, 'test_embeddings.pt')

In [23]:
# Predict test labels with the fitted SVC.
y_pred = clf.predict(test_embeddings)

In [24]:
# F1 on the test split. `average` is not passed, so f1_score uses its default
# averaging here, whereas the MLflow-tracked MLP cell scores with
# average='macro' -- keep that in mind when comparing the numbers.
score = f1_score(y_test, y_pred)
print(score)

0.8967643761012334


### NN

In [25]:
from sklearn.neural_network import MLPClassifier

In [26]:
# Point MLflow at a tracking server on localhost:5000 and select, by name, the
# experiment under which subsequent runs are recorded.
# NOTE(review): the server must already be reachable at this address.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="Distil-BERT+MLP")

KeyboardInterrupt: 

In [45]:
# MLP experiment: one hidden layer of 200 logistic units, 15 training iterations.
hls = (200,)
activation = 'logistic'
max_iter = 15

NN = MLPClassifier(
    hidden_layer_sizes=hls,
    activation=activation,
    solver='adam',
    max_iter=max_iter,
    random_state=42,  # fixed seed for reproducible weight initialisation
)
NN.fit(emb, y_train)

# Score the fitted network on the held-out test embeddings.
nn_y_pred = NN.predict(test_embeddings)
nn_score = f1_score(y_true=y_test, y_pred=nn_y_pred)
print(nn_score)

0.8965462610899874


In [103]:
# MLP experiment tracked in MLflow: hidden layers of 100 and 10 ReLU units,
# 20 training iterations.
hls = (100,10)
activation = 'relu'
max_iter = 20

# The context manager ends the run even when fitting or scoring raises, so a
# failed cell cannot leave a dangling active run behind (a bare
# start_run()/end_run() pair would).
with mlflow.start_run():
    # Record the configuration first so that failed runs are still identifiable.
    mlflow.log_param('hidden_layer_sizes', hls)
    mlflow.log_param('activation', activation)
    mlflow.log_param('max_iter', max_iter)

    NN = MLPClassifier(hidden_layer_sizes=hls, activation=activation, solver='adam', max_iter=max_iter, random_state=42)
    NN.fit(emb, y_train)

    nn_y_pred = NN.predict(test_embeddings)

    # Macro-averaged F1 over the classes.
    nn_score = f1_score(y_test, nn_y_pred, average='macro')
    mlflow.log_metric('macro_score', nn_score)

🏃 View run hilarious-panda-444 at: http://127.0.0.1:5000/#/experiments/193229799560925795/runs/6fac7296464a433da1dd91925e500e9d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/193229799560925795
