In [8]:
from warnings import filterwarnings

# Patch scikit-learn BEFORE binding any name with `from sklearn... import`:
# a class imported earlier keeps pointing at the stock implementation, so the
# accelerated estimator would silently not be used.
from sklearnex import patch_sklearn
patch_sklearn()

import mlflow
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from transformers import BertTokenizer, BertModel

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
# Prefer the GPU whenever PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Install a global filter that ignores every warning raised from here on.
filterwarnings("ignore")

In [4]:
# Dataset identifier handed to `load_dataset`.
dataset = "imdb"
# The loaded object is indexed by split name ("train" / "test") in later cells.
data = load_dataset(dataset)

In [5]:
# Only the training split is shuffled (with a fixed seed); the test split is
# read in its stored order.
shuffled_train = data["train"].shuffle(seed=42)
x_train, y_train = shuffled_train['text'], shuffled_train['label']

x_test, y_test = data['test']['text'], data['test']['label']

In [6]:
# Short aliases for the raw review texts used by the embedding cells.
xtr, xte = x_train, x_test

In [7]:
# Sanity check: show the first review of the shuffled training split.
print(xtr[0])

There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...


In [10]:
# Checkpoint identifier; the tokenizer and the encoder are both loaded from it.
model_name = 'textattack/bert-base-uncased-imdb'
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): the checkpoint name suggests a BERT already fine-tuned on IMDB —
# confirm that is intended, since this notebook also evaluates on IMDB.
model = BertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [11]:
# Move the model to the device selected earlier. As the last expression of the
# cell, the returned module is also displayed.
model.to(device)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [15]:
def process_texts(texts, tokenizer, model, device, batch_size=32):
    """Embed texts with the hidden state at sequence position 0.

    The texts are processed in consecutive slices of ``batch_size``. Each slice
    is tokenized with padding and truncation enabled, moved to ``device`` and
    run through ``model`` with gradient tracking disabled. For every text the
    ``last_hidden_state`` vector at position 0 is kept (the ``[CLS]`` token when
    a BERT tokenizer is used) and copied to the CPU.

    Args:
        texts: Sliceable sequence of strings accepted by ``tokenizer``.
        tokenizer: Callable returning an encoding that supports ``.to(device)``
            and ``**`` unpacking into ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Target device for the tokenized batch.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one 1-D CPU tensor per input text, in input order.

    Side effects:
        Calls ``model.eval()``; the model is left in evaluation mode.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]

            encoded = tokenizer(
                chunk,
                return_tensors='pt',
                padding=True,
                truncation=True,
            ).to(device)

            hidden = model(**encoded).last_hidden_state
            # Keep only position 0 of every sequence and bring it back to the CPU.
            first_token = hidden[:, 0, :].cpu()

            # Batches are consecutive, so appending preserves the input order.
            collected.extend(first_token[row] for row in range(len(chunk)))

    return collected

In [16]:
# Embed every training review: one vector per text, in the same order as `xtr`.
embeddings = process_texts(xtr, tokenizer, model, device)

In [17]:
# Persist the training embeddings, then read them back so the classifiers
# below train on the on-disk copy.
train_embeddings_path = 'embeddings_pretrained.pt'
torch.save(embeddings, train_embeddings_path)
emb = torch.load(train_embeddings_path)

### SVM classifier on the BERT embeddings

In [18]:
# Support-vector classifier constructed with its default hyperparameters.
clf = SVC()
# Train on the reloaded training embeddings and the matching shuffled labels.
clf.fit(emb, y_train)

In [19]:
# Embed the test reviews with the same tokenizer and model used for training.
test_embeddings = process_texts(xte, tokenizer, model, device)

In [20]:
# Persist the test embeddings. Unlike the training embeddings they are not
# reloaded; later cells use the in-memory `test_embeddings`.
torch.save(test_embeddings, 'test_embeddings_pretrained.pt')

In [21]:
# Predict a label for every test review with the fitted SVM.
y_pred = clf.predict(test_embeddings)

In [22]:
# F1 of the SVM predictions against the test labels (f1_score defaults).
score = f1_score(y_test, y_pred)
print(score)

0.9331356916790192


### Neural network (MLP) classifier on the BERT embeddings

In [23]:
from sklearn.neural_network import MLPClassifier

In [84]:
# MLP hyperparameters, kept as named globals.
hls = (32, 8)
activation = 'logistic'
max_iter = 3

# `fit` returns the estimator itself, so construction and training are chained.
NN = MLPClassifier(
    hidden_layer_sizes=hls,
    activation=activation,
    solver='adam',
    max_iter=max_iter,
    random_state=42,
).fit(emb, y_train)

# Score the MLP on the same test embeddings and labels as the SVM.
nn_y_pred = NN.predict(test_embeddings)
nn_score = f1_score(y_test, nn_y_pred)
print(nn_score)

0.934178720031702
