In [7]:
from warnings import filterwarnings

import mlflow
import numpy as np
import tensorflow_hub as hub
from datasets import load_dataset
from sklearnex import patch_sklearn

# Patch BEFORE importing any scikit-learn estimators: patch_sklearn() swaps the
# classes inside the sklearn modules, so names imported earlier (e.g. SVC) would
# keep pointing at the stock, unaccelerated implementations.
patch_sklearn()

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides convergence warnings from the estimators below.
filterwarnings("ignore")

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [3]:
# Dataset identifier passed to `load_dataset`; the returned object is indexed
# by split name ("train" / "test") in the cells below.
dataset = "ag_news"
data = load_dataset(dataset)

In [4]:
# Shuffle the training split with a fixed seed so the row order is reproducible,
# then pull out the text and label columns together.
shuffled_train = data["train"].shuffle(seed=42)
x_train, y_train = shuffled_train["text"], shuffled_train["label"]

# The test split is taken as-is, without shuffling.
x_test, y_test = data["test"]["text"], data["test"]["label"]

In [13]:
# Sanity check: show the first text of the shuffled training split.
print(x_train[0])

Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.


In [10]:
# TF Hub handle of the Universal Sentence Encoder, version 4.
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the encoder once; `get_use_embeddings` reads this module-level object.
use_model = hub.load(model_url)







In [14]:
def get_use_embeddings(texts, batch_size=32, model=None):
    """Embed texts in batches and return the results as a single array.

    Args:
        texts: Iterable of strings to embed. It is materialized into a list,
            so generators and non-list sequences are accepted. A bare string
            is treated as one document rather than a sequence of characters.
        batch_size: Number of texts passed to the encoder per call; must be
            at least 1.
        model: Callable that takes a list of strings and returns an object
            exposing ``.numpy()``. Defaults to the module-level ``use_model``.

    Returns:
        numpy.ndarray: The per-batch embeddings concatenated along the first
        axis, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``texts`` is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if model is None:
        # Resolved at call time so the notebook-level encoder is used by default.
        model = use_model

    # Guard against list("abc") -> ["a", "b", "c"] when a single string is passed.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # np.concatenate fails on an empty list with an unhelpful message.
        raise ValueError("texts must contain at least one item")

    embeddings = [
        model(texts[start:start + batch_size]).numpy()
        for start in range(0, len(texts), batch_size)
    ]
    return np.concatenate(embeddings)

In [23]:
# Embed every training text with the helper's default batch size.
x_train_emb = get_use_embeddings(x_train)

In [24]:
# Check the dimensions of the training embedding matrix.
print(x_train_emb.shape)

(120000, 512)


In [25]:
# Embed the test texts with the same encoder and batch size as the training set.
x_test_emb = get_use_embeddings(x_test)

### SVM

In [26]:
# Train a support vector classifier (SVC, default hyperparameters) on the training embeddings
clf = SVC()
clf.fit(x_train_emb, y_train)

In [27]:
# Predict on the test embeddings and evaluate with the macro-averaged F1 score
svm_y_pred = clf.predict(x_test_emb)
score = f1_score(y_test, svm_y_pred, average="macro")

print(f"f1 score = {score}")

f1 score = 0.9178367692663927


### NN

In [28]:
# Send MLflow tracking data to the server at 127.0.0.1:5000 and select the
# experiment that subsequent runs are logged under.
# NOTE(review): presumably requires a tracking server already running locally — confirm.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="USE+MLP")

2025/05/18 19:14:34 INFO mlflow.tracking.fluent: Experiment with name 'USE+MLP' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/770262264377545605', creation_time=1747584874995, experiment_id='770262264377545605', last_update_time=1747584874995, lifecycle_stage='active', name='USE+MLP', tags={}>

In [41]:
# Hyperparameters of this MLP run; the same values are logged to MLflow below.
hls = (256,32)
activation = 'relu'
max_iter = 30

# Use the run as a context manager so it is always closed, even when training
# or scoring raises or the cell is interrupted. A run left active makes the
# next mlflow.start_run() fail until mlflow.end_run() is called manually.
with mlflow.start_run():
    # Log the configuration first so a failed run still records what was tried.
    mlflow.log_params({
        'hidden_layer_sizes': hls,
        'activation': activation,
        'max_iter': max_iter,
    })

    NN = MLPClassifier(hidden_layer_sizes=hls, activation=activation, solver='adam', max_iter=max_iter, random_state=42)
    NN.fit(x_train_emb, y_train)

    nn_y_pred = NN.predict(x_test_emb)

    # Macro-averaged F1 on the test set, stored under the metric name 'macro_score'.
    nn_score = f1_score(y_test, nn_y_pred, average='macro')
    mlflow.log_metric('macro_score', nn_score)

🏃 View run kindly-deer-844 at: http://127.0.0.1:5000/#/experiments/770262264377545605/runs/88331ecf18c74ed8baf2d34468522c80
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/770262264377545605
