In [1]:
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import word_tokenize
import re
import string




In [2]:
def preprocess_text(text):
    # Приведение к нижнему регистру
    text = text.lower()
    
    # Удаление HTML-тегов
    text = re.sub(r'<[^>]+>', '', text)
    
    # Удаление цифр
    text = re.sub(r'\d+', '', text)
    
    # Удаление пунктуации
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Токенизация
    tokens = word_tokenize(text)
    
    # Удаление пустых токенов
    tokens = [token for token in tokens if token.strip()]
    
    return tokens

In [3]:
# Загрузка датасета
dataset = load_dataset("ag_news")
train_data = dataset["train"]
test_data = dataset["test"]

Using the latest cached version of the dataset since ag_news couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\R1sed\.cache\huggingface\datasets\ag_news\default\0.0.0\eb185aade064a813bc0b7f42de02595523103ca4 (last modified on Wed Feb 12 18:02:31 2025).


In [29]:
# Подготовка данных
X_train = train_data["text"][:5000]  # Берем подмножество для экономии времени
X_test = test_data["text"][:500]
y_train = train_data["label"][:5000]
y_test = test_data["label"][:500]

In [30]:
X_train_tokenized = [preprocess_text(text) for text in X_train]
X_test_tokenized = [preprocess_text(text) for text in X_test]

In [31]:
print(X_train[0])

print(len(X_train_tokenized[0]))
print(X_train_tokenized[0])

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
20
['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'shortsellers', 'wall', 'streets', 'dwindlingband', 'of', 'ultracynics', 'are', 'seeing', 'green', 'again']


In [32]:
# Загрузка ELMo
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Функция для получения эмбеддингов
def get_elmo_embeddings(texts):
    return elmo.signatures["default"](tf.constant(texts))["default"]

In [33]:
# Эмбеддинги слов
x0 = get_elmo_embeddings(X_train_tokenized[0])
print(x0)

tf.Tensor(
[[ 0.03788491  0.27737054 -0.6391735  ... -0.11237262  0.434619
   0.46560812]
 [ 0.5537806   0.22365248 -0.1768304  ... -0.41267446 -0.05352611
   0.59317315]
 [ 0.25039792 -0.0692119   0.5968957  ...  0.30382812  0.72532606
   0.5239556 ]
 ...
 [ 0.0783847   0.08138861 -0.38448554 ...  0.40558958 -0.01340143
   0.33473802]
 [ 0.0835043   0.01787048  0.11163118 ...  0.5311339   0.13082975
   0.06012309]
 [ 0.04651073  0.399118    0.03651373 ... -0.2595333  -0.17497617
   0.36145782]], shape=(20, 1024), dtype=float32)


In [34]:
print(tf.reduce_sum(x0, axis=0))

tf.Tensor(
[-0.65209734  1.742501    3.2368598  ...  2.409593    3.3246903
  6.907042  ], shape=(1024,), dtype=float32)


In [35]:
# Эмбеддинг текста
get_elmo_embeddings([X_train[0]])

<tf.Tensor: shape=(1, 1024), dtype=float32, numpy=
array([[-0.08138489, -0.22403522,  0.06073161, ..., -0.11337185,
         0.397245  , -0.0678893 ]], dtype=float32)>

In [36]:
# Получение эмбеддингов
def get_elmo_embeddings(texts, batch_size=64):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        emb = elmo.signatures["default"](tf.constant(batch))["default"]
        embeddings.append(emb.numpy())
    return np.concatenate(embeddings)

In [37]:
X_train_elmo = get_elmo_embeddings(X_train)

In [38]:
X_test_elmo = get_elmo_embeddings(X_test)

In [40]:
# Обучение LogisticRegression
model = LogisticRegression(max_iter=500)
model.fit(X_train_elmo, y_train)

# Предсказание и оценка
y_pred = model.predict(X_test_elmo)
f1 = f1_score(y_test, y_pred, average='macro')

print(f"F1-score (ELMo + LogisticRegression): {f1:.4f}") #для 1000 было 0,81

F1-score (ELMo + LogisticRegression): 0.8847


In [59]:
"""
sentences = [
    "I deposited money in the bank.",  # Финансовый "bank"
    "The boat sank near the bank."     # Речной "bank"
]"""

In [62]:
"""# Получение эмбеддингов (включая токенизацию)
outputs = elmo.signatures["default"](tf.constant(sentences))
embeddings = outputs["word_emb"]  # Тензор формы [2, max_length, 1024]

print(embeddings)"""

tf.Tensor(
[[[ 0.69227165 -0.32613114  0.22827524 ...  0.17574832  0.26598704
   -0.10131966]
  [-0.1733603   0.6513147   0.46616495 ... -0.02883309  0.04836473
   -0.08957538]
  [ 0.27699882 -0.3316144   0.00927745 ... -0.26881605 -0.36677992
    0.06983681]
  [-0.25692382  0.4148302  -0.41116154 ... -0.1983385  -0.02607471
    0.16813873]
  [-0.06904853  0.1126154   0.23713255 ...  0.08062498  0.09338938
   -0.18999371]
  [ 0.05600581 -0.03165206 -1.3951159  ... -0.81375486 -0.39511025
   -1.1125754 ]]

 [[-0.32884765  0.20216238 -0.5940115  ...  0.17375435  0.13352706
    0.2505604 ]
  [ 0.05043833 -0.1829463  -0.10682116 ... -0.06665179  0.49769396
    0.11925097]
  [-0.65117186 -0.2416502  -0.36178517 ...  0.14649315  0.4681933
    0.8126171 ]
  [-0.06230947 -0.27355105 -0.66885203 ... -0.4796688  -0.52628994
   -0.17827176]
  [-0.06904853  0.1126154   0.23713255 ...  0.08062498  0.09338938
   -0.18999371]
  [ 0.05600581 -0.03165206 -1.3951159  ... -0.81375486 -0.39511025
   -1.11

In [64]:
# Находим позиции "bank" вручную (если нужно)
"""tokens_list = [
    ["[CLS]"] + sentence.split() + ["[SEP]"] 
    for sentence in sentences
]

print(tokens_list)"""

[['[CLS]', 'I', 'deposited', 'money', 'in', 'the', 'bank.', '[SEP]'], ['[CLS]', 'The', 'boat', 'sank', 'near', 'the', 'bank.', '[SEP]']]


In [67]:
"""positions = [
    tokens_list[i].index("bank.") for i in range(len(sentences))
]
print(positions)"""

[6, 6]


In [83]:
"""# Извлечение эмбеддингов для "bank"
bank_financial = embeddings[0, positions[0] - 1]
bank_river = embeddings[1, positions[1] - 1]"""

In [84]:
"""# Косинусная близость
cosine_sim = tf.reduce_sum(
    tf.nn.l2_normalize(bank_financial, axis=0) * 
    tf.nn.l2_normalize(bank_river, axis=0)
).numpy()

print(f"Косинусная близость: {cosine_sim:.3f}")"""

Косинусная близость: 1.000
