In [1]:
from datasets import load_dataset
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Fetch the NLTK tokenizer data that word_tokenize relies on. Older NLTK
# releases load the "punkt" package, while recent releases load "punkt_tab"
# instead, so request both to keep a fresh-kernel run working on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Load the AG News dataset and pick out its train/test splits.
dataset = load_dataset("ag_news")
train_data = dataset["train"]
test_data = dataset["test"]

# Lower-case each text and split it into word tokens.
X_train = [word_tokenize(text.lower()) for text in train_data["text"]]
X_test = [word_tokenize(text.lower()) for text in test_data["text"]]

# Class labels, in the same order as the texts.
y_train = train_data["label"]
y_test = test_data["label"]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Word2Vec hyperparameters
vector_size = 100  # dimensionality of each word vector
window = 5         # context window size
min_count = 2      # minimum word frequency

# Train the Word2Vec model on the tokenized training texts
word2vec_model = Word2Vec(
    sentences=X_train,
    workers=4,
    min_count=min_count,
    window=window,
    vector_size=vector_size,
)

In [4]:
def text_to_vector(tokens, model, vector_size):
    """Return the mean embedding of the tokens that `model` knows.

    Args:
        tokens: iterable of word tokens for one text.
        model: object whose `wv` attribute supports `word in model.wv`
            and `model.wv[word]` lookups.
        vector_size: length of the returned vector.

    Returns:
        A float64 NumPy array of length `vector_size`: the average of the
        vectors of all in-vocabulary tokens, or all zeros when no token
        is in the vocabulary.
    """
    # Keep only the tokens that have an embedding.
    embeddings = [model.wv[token] for token in tokens if token in model.wv]

    total = np.zeros(vector_size)
    if not embeddings:
        return total

    # Accumulate one vector at a time into the float64 buffer.
    for embedding in embeddings:
        total += embedding

    # Average so that text length does not affect the vector's scale.
    return total / len(embeddings)

# Turn every tokenized text into its averaged embedding: one row per text.
X_train_vec, X_test_vec = (
    np.array([text_to_vector(tokens, word2vec_model, vector_size) for tokens in corpus])
    for corpus in (X_train, X_test)
)

In [5]:
# Fit a logistic-regression classifier on the averaged embeddings.
model = LogisticRegression(max_iter=500).fit(X_train_vec, y_train)

# Predict on the test split and score with macro-averaged F1.
y_pred = model.predict(X_test_vec)
f1 = f1_score(y_test, y_pred, average='macro')

print(f"F1-score (Word2Vec + LogisticRegression): {f1:.4f}")

F1-score (Word2Vec + LogisticRegression): 0.8772


In [43]:
# Ask the trained embeddings for the five words most similar to the query word.
word = "space"
similar_words = word2vec_model.wv.most_similar(positive=[word], topn=5)

In [44]:
# Print the query word, then each neighbour with its similarity score.
print(f"\nСлово: '{word}'")
print("Топ-5 ближайших:")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score:.3f}")


Слово: 'space'
Топ-5 ближайших:
spacecraft: 0.826
orbit: 0.786
nasa: 0.784
craft: 0.769
shuttle: 0.767
