In [7]:
import numpy as np

In [1]:
from datasets import load_dataset

dataset = load_dataset("ag_news")
train_texts, test_texts = dataset["train"]["text"], dataset["test"]["text"]
train_labels, test_labels = dataset["train"]["label"], dataset["test"]["label"]

In [10]:
import re
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # 1. Удаляем HTML-теги и спецсимволы
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # 2. Нормализация: нижний регистр + замена чисел
    text = text.lower()
    text = re.sub(r'\d+', 'NUM', text)
    
    # 3. Токенизация без удаления стоп-слов
    tokens = word_tokenize(text, language='english')  # Для русского используйте 'russian'
    return ' '.join(tokens)

# Пример
text = "The <b>cat</b> and 3 dogs ran in the park!"
print(preprocess_text(text)) 

the cat and NUM dogs ran in the park


In [2]:
from sklearn.metrics import f1_score

def f1_macro(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro")

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Предобработка всех текстов
train_texts_clean = [preprocess_text(text) for text in train_texts]
test_texts_clean = [preprocess_text(text) for text in test_texts]

# Векторизация
tfidf = TfidfVectorizer(max_features=10_000)
X_train = tfidf.fit_transform(train_texts_clean)
X_test = tfidf.transform(test_texts_clean)

# Обучение модели
model = LogisticRegression(max_iter=500)
model.fit(X_train, train_labels)

In [12]:
tfidf_predictions = model.predict(X_test)
f1_tfidf = f1_score(test_labels, tfidf_predictions, average="macro")
print(f1_tfidf)

0.9127151097261657


In [14]:
from gensim.models import Word2Vec
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Токенизация текстов
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# Паддинг до одинаковой длины
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Обучение Word2Vec
sentences = [preprocess_text(text).split() for text in train_texts]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=3, workers=4)

# Матрица эмбеддингов
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
# Модель LSTM
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], trainable=False),
    LSTM(128),
    Dense(4, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train_pad, train_labels, epochs=5)

y_pred = model.predict(X_test_pad).argmax(axis=1)
f1_w2v = f1_macro(test_labels, y_pred)

ValueError: Unrecognized data type: x=[[    0     0     0 ...  4049   797   332]
 [    0     0     0 ...     4     1   128]
 [    0     0     0 ...     1  1214 14993]
 ...
 [    0     0     0 ...   346    65   123]
 [    0     0     0 ...    42    16  1666]
 [    0     0     0 ...  2095  3435    72]] (of type <class 'numpy.ndarray'>)

In [None]:
print(f1_w2v)

In [None]:
# Загрузка GloVe (предобученные)
embedding_matrix_glove = np.zeros((vocab_size, 100))
for word, idx in tokenizer.word_index.items():
    if word in glove_embeddings:  # Загруженные заранее
        embedding_matrix_glove[idx] = glove_embeddings[word]

# Та же архитектура, но с GloVe
model = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix_glove], trainable=False),
    LSTM(128),
    Dense(4, activation="softmax")
])
model.fit(X_train_pad, train_labels, epochs=5)

y_pred = model.predict(X_test_pad).argmax(axis=1)
f1_glove = f1_macro(test_labels, y_pred)