In [1]:
!pip install tensorflow



In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Загрузка данных
categories = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
newsgroups_data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(newsgroups_data.data, newsgroups_data.target, test_size=0.3, random_state=42)

# Просмотр примеров данных
len(X_train), len(X_test), X_train[:2], y_train[:2]


(2370,
 1017,
 ['Actually if Mr X had something to gain by his claims his\naccount of the events would nmot be the most respected. Case\nand point, the resurrection. By claiming that the resurrection\nactually happened the early preachers were able to convert many\nto Christianity. However, if you read Mathew 27:38 (?) and the\ncase for the resurrected saints who walked around Jerusalem and\nappeared to "many People" you would realize that other\nhistorians (Josephus for one) would have reported on it all if\nit happened. The fact that the Bible speaks of events of such\ngreat magnitude that they would have been noticed taken with\nthe fact that they are not reported on by historians could only\nmean that the bible contains many made up stories.\n',
  "\nI don't think such tools exist either. In addition, there's no such\nthing as objective information. All together, it looks like religion\nand any doctrines could be freely misused to whatever purpose.\n\nThis all reminds me of Descart

### Токенизация текста

In [3]:
# Максимальная длина последовательности
max_length = 200

# Инициализация токенизатора
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Преобразование текстов в числовые последовательности
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Применение паддинга
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

# Просмотр размера преобразованных данных
X_train_padded.shape, X_test_padded.shape

((2370, 200), (1017, 200))

### Построение архитектуры свёрточной нейронной сети

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Параметры модели
vocab_size = 10000
embedding_dim = 128
num_classes = len(categories)

# Создание модели
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')  # Для многоклассовой классификации
])

# Компиляция модели
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Просмотр структуры модели
model.summary()



### Обучение модели и оценка её метрик

In [5]:
# Обучение модели
history = model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=32,
    verbose=2
)

# Оценка модели на тестовой выборке
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=0)

# Вывод метрик
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/10
75/75 - 7s - 91ms/step - accuracy: 0.3325 - loss: 1.3415 - val_accuracy: 0.4700 - val_loss: 1.1437
Epoch 2/10
75/75 - 4s - 56ms/step - accuracy: 0.6709 - loss: 0.7417 - val_accuracy: 0.4671 - val_loss: 2.2178
Epoch 3/10
75/75 - 7s - 100ms/step - accuracy: 0.8515 - loss: 0.3806 - val_accuracy: 0.7325 - val_loss: 0.6031
Epoch 4/10
75/75 - 8s - 100ms/step - accuracy: 0.9473 - loss: 0.1518 - val_accuracy: 0.7286 - val_loss: 0.7336
Epoch 5/10
75/75 - 7s - 90ms/step - accuracy: 0.9709 - loss: 0.0841 - val_accuracy: 0.7355 - val_loss: 0.7286
Epoch 6/10
75/75 - 4s - 53ms/step - accuracy: 0.9722 - loss: 0.0629 - val_accuracy: 0.7640 - val_loss: 0.7262
Epoch 7/10
75/75 - 5s - 71ms/step - accuracy: 0.9751 - loss: 0.0536 - val_accuracy: 0.7571 - val_loss: 0.7813
Epoch 8/10
75/75 - 9s - 119ms/step - accuracy: 0.9743 - loss: 0.0508 - val_accuracy: 0.7552 - val_loss: 0.8401
Epoch 9/10
75/75 - 6s - 75ms/step - accuracy: 0.9755 - loss: 0.0478 - val_accuracy: 0.7522 - val_loss: 0.9005
Epoch 1

### Дополнительный анализ метрик

In [6]:
from sklearn.metrics import classification_report

# Предсказания модели
y_pred = model.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

# Отчёт классификации
report = classification_report(y_test, y_pred_classes, target_names=categories)
print(report)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
                    precision    recall  f1-score   support

       alt.atheism       0.68      0.65      0.66       224
     comp.graphics       0.89      0.90      0.89       297
         sci.space       0.76      0.84      0.80       307
talk.religion.misc       0.62      0.54      0.58       189

          accuracy                           0.76      1017
         macro avg       0.74      0.73      0.73      1017
      weighted avg       0.75      0.76      0.75      1017

