In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Predict and evaluate
nb_predictions = nb_classifier.predict(X_test_vec)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print(f"Naive Bayes Accuracy: {nb_accuracy}")

# Train a K-Means clustering model
kmeans = KMeans(n_clusters=20, random_state=42)
kmeans.fit(X_train_vec.toarray())

# Predict and evaluate
kmeans_predictions = kmeans.predict(X_test_vec.toarray())
kmeans_accuracy = accuracy_score(y_test, kmeans_predictions)
print(f"K-Means Accuracy: {kmeans_accuracy}")

# Tokenize and pad the text data for RNN
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Define the RNN model
rnn_model = Sequential()
rnn_model.add(Embedding(10000, 128))
rnn_model.add(LSTM(128))
rnn_model.add(Dense(20, activation='softmax'))

# Compile and train the RNN model
rnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.fit(X_train_pad, y_train, epochs=5, validation_data=(X_test_pad, y_test))

# Note: Implementing LSTM and Transformer models is similar to the RNN model, but requires more detailed setup and training.
# For LSTM, replace LSTM with GRU in the model definition.
# For Transformer, consider using the transformers library by Hugging Face for pre-trained models or implementing from scratch.

Naive Bayes Accuracy: 0.5419098143236074
K-Means Accuracy: 0.05596816976127321
Epoch 1/5
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 134ms/step - accuracy: 0.1205 - loss: 2.8126 - val_accuracy: 0.2910 - val_loss: 2.0953
Epoch 2/5
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11434s[0m 24s/step - accuracy: 0.3700 - loss: 1.8691 - val_accuracy: 0.3785 - val_loss: 1.9719
Epoch 3/5
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 116ms/step - accuracy: 0.5732 - loss: 1.2917 - val_accuracy: 0.5239 - val_loss: 1.5383
Epoch 4/5
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 120ms/step - accuracy: 0.7205 - loss: 0.8788 - val_accuracy: 0.5668 - val_loss: 1.4726
Epoch 5/5
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 102ms/step - accuracy: 0.8133 - loss: 0.6222 - val_accuracy: 0.5968 - val_loss: 1.4665


<keras.src.callbacks.history.History at 0x1510fdf8980>