In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, CoherenceModel, LdaModel
from gensim.corpora.dictionary import Dictionary
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Fetch the NLTK resources used below (tokenizer models and the stop-word list).
# Newer NLTK releases (3.8.2+) load the word tokenizer from 'punkt_tab' rather
# than 'punkt'; requesting both keeps word_tokenize working on old and new
# versions. nltk.download() reports an unknown package instead of raising.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load the 20 Newsgroups dataset with headers, footers and quoted replies
# removed, so only the body of each post is kept.
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data = pd.DataFrame({'text': newsgroups_data.data, 'target': newsgroups_data.target})

# The frame has two columns: 'text' (the raw post) and 'target' (its label).
# Stop words are held in a set so the membership test in preprocess_text is fast.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw document into a list of lowercase, stop-word-free tokens.

    Steps, in order: collapse every whitespace run to one space, replace each
    non-word character with a space, lowercase the result, tokenize it with
    ``word_tokenize`` and drop tokens found in the module-level ``stop_words``.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving tokens, in their original order.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    cleaned = re.sub(r'\W', ' ', collapsed).lower()

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every document; each entry of 'processed_text' is a list of tokens.
data['processed_text'] = data['text'].apply(preprocess_text)

# Convert processed text into a list of lists of tokens
texts = data['processed_text'].tolist()

# Train Word2Vec model.
# A fixed seed makes the initial vectors repeatable, so the downstream
# classification score is no longer a moving target between runs.
# NOTE: per the gensim documentation, fully deterministic training also needs
# workers=1 and a fixed PYTHONHASHSEED; workers=4 is kept here for speed.
word2vec_model = Word2Vec(
    sentences=texts,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token so any non-empty document can be embedded
    workers=4,
    seed=42,
)

# Generate document vectors
def document_vector(text, model):
    """Average the word vectors of a tokenized document.

    Args:
        text: Iterable of tokens making up one document.
        model: Object exposing a ``wv`` mapping that supports ``word in model.wv``
            and ``model.wv[word]`` (e.g. a trained Word2Vec model).

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv``, or ``np.nan`` when the document has no such token
        (empty document or only unknown words).
    """
    word_vectors = [model.wv[word] for word in text if word in model.wv]
    if not word_vectors:
        # np.mean([]) would also yield NaN, but only after emitting a
        # "Mean of empty slice" RuntimeWarning; return the sentinel explicitly
        # so callers can still filter these rows with dropna()/isna().
        return np.nan
    return np.mean(word_vectors, axis=0)

# Embed each document as the mean of its word vectors, then discard the rows
# for which no vector could be computed (they hold NaN instead of an array).
data['doc_vector'] = data['processed_text'].apply(document_vector, args=(word2vec_model,))
data = data.dropna(subset=['doc_vector'])

# Feature matrix: one row per remaining document; labels come from 'target'.
X = np.vstack(data['doc_vector'])
y = data['target']

# Hold out 20% of the documents for evaluation (fixed seed for a stable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree Random Forest on the training portion.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Score the held-out documents and report plain accuracy.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy}')

# Use LDA to find topics.
# Build the token <-> id mapping and the bag-of-words corpus from the
# tokenized documents.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA model.
# random_state pins the model's random initialisation; without it the topics
# (and therefore the coherence printed below) change from run to run.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Evaluate topic coherence with the 'c_v' measure, computed against the same
# tokenized texts the model was trained on.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Topic Coherence: {coherence_lda}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  return _methods._mean(a, axis=axis, dtype=dtype,


Classification Accuracy: 0.48893744878448514
Topic Coherence: 0.6733247414179756


In [5]:
# Report the topic coherence already computed when the LDA model was evaluated
# in the previous cell. Rebuilding the CoherenceModel here would repeat an
# expensive computation on the same model and texts and print the same number.
print(f'Topic Coherence: {coherence_lda}')

Topic Coherence: 0.6733247414179756
