In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import gensim
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')

# Load the BBC News dataset
data = pd.read_csv('BBC News Train.csv')

# Assuming the dataset has columns 'text' and 'category'
# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

data['processed_text'] = data['Text'].apply(preprocess_text)

# Convert processed text into a list of lists of tokens
texts = data['processed_text'].tolist()

# Create a dictionary and a corpus for LDA
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Evaluate topic coherence
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Topic Coherence: {coherence_lda}')

# Transform the corpus into topic distributions
def get_document_topics(corpus, lda_model, num_topics):
    topics = []
    for doc in corpus:
        topic_distribution = [0] * num_topics
        for topic, prob in lda_model.get_document_topics(doc):
            topic_distribution[topic] = prob
        topics.append(topic_distribution)
    return topics



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic Coherence: 0.4019149727110219


In [5]:
num_topics = 5
document_topics = get_document_topics(corpus, lda_model, num_topics)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(document_topics, data['Category'], test_size=0.2, random_state=42)

# Train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy}')

Classification Accuracy: 0.7416107382550335
