In [1]:
import pandas as pd
import os
import string
from sklearn.model_selection import train_test_split

# Import Gensim for corpus and model
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import NLTK for stopwords
import nltk
from nltk.corpus import stopwords

In [2]:
# Reading the BBC news dataset
data_folder=".../bbc"

folders=["business","entertainment","politics","sport","tech"]
x=[]
y=[]


for i in folders:
    files=os.listdir(data_folder+'/'+i)
    for text_file in files:
        file_path=data_folder + '/'+i+'/'+text_file
        with open(file_path,'rb') as f:
            data=f.read()
        x.append(data)
        y.append(i)
        
data={'text':x,'type':y}
df = pd.DataFrame(data)

In [3]:
# Use gensim's simple preprocess to tokenize words from sentences
def tokenize(sentences, deacc=True):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

In [4]:
# Get stopwords from NLTK's model of english stopwords
stop_words = stopwords.words('english')

# Def function for removing stopwords in each tokenized text body
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [5]:
# Separate data and tokenize body texts
data = df['text'].values.tolist()
data_words = list(tokenize(data))

In [6]:
# Filter stopwords from tokenized texts
data_no_stopwords = remove_stopwords(data_words)

In [7]:
# Build dictionary from POS tagged data
dictionary = corpora.Dictionary(data_no_stopwords)
len(dictionary)

29656

*Note*

There should be 27,676 words indexed using the gensim corpora. However, I am getting 29,656 indexed words in the dictionary after removing stopwords. When using lemmatization used in the guide, this reduces to 18,241 indexed words. this might be due to updates with the nltk's model of english stopwords.

In [8]:
# Build corpus using lemmatized texts
texts = data_no_stopwords
corpus = [dictionary.doc2bow(text) for text in texts]

In [10]:
# Gather labels as a list for passing through train-test split
labels = df['type'].values.tolist()
len(labels)

2225

In [11]:
# Run train test split
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

In [12]:
# Train model using train-test split
LDA_model = gensim.models.ldamodel.LdaModel(corpus=X_train,
                                            id2word=dictionary,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")
                                            #per_word_topics=True)

In [13]:
# Get coherence score
coherence_model_lda = CoherenceModel(model=LDA_model, texts=data_no_stopwords, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(coherence_lda)

0.4573562264949323
