In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Pool

In [None]:

# Load the dataset and keep only articles published on or after 2019-01-01.
# NOTE(review): the date filter is a plain string comparison, so it presumably
# relies on 'publication_date' being stored as ISO-8601 text — confirm.
df = pd.read_csv('newyorktimes.csv')
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning on a filtered slice.
df = df[df['publication_date'] >= '2019-01-01'].copy()

# Handle NaN values so the string concatenation below never produces NaN
df['title'] = df['title'].fillna('')
df['body'] = df['body'].fillna('')
# Drop the rows having no body
df = df[df['body'] != ''].copy()

# Concatenate title and body into the single text field used for modelling
df['title_body'] = df['title'] + ' ' + df['body']

# Preprocessing resources read by clean()
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()





In [None]:
def clean(doc):
    """Normalise one document into a space-separated string of lemmas.

    The text is lower-cased and split on whitespace; tokens found in the
    module-level ``stop`` set are dropped, characters found in ``exclude``
    are removed, and each remaining token is passed through
    ``lemma.lemmatize``.

    Args:
        doc: The raw document text (a ``str``).

    Returns:
        A single string with the cleaned tokens joined by one space.
    """
    # First pass: drop stopwords in their raw form. This is what catches
    # entries that themselves contain punctuation (e.g. contractions).
    raw_tokens = [tok for tok in doc.lower().split() if tok not in stop]

    # Strip punctuation characters from every surviving token.
    stripped_tokens = (
        ''.join(ch for ch in tok if ch not in exclude) for tok in raw_tokens
    )

    # Second pass: a stopword glued to punctuation ("the,", "and.") is only
    # recognisable after stripping, so filter again. Tokens that were made
    # up purely of punctuation are now empty and are dropped as well.
    kept_tokens = [tok for tok in stripped_tokens if tok and tok not in stop]

    return " ".join(lemma.lemmatize(tok) for tok in kept_tokens)



In [None]:
def topic_modeling(num_topics):
    """Train an LDA model with ``num_topics`` topics and return its coherence.

    Reads the module-level ``doc_term_matrix`` (bag-of-words corpus),
    ``dictionary`` and ``doc_clean`` (tokenised texts).

    Args:
        num_topics: Number of topics to request from the LDA model.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained
        model, computed with the ``'c_v'`` measure.
    """
    lda = gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,  # fixed seed so repeated runs are comparable
        chunksize=1000,
        passes=50,
    )
    # processes=1: this function is executed inside multiprocessing.Pool
    # workers, which are daemonic and may not start child processes. Left at
    # its default, CoherenceModel would try to launch its own workers for the
    # 'c_v' estimate and fail with "daemonic processes are not allowed to
    # have children".
    scorer = CoherenceModel(
        model=lda,
        texts=doc_clean,
        dictionary=dictionary,
        coherence='c_v',
        processes=1,
    )
    return scorer.get_coherence()

# Tokenise every document: clean() returns a space-joined string, so split it
# back into a list of tokens.
doc_clean = [clean(doc).split() for doc in tqdm(df['title_body'], desc="Cleaning Data")]

# Build the vocabulary and the bag-of-words corpus from the tokenised texts.
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Number of CPUs to use for parallel processing. Adjust if necessary.
num_cpus = 4

# Candidate topic counts to evaluate; the progress-bar total is derived from
# this range so the two cannot drift apart when the range is edited.
topic_counts = range(100, 201, 10)

# imap yields results in input order, so coherence_values[i] belongs to
# topic_counts[i].
with Pool(num_cpus) as pool:
    coherence_values = list(
        tqdm(
            pool.imap(topic_modeling, topic_counts),
            desc="Topic Modeling",
            total=len(topic_counts),
        )
    )

print(coherence_values)

In [None]:
# Plot the coherence score obtained for each candidate number of topics.
# The label is attached to the line itself rather than passed to legend() as
# a bare string: ("coherence_values") without a trailing comma is a str, not
# a tuple, and legend() would iterate it character by character, labelling
# the line just "c".
plt.plot(range(100, 201, 10), coherence_values, label="coherence_values")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(loc='best')
plt.show()
