In [1]:
! pip install pandas gensim spacy nltk matplotlib

Collecting pandas
  Downloading pandas-2.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-1.26.4-cp312-cp31

In [2]:
! pip install --upgrade pip

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.1.1


In [46]:
import pandas as pd
import gensim
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import matplotlib.pyplot as plt
import spacy

In [32]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/aadimprajapati/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/aadimprajapati/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/aadimprajapati/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/aadimprajapati/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/aadimprajapati/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagg

True

In [33]:
data={
    'text':[
        "The government passed a new bill aimed at improving education standards across the country. It includes plans to build new schools and hire more teachers.",
        "Artificial Intelligence is transforming industries from healthcare to finance. Machine learning algorithms help in detecting fraud and diagnosing diseases.",
        "The football championship attracted millions of viewers worldwide. Teams competed in a month-long tournament showcasing incredible skill and sportsmanship.",
        "Climate change continues to be a pressing issue. Rising temperatures and melting ice caps are signs of a rapidly warming planet.",
        "The latest smartphone model comes with a better camera and improved battery life. Consumers are excited about the enhanced user experience.",
        "A recent study showed that a balanced diet and regular exercise significantly reduce the risk of heart disease.",
        "SpaceX launched another batch of satellites into orbit as part of its global internet project, Starlink. This could revolutionize internet access in remote areas.",
        "Cryptocurrencies like Bitcoin and Ethereum are seeing widespread adoption. However, regulations around digital currencies remain uncertain.",
        "A popular cooking show returned for another season. Contestants showcased culinary creativity under time pressure.",
        "Mental health awareness is gaining importance. Schools and workplaces are implementing programs to support emotional well-being."
    ]
}

df = pd.DataFrame(data)

In [38]:
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()

    tokens = word_tokenize(text)
    filtered = [ token for token in tokens if token.isalpha() and token not in stop_words ]

    lemmas = [wnl.lemmatize(token) for token in filtered]

    return lemmas

df['processed_text'] = df['text'].apply(preprocess_text)

In [39]:
import gensim.corpora as corpora

#corpora maps words to their integer ids
#doc2bow converts collection of words to bag of words representation

id_word = corpora.Dictionary(df['processed_text'])
corpus = [id_word.doc2bow(text) for text in df['processed_text']]

<h2>LDA Model</h2>
<b>Syntax</b>: gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf={})

In [40]:
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id_word, num_topics=3, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

<h2>Output Structure</h2>

Each of the output tuple is of form: 
(topic_id, "word1*weight1 + word2*weight2 + ...")

In [41]:
topics = model.print_topics(num_words = 10)

for topic in topics:
    print(topic)

(0, '0.028*"internet" + 0.016*"school" + 0.016*"another" + 0.016*"disease" + 0.016*"balanced" + 0.016*"exercise" + 0.016*"importance" + 0.016*"gaining" + 0.016*"diet" + 0.016*"showed"')
(1, '0.016*"disease" + 0.016*"intelligence" + 0.016*"finance" + 0.016*"industry" + 0.016*"million" + 0.016*"excited" + 0.016*"transforming" + 0.016*"showcasing" + 0.016*"fraud" + 0.016*"machine"')
(2, '0.023*"new" + 0.013*"sign" + 0.013*"standard" + 0.013*"rapidly" + 0.013*"another" + 0.013*"warming" + 0.013*"pressing" + 0.013*"continues" + 0.013*"rising" + 0.013*"passed"')


<h2>Coherence score</h2>

<p>Metric used to evaluate the quality of a topic model by assessing how well the words within a topic relate to each other</p>
<p>It ranges from 0 to 1 in which high coherence is nearer to 1 and low coherence is nearer to 0</p>

In [45]:
from gensim.models import CoherenceModel

# Compute coherence score
coherence_model_lda = CoherenceModel(model=model, texts=df['processed_text'], dictionary=id_word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.2762989029085951


<h2>Refining model and increasing coherence score</h2>

Ways to refine model:
- Tuning num_topics
- adding bigrams/trigrams

In [None]:
# refining model and dataset
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 
    
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    filtered = [token for token in tokens if token.isalpha() and token not in stop_words]
    pos_tags = nltk.pos_tag(filtered)
    lemmas = [wnl.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]
    return lemmas

df['processed_text'] = df['text'].apply(preprocess_text)

In [82]:
import gensim.corpora as corpora

id_word = corpora.Dictionary(df['processed_text'])
corpus = [id_word.doc2bow(text) for text in df['processed_text']]
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id_word, num_topics=3, random_state=100, update_every=0, chunksize=2, passes=20, alpha='auto', per_word_topics=True)

In [83]:
topics = model.print_topics(num_words = 10)
for topic in topics:
    print(topic)

(0, '0.027*"new" + 0.015*"plan" + 0.015*"standard" + 0.015*"education" + 0.015*"build" + 0.015*"aim" + 0.015*"bill" + 0.015*"teacher" + 0.015*"country" + 0.015*"hire"')
(1, '0.014*"show" + 0.014*"disease" + 0.014*"another" + 0.013*"school" + 0.013*"improve" + 0.010*"internet" + 0.008*"ethereum" + 0.008*"widespread" + 0.008*"digital" + 0.008*"uncertain"')
(2, '0.019*"internet" + 0.012*"access" + 0.012*"part" + 0.012*"revolutionize" + 0.012*"starlink" + 0.012*"orbit" + 0.012*"project" + 0.012*"batch" + 0.012*"area" + 0.012*"global"')


In [84]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=model, texts=df['processed_text'], dictionary=id_word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5453902523265233
