# **TopicModelling Medium Headlines**

# **Data: Medium Articles**

In [1]:
# check if we are using google colab
from pathlib import Path
import textwrap
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount("/content/drive")
    !pip install datasets transformers bertopic umap hdbscan  openai -U -qq

    base_folder = Path("/content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW8")

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.9/221.9 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     

In [None]:
data_path = base_folder/'titles_cleaned.csv'
model_folder = base_folder/'Models'
model_folder.mkdir(exist_ok=True, parents=True)


# **Task1: Data Preparation:**

In [None]:
import torch
from datasets import load_dataset, Dataset, load_from_disk
import pandas as pd

torch.cuda.is_available()

True

In [None]:
df = pd.read_csv(data_path)
dataset = Dataset.from_pandas(df)
# Save the dataset
dataset.save_to_disk(base_folder/'medium_titles')
# Load the dataset
dataset = load_from_disk(base_folder/'medium_titles')

Saving the dataset (0/1 shards):   0%|          | 0/28383 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['Unnamed: 0', 'url', 'title', 'claps', 'responses', 'readTime', '$oid', '$date', 'datetime', 'year', 'month', 'date', 'hour', 'minute', 'second'],
    num_rows: 28383
})

In [None]:
dataset.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'url': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'claps': Value(dtype='float64', id=None),
 'responses': Value(dtype='float64', id=None),
 'readTime': Value(dtype='string', id=None),
 '$oid': Value(dtype='string', id=None),
 '$date': Value(dtype='string', id=None),
 'datetime': Value(dtype='string', id=None),
 'year': Value(dtype='float64', id=None),
 'month': Value(dtype='float64', id=None),
 'date': Value(dtype='float64', id=None),
 'hour': Value(dtype='float64', id=None),
 'minute': Value(dtype='float64', id=None),
 'second': Value(dtype='float64', id=None)}

# **Extract Meta data**

In [None]:
titles = dataset["title"]

# **Get Sentence Embeddings and Save them on the disk**

In [None]:
from sentence_transformers import SentenceTransformer
import joblib

# # We load our model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# # The titles are converted to vector representations
# embeddings = embedding_model.encode(titles)
# joblib.dump(embeddings,model_folder/'medium_nlp_title_embeddings')

embeddings_medium_titles = joblib.load(model_folder/'medium_nlp_title_embeddings')

embeddings_medium_titles.shape

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

(28383, 768)

# **Topic Modeling**

# **Task2: Initial BERTopic Analysis**

- **Use SentenceBERT for creating embeddings. Save these embeddings for subsequent analysis**
- **Use UMAP for dimensionality reduction**
- **Use HDBSCAN for clustering**
- **Initially employ c-TF-IDF to understand topic representation**


In [None]:
# Default pipeline can be implemented in three lines of code
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
topic_model = BERTopic()

In [None]:
# Improvement1 : we already have saved  embeddings, so we can use them to speed up the process

# improvement 2: add random_state to make the results reproducible
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# improvement 3: add min_cluster_size to remove small topics
# we earlier used 15, let us try 40
hdbscan_model = HDBSCAN(min_cluster_size=40, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# improvement 4: Enhance topic representation by removing stopwords, infrequent words, and using bigrams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [None]:
# Creating the BERTopic model with embedding model

# topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model)
# topics, probs = topic_model.fit_transform(documents=abstracts)

# Creating the BERTopic model without embedding model
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model,
                       hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model,
                       calculate_probabilities=True)
topics, probs = topic_model.fit_transform(embeddings=embeddings_medium_titles, documents=titles)

- **Analyze the generated topics for coherence and relevance**

In [None]:
topic_model.get_topic_info().head(20)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10741,-1_medium_data_using_ai,"[medium, data, using, ai, life, make, learning...",[Understanding Self-Organising Map Neural Netw...
1,0,621,0_design_ux_collective_ux collective,"[design, ux, collective, ux collective, ui, de...","[Why so many Garamonds? - UX Collective, What ..."
2,1,577,1_seo_search_website_search engine,"[seo, search, website, search engine, engine, ...",[SEO Guide: To Show You Path for SEO - Chandni...
3,2,418,2_startup_founders_venture_entrepreneur,"[startup, founders, venture, entrepreneur, sta...",[Is Your Startup a Good Fit for Venture Capita...
4,3,412,3_instagram_followers_reels_medium instagram,"[instagram, followers, reels, medium instagram...",[How To Make The Best Out Of Your Time On Inst...
5,4,381,4_medium_insights medium_jessica_blue,"[medium, insights medium, jessica, blue, holid...","[If You Try To Do Everything, You Won’t Do Any..."
6,5,379,5_data_data engineering_data science_science,"[data, data engineering, data science, science...",[From Data Engineering to Prompt Engineering -...
7,6,364,6_linkedin_profile_leads_recruiters,"[linkedin, profile, leads, recruiters, using l...",[5 Tools for LinkedIn Marketing - Sangita Cham...
8,7,340,7_brand_branding_identity_brand identity,"[brand, branding, identity, brand identity, lo...","[The Social Brand, Brand, Branding and Brand I..."
9,8,337,8_b2b_sales_marketing_b2b sales,"[b2b, sales, marketing, b2b sales, b2b marketi...","[UX Design for the B2B Sales Funnel, The B2B ..."


In [None]:
topic_model.get_topic(0)

[('ux', 0.06592439145786302),
 ('design', 0.06549051932629224),
 ('collective', 0.040765707518791376),
 ('ux collective', 0.04035778648216826),
 ('ui', 0.033589478862998776),
 ('designer', 0.025541471422365476),
 ('ux design', 0.025460751456712474),
 ('designers', 0.02397647160984639),
 ('ui ux', 0.020800567975427755),
 ('case study', 0.02054463265617737)]

**Find Topics related to a keyword**

In [None]:
import numpy as np
embedding_model = SentenceTransformer('all-mpnet-base-v2')
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_topics(embedding_model, topic_model, search_term, top_n=5):
    # Get sorted list of topic IDs from BERTopic model
    topic_list = list(topic_model.topic_representations_.keys())
    topic_list.sort()

    # Generate search term embedding
    search_term_embedding = embedding_model.encode([search_term]).flatten()

    # Compute cosine similarity
    similarities = cosine_similarity(search_term_embedding.reshape(1, -1), topic_model.topic_embeddings_).flatten()

    # Find top n similar topics
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    similar_topics = [topic_list[i] for i in top_indices]
    similarity_scores = [similarities[i] for i in top_indices]


    return similar_topics, similarity_scores

In [None]:
# Usage
similar_topics, similarity_scores = find_similar_topics(embedding_model, topic_model, "artificial intelligence", top_n=5)
similar_topics, similarity_scores

([90, 78, 83, 98, 128],
 [0.63014567, 0.6127476, 0.55421734, 0.5410559, 0.5394019])

In [None]:
print(topic_model.get_topic(90))
print("\n Representative docs of this cluster or topic are :{}".format(topic_model.get_representative_docs(90)))

[('ai', 0.07927404546479011), ('intelligence', 0.07351708080175273), ('artificial', 0.05055009913600644), ('artificial intelligence', 0.05022671053396143), ('replaced', 0.049804916122579035), ('replaced ai', 0.04382326386691487), ('ai changing', 0.04382326386691487), ('work', 0.03977152123819041), ('human', 0.03570632718366472), ('crisis', 0.032743371850298464)]

 Representative docs of this cluster or topic are :['AI Is Changing What Intelligence Means: Here’s How To Become Valuable', 'AI Is Changing What Intelligence Means: Here’s How To Become Valuable', 'AI Is Changing What Intelligence Means: Here’s How To Become Valuable']


In [None]:
topic_model.get_topic(78)

[('ai tools', 0.15487077505129165),
 ('tools', 0.11114731094627596),
 ('ai', 0.07058373287999906),
 ('free ai', 0.06435581643157848),
 ('productivity', 0.03936979969289151),
 ('best ai', 0.032158154168304975),
 ('free', 0.02867736591257012),
 ('artificial', 0.02689538964889113),
 ('tools try', 0.024107035086339542),
 ('try ai', 0.024107035086339542)]

In [None]:
topic_model.get_representative_docs(78)

['5 Free AI Tools that You Must Try',
 '5 AI Tools That Will Supercharge Your Productivity',
 'Get More Done in Less Time with These 16 Free AI Tools']

In [None]:
topic_model.get_topic(83)

[('generative ai', 0.24844392421628927),
 ('generative', 0.24017081059765574),
 ('medium generative', 0.08588469916426494),
 ('ai', 0.07439676259413729),
 ('ai generative', 0.06551033356676622),
 ('ai change', 0.03184642446966135),
 ('landscape', 0.029085244036260807),
 ('future', 0.025122344345165915),
 ('change 2023', 0.024804346018589038),
 ('like midjourney', 0.024804346018589038)]

In [None]:
topic_model.get_representative_docs(83)

['Where are the opportunities for new startups in generative AI?',
 'A Beginner’s Guide to Deciphering Generative AI',
 'Generative AI: Time for Scrutiny']

In [None]:
topic_model.get_topic(98)

[('ai', 0.059385404900552254),
 ('ai marketing', 0.05920989785186091),
 ('marketing', 0.03678521619365308),
 ('ai tools', 0.0320377925680739),
 ('tools', 0.03050969179698731),
 ('marketing tools', 0.025932903056287707),
 ('generic', 0.025375670507940394),
 ('transforming', 0.024719472521358317),
 ('intelligence', 0.023007856620342753),
 ('era', 0.022840327929054105)]

In [None]:
topic_model.get_representative_docs(98)

['How to Stand Out as a Marketer in the Era of Generic AI Marketing?',
 'How to Stand Out as a Marketer in the Era of Generic AI Marketing?',
 'How to Stand Out as a Marketer in the Era of Generic AI Marketing?']

In [None]:
topic_model.get_topic(128)

[('ai ux', 0.07392206845247519),
 ('designers', 0.07041465895312615),
 ('ai design', 0.06466564718064363),
 ('ux', 0.059656881330170705),
 ('design', 0.057542244600205),
 ('ai', 0.05469507101800757),
 ('10 ai', 0.0536385941552412),
 ('ux design', 0.04971193137230897),
 ('opportunities challenges', 0.04796785096467664),
 ('design opportunities', 0.04796785096467664)]

In [None]:
topic_model.get_representative_docs(128)

['The Impact of AI on UX Design: Opportunities and Challenges',
 'The Impact of AI on UX Design: Opportunities and Challenges',
 'The Impact of AI on UX Design: Opportunities and Challenges']

- **Apply Hierarchical Clustering to find subgroups within the topics**

In [None]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
# linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
# hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

hierarchical_topics = topic_model.hierarchical_topics(titles)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 131/131 [00:02<00:00, 60.01it/s]


- **Examine the clusters and subclusters, providing a comprehensive conclusion of the findings**

**Conclusion:**

To sum it all up, for example, the groups labeled 89, 127, and 32 are all about big data tech – they kind of stick together in a neat bundle. On the flip side, clusters 103, 11, and 74 are like a team, especially with subgroups focusing on Google Ads and Facebook Ads. It's pretty obvious they're all into the advertising scene. Simply put, using BERTopic Analysis to figure things out showed that Medium articles are nicely sorted out by common topics in these clusters and subclusters.

In [None]:
# Save original representations
from copy import deepcopy
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
def compare_topic_changes(new_model, original_topic_words, max_length=75, top_n_topics=10):

    """Displays differences in top words of topic representations between the original and new models."""

    for topic_id in range(top_n_topics):
        # Extract top 5 words per topic from the original and new models
        original_top_words = "_".join(word for word, _ in original_topic_words[topic_id][:5])
        new_top_words = "_".join(word for word, _ in new_model.get_topic(topic_id)[:5])

        # Calculate whitespace for alignment
        alignment_spaces = " " * (max_length - len(original_top_words))

        # Print 'before' and 'after' topic word changes
        print(f"Topic: {topic_id}    {original_top_words}{alignment_spaces} >>     {new_top_words}")


# **Task3: KeyBERTInspired**
- **Update the topic representation using the KeyBERTinspired approach**




In [None]:
# KeyBERTInspired
from bertopic.representation import KeyBERTInspired
representation_model = KeyBERTInspired()

# Update our topic representations
topic_model.update_topics(titles, representation_model=representation_model)

- **Compare the topic representation frkim step1 with the KeyBERTinspired approach**

In [None]:
# Show topic differences
compare_topic_changes(topic_model, original_topics)

Topic: 0    design_ux_collective_ux collective_ui                                       >>     designs_design_designing_ux_designers
Topic: 1    seo_search_website_search engine_engine                                     >>     seo_marketing_indexing_keywords_google
Topic: 2    startup_founders_venture_entrepreneur_startups                              >>     startups_startup_investors_founders_entrepreneurship
Topic: 3    instagram_followers_reels_medium instagram_instagram followers              >>     instagram_insta_ig_marketing_media
Topic: 4    medium_insights medium_jessica_blue_holiday medium                          >>     song_jessica_music_wildfire_medium
Topic: 5    data_data engineering_data science_science_data analyst                     >>     data_analyst_analytics_analytical_study
Topic: 6    linkedin_profile_leads_recruiters_using linkedin                            >>     linkedin_recruiters_jobs_marketing_freelance
Topic: 7    brand_branding_identity_brand identity

In [None]:
topic_model.get_topic(0, full=True)

{'Main': [('designs', 0.5236881),
  ('design', 0.5090069),
  ('designing', 0.49625993),
  ('ux', 0.4818933),
  ('designers', 0.47707564),
  ('ui', 0.44034895),
  ('trends', 0.4198845),
  ('innovation', 0.399446),
  ('industry', 0.37858674),
  ('usability', 0.36983)]}

- **Reapply Hierarchical Clustering to discover subgroups under this new representation**

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(titles)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 132/132 [00:20<00:00,  6.59it/s]


**Conclusion:**

If we look at the hierarchial clustering from keyBert Inspired method.For example 102,9,79,108 are all related to ChatGpt and were clustered together. Another example 24,57 are related to crypto currency and are clustered togerther. One outlier I See is the chinese character in cluster 76 which isn't linked to other elements which are based on linkedin and internet. All the subcluster look accurate and hierarchial has done a good job of clustering like topics.  

In [None]:
!sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic

In [3]:
!jupyter nbconvert --to pdf /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW8/Pooja_Akkaladevi_HW_8.ipynb

[NbConvertApp] Converting notebook /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW8/Pooja_Akkaladevi_HW_8.ipynb to pdf
  warn(
  warn(
[NbConvertApp] Writing 89883 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 92944 bytes to /content/drive/MyDrive/Pooja_HP_Singh_Projects/NLP/HW8/Pooja_Akkaladevi_HW_8.pdf
