# Step 0: Import everything we need

In [1]:
# The Basics
import pandas as pd
import numpy
from pprint import pprint

In [3]:
# The Processing
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is used;
# the log output below shows the call does nothing further when the package
# is already up to date.
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JSCHNEE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# The Model
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel

# Step 1: Loading and Cleaning

In [7]:
# The dataset is stored as JSON Lines (one record per line), hence lines=True.
dataset_path = "News_Category_Dataset_v3.json"
df = pd.read_json(dataset_path, lines=True)

# Peek at the first rows to sanity-check the columns.
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [15]:
# Number of distinct category labels; reused below as the LDA topic count.
unique_categories = len(df["category"].unique())

In [17]:
# Keep only the text column; from here on `df` is a Series of descriptions.
# NOTE(review): because `df` is rebound from DataFrame to Series, re-running
# this cell without re-running the load cell raises a KeyError.
df = df["short_description"]

# Strip basic punctuation. The pattern is a raw string: in a plain literal
# "\." is an invalid escape sequence that newer Python versions warn about,
# and inside a character class the dot needs no escaping anyway.
df = df.map(lambda x: re.sub(r"[,.!?]", "", x))
# Lower-case everything so capitalisation does not create distinct tokens.
df = df.map(lambda x: x.lower())

df.head()

0    health experts said it is too early to predict...
1    he was subdued by passengers and crew when he ...
2    "until you have a dog you don't understand wha...
3    "accidentally put grown-up toothpaste on my to...
4    amy cooper accused investment firm franklin te...
Name: short_description, dtype: object

In [19]:
def preprocess_data(documents):
    """Tokenise each document and drop English stop words.

    Args:
        documents: Iterable of documents. Every item is passed through
            ``str()`` before tokenising, so non-string entries are tolerated.

    Returns:
        A list with one entry per document; each entry is the list of tokens
        produced by ``simple_preprocess`` that are not in NLTK's English
        stop-word list.
    """
    # A set gives constant-time membership tests; the plain list returned by
    # stopwords.words() would be scanned linearly for every single token.
    stop_words = frozenset(stopwords.words("english"))
    return [
        [token for token in simple_preprocess(str(doc)) if token not in stop_words]
        for doc in documents
    ]

# Tokenise the cleaned descriptions; `df` is the short_description Series here.
processed_texts = preprocess_data(df)

# Step 2: Create a Dictionary and Corpus
Dictionary: a mapping between each unique word and its integer ID
<br>Corpus: the list of documents, each represented as a "bag of words" (pairs of word ID and word count)

In [21]:
# Build the word <-> id mapping from the tokenised documents.
id2word = corpora.Dictionary(processed_texts)

# Convert every tokenised document to its bag-of-words representation.
corpus = list(map(id2word.doc2bow, processed_texts))

# Step 3: Train the LDA Model

In [23]:
# Train LDA with one topic per news category found in the dataset.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=unique_categories,
    passes=10,
    alpha="auto",
    per_word_topics=True,
    random_state=42,  # fixed seed so repeated runs are comparable
)

In [None]:
# Score the trained model with the "c_v" coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_texts,
    dictionary=id2word,
    coherence="c_v",
)
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence Score:", coherence_lda)

# Step 4: pyLDAvis

In [None]:
# NOTE(review): these imports sit far from the other import cells at the top
# of the notebook; consider moving them there so dependencies are visible
# up front.
import pyLDAvis
from pyLDAvis import gensim_models
# Switch pyLDAvis to notebook mode — presumably so prepared visualisations
# render inline as cell output; confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis data from the trained model, the bag-of-words corpus
# and the dictionary. `gensim_models` is the submodule imported directly
# above, i.e. the same function as pyLDAvis.gensim_models.prepare.
visualisation = gensim_models.prepare(lda_model, corpus, id2word)

In [None]:
# Bare last expression: the notebook displays the prepared visualisation.
visualisation