In [1]:
# Step 1: Import necessary libraries
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
# Step 2: Load the dataset
# Specify dtype for relevant columns to avoid warnings
dtype = {'abstract': str}  # You can specify more columns if needed
df = pd.read_csv('nyt-metadata.csv', dtype=dtype, usecols=['abstract'])

In [3]:
# Step 3: Inspect the dataset
print("Sample rows")
print(df.head())

Sample rows
                                            abstract
0  Article on upcoming New York Giants-Dallas Cow...
1  Jeanne C Pond letter expresses hope that spiri...
2  Many experts on Y2K computer problem report th...
3  WILL the forces of globalism continue to push ...
4   SPECIAL TODAY  The Millennium  Envisioning th...


In [4]:
print("Column Names")
print(df.columns)

Column Names
Index(['abstract'], dtype='object')


In [5]:
# Check for missing values in the 'abstract' column
print(df['abstract'].isna().sum())

# Drop rows with missing values in the text column
df = df.dropna(subset=['abstract'])

31917


In [6]:
print("current no. of rows in df is: ",len(df))

current no. of rows in df is:  2155709


In [7]:
# Step 4: Download stopwords and lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Define stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
# Step 5: Preprocess the text
def preprocess(text):
    tokens = simple_preprocess(text, deacc=True)  # Tokenization and normalization
    tokens = [token for token in tokens if token not in stop_words]  # Stop word removal
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization
    return tokens

# Apply preprocessing to each document
processed_docs = [preprocess(doc) for doc in df['abstract']]

In [12]:
# Print processed documents (first 5)
print("\nProcessed Documents:")
for i, doc in enumerate(processed_docs[:5]):
    print(f"Document {i + 1}: {doc}")


Processed Documents:
Document 1: ['article', 'upcoming', 'new', 'york', 'giant', 'dallas', 'cowboy', 'game', 'photo']
Document 2: ['jeanne', 'pond', 'letter', 'express', 'hope', 'spiritual', 'development', 'artistic', 'knowledge', 'skill', 'self', 'esteem', 'flourish', 'new', 'century', 'drawing']
Document 3: ['many', 'expert', 'computer', 'problem', 'report', 'internet', 'performed', 'impressively', 'rollover', 'even', 'sag', 'time', 'isolated', 'site', 'user', 'turned', 'mail', 'message', 'web', 'site', 'newsgroups', 'electronic', 'chat', 'room', 'track', 'arrival', 'year', 'time', 'zone']
Document 4: ['force', 'globalism', 'continue', 'push', 'world', 'toward', 'american', 'style', 'capitalism', 'st', 'century', 'begin', 'advocate', 'free', 'market', 'doubt', 'economic', 'argument', 'socialism', 'dead', 'moreover', 'mean', 'creating', 'wealth', 'material', 'progress', 'american', 'capitalism', 'seems', 'clearly', 'superior', 'asian', 'variety', 'greater', 'level', 'government', 'pl

In [13]:
# Step 6: Create a dictionary and corpus
dictionary = Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [15]:
# Display dictionary
#print("\nDictionary:")
#print(dictionary.token2id)
# Print the number of unique tokens in the dictionary
print(f"Number of unique tokens in the dictionary: {len(dictionary)}")

Number of unique tokens in the dictionary: 370663


In [16]:
# Display corpus
print("\nCorpus:")
for doc_id, doc in enumerate(corpus[:5]):
    print(f"Document {doc_id + 1}: {doc}")


Corpus:
Document 1: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
Document 2: [(5, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]
Document 3: [(24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]
Document 4: [(10, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1)]
Document 5: [(28, 1), (44, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (

In [19]:
# Step 7: Build the LDA model
num_topics = 15  # You can adjust the number of topics
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

KeyboardInterrupt: 

In [None]:
# Step 8: Topic-word distribution
print("\nTopic-Word Distribution:")
topics = lda_model.show_topics(formatted=False)
topic_word_dist = {}
for topic_num, words in topics:
    topic_word_dist[topic_num] = {word: prob for word, prob in words}
    print(f"Topic {topic_num}: {topic_word_dist[topic_num]}")

In [None]:
# Step 9: Document-topic distribution
print("\nDocument-Topic Distribution:")
doc_topic_dist = []
for i, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    doc_topic_dist.append(doc_topics)
    print(f"Document {i}: {doc_topics}")

In [None]:
# Step 10: Conditional probabilities for each word in each document
print("\nConditional Probabilities for each word in each document:")
conditional_probs = []
for i, doc in enumerate(corpus):
    doc_conditional_probs = []
    for word_id, freq in doc:
        word = dictionary[word_id]
        word_conditional_probs = []
        for topic_num in range(num_topics):
            prob = lda_model.get_term_topics(word_id, minimum_probability=0)[topic_num][1]
            word_conditional_probs.append((topic_num, prob))
        doc_conditional_probs.append((word, word_conditional_probs))
    conditional_probs.append(doc_conditional_probs)
    print(f"Document {i}: {doc_conditional_probs}")

In [None]:
# Step 11: Topic distribution for each document
print("\nTopic Distribution for each document:")
for i, doc in enumerate(corpus):
    doc_topic_dist = lda_model.get_document_topics(doc)
    print(f"Document {i}: {doc_topic_dist}")

# Step 12: Display the results in a tabular format using pandas
# Topic-word distribution
topic_word_df = pd.DataFrame(topic_word_dist).fillna(0)
print("\nTopic-Word Distribution (Tabular):")
print(topic_word_df)

In [None]:
# Document-topic distribution
doc_topic_df = pd.DataFrame([[prob for _, prob in lda_model.get_document_topics(doc, minimum_probability=0)] for doc in corpus])
doc_topic_df.columns = [f'Topic {i}' for i in range(num_topics)]
print("\nDocument-Topic Distribution (Tabular):")
print(doc_topic_df)

In [None]:
# Conditional probabilities for each word in each document
conditional_probs_df = []
for i, doc in enumerate(conditional_probs):
    for word, probs in doc:
        for topic_num, prob in probs:
            conditional_probs_df.append((f'Doc {i}', word, f'Topic {topic_num}', prob))
conditional_probs_df = pd.DataFrame(conditional_probs_df, columns=['Document', 'Word', 'Topic', 'Probability'])
print("\nConditional Probabilities (Tabular):")
print(conditional_probs_df)

In [None]:
# Step 13: Visualize the topics using pyLDAvis
# Prepare the pyLDAvis data
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

In [None]:
# Save the visualization to an HTML file
pyLDAvis.save_html(vis_data, 'nyt_lda_visualization.html')

In [None]:
# Display the visualization in the notebook
pyLDAvis.display(vis_data)