In [None]:
import pandas as pd
import json
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files (only the first time)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')
# Load JSON data into a DataFrame
with open('C:/Users/w065pxg/Desktop/ML/Tanvi Banerjee 2024 MI/Final Project/StudentResearch/RedditData.json', 'r', encoding='latin-1') as file:
    data = json.load(file)

# Flatten the nested JSON and extract the 'text_body' for each post
posts = []

# Loop through the nested 'flair' data structure to extract 'text_body'
for flair_category in data.get("flair", {}):
    for post_id in data["flair"][flair_category]:
        post = data["flair"][flair_category][post_id]
        # Extract relevant data: 'title', 'author', and 'text_body'
        posts.append({
            'title': post.get('title', ''),
            'author': post.get('author', ''),
            'text_body': post.get('text_body', ''),
            'num_comments': post.get('num_comments', 0),
        })

# Convert extracted data into a DataFrame
df = pd.DataFrame(posts)

# Check the DataFrame structure to ensure proper extraction
print(df.head())

# Ensure all values in 'text_body' are strings and handle NaN values
df['text_body'] = df['text_body'].astype(str).fillna('')

# Define the stopword list
stop_words = set(stopwords.words('english'))

# Define text preprocessing function
def preprocess(text):
    if not isinstance(text, str):  # Ensure text is a string
        return []  # Return empty list if not a string
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and non-alphabetic tokens
    return tokens
# Apply preprocessing to the 'text_body' column
df['processed'] = df['text_body'].apply(preprocess)

# print(df[['title', 'processed']].head())  # Show processed data

# Create dictionary and corpus for LDA modeling
dictionary = corpora.Dictionary(df['processed'])
corpus = [dictionary.doc2bow(text) for text in df['processed']]

# Train LDA model
num_topics = 10  # Define number of topics
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

# Display topics
print("\nLDA Topics:")
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx+1}: {topic}")

# Assign the dominant topic for each document
def get_dominant_topic(doc):
    topic_distribution = lda_model.get_document_topics(doc)
    if topic_distribution:
        return max(topic_distribution, key=lambda x: x[1])[0]  # Return the topic with the highest probability
    return None

df['dominant_topic'] = [get_dominant_topic(doc) for doc in corpus]

# Show DataFrame with topics assigned
print(df[['title', 'dominant_topic']].head())

# Visualize the topics using pyLDAvis
lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_visualization)



                                               title                author  \
0                  Would you pay to talk to someone?              Lie_Hour   
1                         I'm tired of feeling alone   Delicious_Guava1577   
2  Finding myself so alone into my late 30s is li...     Strange_Lake_2418   
3                         Iâm tired of being alone  Electrical-Pace-1141   
4  I don't want anything, I just want someone who...       BothProduce9986   

                                           text_body  num_comments  
0  Exactly that. Would/have you pay(paid) to talk...             7  
1  I met with my therapist today and she pointed ...             1  
2  I just feel like I'm so alone. I have lost the...             3  
3                             It just hurts so much              1  
4  I feel like I'm alone in this world, as if I'm...             1  

LDA Topics:
Topic 1: 0.021*"friends" + 0.019*"like" + 0.013*"someone" + 0.012*"feel" + 0.011*"people"
Topic 2: 0.020

In [10]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora, models
from gensim.models import CoherenceModel

# Create a dictionary and corpus
dictionary = corpora.Dictionary(df['processed'])  
corpus = [dictionary.doc2bow(text) for text in df['processed']]  

# Function to find the best LDA model
def train_lda(num_topics, passes, alpha='auto', eta='auto'):
    lda_model = models.LdaModel(
        corpus=corpus, 
        num_topics=num_topics, 
        id2word=dictionary, 
        passes=passes, 
        alpha=alpha, 
        eta=eta
    )
    # Compute coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=df['processed'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return lda_model, coherence_score

# Try different topic numbers and choose the best
best_score = -1
best_model = None
best_num_topics = None

for num_topics in range(2, 6):  # Testing topics from 2 to 5
    model, score = train_lda(num_topics, passes=10)
    print(f'Num Topics: {num_topics}, Coherence Score: {score}')
    if score > best_score:
        best_score = score
        best_model = model
        best_num_topics = num_topics

print(f'Best Model: {best_num_topics} topics with Coherence Score: {best_score}')

# Assign dominant topic to each document
df['dominant_topic'] = [max(best_model[doc], key=lambda x: x[1])[0] for doc in corpus]

# Visualize the best LDA model
lda_visualization = gensimvis.prepare(best_model, corpus, dictionary)
pyLDAvis.display(lda_visualization)


Num Topics: 2, Coherence Score: 0.3819057199003761
Num Topics: 3, Coherence Score: 0.38114343324820094
Num Topics: 4, Coherence Score: 0.3793832699638151
Num Topics: 5, Coherence Score: 0.3517828110427475
Best Model: 2 topics with Coherence Score: 0.3819057199003761
