In [9]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim as gensimvis
import itertools

# Download NLTK data (only required once)
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Load JSON data
with open('C:/Users/w065pxg/Desktop/ML/Tanvi Banerjee 2024 MI/Final Project/StudentResearch/RedditData.json', 'r', encoding='latin-1') as file:
    data = json.load(file)

# Extract relevant data
posts = [
    {
        'title': post.get('title', ''),
        'author': post.get('author', ''),
        'text_body': post.get('text_body', ''),
        'num_comments': post.get('num_comments', 0),
    }
    for flair_category in data.get("flair", {})
    for post_id, post in data["flair"][flair_category].items()
]

# Convert to DataFrame
df = pd.DataFrame(posts)

# Ensure all values in 'text_body' are strings
df['text_body'] = df['text_body'].astype(str).fillna('')

# Define stopword list and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess(text):
    text = text.lower()  # Lowercase
    tokens = word_tokenize(text)  # Tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]  
    return tokens

# Apply preprocessing
df['processed'] = df['text_body'].apply(preprocess)

# Create dictionary and filter extreme words
dictionary = corpora.Dictionary(df['processed'])
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Remove very rare and very common words

# Create corpus
corpus = [dictionary.doc2bow(text) for text in df['processed']]

# **Hyperparameter tuning: Grid Search**
topic_range = [5, 10, 15]  # Number of topics to test
alpha_range = ['symmetric', 'asymmetric']
eta_range = ['symmetric', None]

param_grid = list(itertools.product(topic_range, alpha_range, eta_range))

best_model = None
best_coherence = float('-inf')

for num_topics, alpha, eta in param_grid:
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, alpha=alpha, eta=eta)
    
    coherence_model = CoherenceModel(model=lda_model, texts=df['processed'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    print(f"Topics: {num_topics}, Alpha: {alpha}, Eta: {eta} --> Coherence Score: {coherence_score:.4f}")

    if coherence_score > best_coherence:
        best_model = lda_model
        best_coherence = coherence_score

# Display best hyperparameters
print("\nBest Model Parameters:")
print(f"Topics: {best_model.num_topics}, Alpha: {best_model.alpha}, Eta: {best_model.eta}")
print(f"Best Coherence Score: {best_coherence:.4f}")

# Display topics
print("\nLDA Topics:")
for idx, topic in best_model.print_topics(num_words=5):
    print(f"Topic {idx+1}: {topic}")

# Assign dominant topic for each document
df['dominant_topic'] = [max(best_model.get_document_topics(doc), key=lambda x: x[1])[0] for doc in corpus]

# Display processed DataFrame
print(df[['title', 'dominant_topic']].head())

# Visualize LDA topics
lda_visualization = gensimvis.prepare(best_model, corpus, dictionary)
pyLDAvis.display(lda_visualization)


Topics: 5, Alpha: symmetric, Eta: symmetric --> Coherence Score: 0.3297
Topics: 5, Alpha: symmetric, Eta: None --> Coherence Score: 0.3383
Topics: 5, Alpha: asymmetric, Eta: symmetric --> Coherence Score: 0.3284
Topics: 5, Alpha: asymmetric, Eta: None --> Coherence Score: 0.3251
Topics: 10, Alpha: symmetric, Eta: symmetric --> Coherence Score: 0.3301
Topics: 10, Alpha: symmetric, Eta: None --> Coherence Score: 0.3182
Topics: 10, Alpha: asymmetric, Eta: symmetric --> Coherence Score: 0.3072
Topics: 10, Alpha: asymmetric, Eta: None --> Coherence Score: 0.3036
Topics: 15, Alpha: symmetric, Eta: symmetric --> Coherence Score: 0.3093
Topics: 15, Alpha: symmetric, Eta: None --> Coherence Score: 0.3081
Topics: 15, Alpha: asymmetric, Eta: symmetric --> Coherence Score: 0.3094
Topics: 15, Alpha: asymmetric, Eta: None --> Coherence Score: 0.3089

Best Model Parameters:
Topics: 5, Alpha: [0.2 0.2 0.2 0.2 0.2], Eta: [0.2 0.2 0.2 ... 0.2 0.2 0.2]
Best Coherence Score: 0.3383

LDA Topics:
Topic 1: 0