In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [1]:
import csv
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import numpy as np
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/sp3856/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sp3856/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess_comments(comments):
    stop_words = set(stopwords.words('english'))
    cleaned_comments = []
    for comment in comments:
        comment = BeautifulSoup(comment, "html.parser").get_text()
        comment = re.sub(r"[^a-zA-Z\s]", "", comment)
        tokens = word_tokenize(comment.lower())
        tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
        cleaned_comments.append(tokens)
    #print(f"Cleaned Comments: {cleaned_comments}")
    return cleaned_comments

def get_comment_topics(lda_model, corpus, n_topics=2):
    comment_topics = []
    for i, row in enumerate(corpus):
        topics = sorted(lda_model.get_document_topics(row), key=lambda x: -x[1])[:n_topics]
        comment_topics.append(topics)
        #print(f"Comment {i + 1}: {topics}")
    return comment_topics

def get_top_topics_overall(comment_topics, num_topics=2):
    if not comment_topics or len(comment_topics[0]) < num_topics:
        return list(range(len(comment_topics[0])))
    
    num_total_topics = len(comment_topics[0])  
    topic_scores = np.zeros(num_total_topics)

    for topics in comment_topics:
        for topic, prob in topics:
            topic_scores[topic] += prob

    top_topics_indices = np.argsort(topic_scores)[-num_topics:][::-1]
    return top_topics_indices

def clean_topic_keywords(topics, top_topic_indices):
    readable_topics = []
    for idx in top_topic_indices:
        topic_id, topic_desc = topics[idx]
        keywords = [kw.split('*')[1].strip('"') for kw in topic_desc.split(' + ')]
        print(f"Topic {topic_id + 1}: {', '.join(keywords)}")
        readable_topics.append((topic_id, keywords))
    return readable_topics

input_json_file = 'hacker_news_comments_chat_openai.json'
output_csv_file = 'hacker_news_with_chatopenai_topics.csv'

with open(input_json_file, mode='r', encoding='utf8') as infile:
    data = json.load(infile)

with open(output_csv_file, mode='w', newline='', encoding='utf8') as csvfile:
    fieldnames = ["Hacker News URL", "Top Topics"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for entry in data:
        comments = entry.get("Comments", [])
        if comments:
            print(f"Processing URL: {entry['Hacker News URL']} with {len(comments)} comments.")
            
            cleaned_comments = preprocess_comments(comments)
            dictionary = Dictionary(cleaned_comments)
            #print(f"Number of unique tokens in dictionary: {len(dictionary)}")
            #print(f"Sample dictionary tokens: {list(dictionary.items())}")
            
            corpus = [dictionary.doc2bow(comment) for comment in cleaned_comments]
            #print(f"Corpus sample: {corpus}")

            if len(dictionary) > 0:
                lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
                topics = lda_model.print_topics(num_words=5)
                for idx, topic in topics:
                    print(f"Topic {idx}: {topic}")

                comment_topics = get_comment_topics(lda_model, corpus)
                if comment_topics and len(comment_topics[0]) > 0:
                    top_topics_indices = get_top_topics_overall(comment_topics, num_topics=2)
                    readable_topics = clean_topic_keywords(topics, top_topics_indices)

                    topics_str = "; ".join([f"Topic {topic_id + 1}: {', '.join(keywords)}" for topic_id, keywords in readable_topics])
                    entry["Top Topics"] = topics_str
                    print(f"Top Topics for {entry['Hacker News URL']}: {topics_str}")
                else:
                    entry["Top Topics"] = "Insufficient data for topics"
            else:
                entry["Top Topics"] = "No topics found"
        else:
            print(f"No comments available for URL: {entry['Hacker News URL']}")
            entry["Top Topics"] = "No comments available"

        writer.writerow({
            "Hacker News URL": entry["Hacker News URL"],
            "Top Topics": entry.get("Top Topics", "No topics available")
        })

print(f"Output written to {output_csv_file}")


Processing URL: https://news.ycombinator.com/item?id=37195889 with 1 comments.
Topic 0: 0.020*"chatgpt" + 0.020*"example" + 0.016*"use" + 0.016*"results" + 0.016*"google"
Topic 1: 0.008*"example" + 0.008*"chatgpt" + 0.008*"like" + 0.008*"google" + 0.008*"results"
Topic 1: chatgpt, example, use, results, google
Top Topics for https://news.ycombinator.com/item?id=37195889: Topic 1: chatgpt, example, use, results, google
No comments available for URL: https://news.ycombinator.com/item?id=37183231
No comments available for URL: https://news.ycombinator.com/item?id=37178810
Processing URL: https://news.ycombinator.com/item?id=37129724 with 5 comments.
Topic 0: 0.052*"plugin" + 0.052*"code" + 0.052*"ask" + 0.031*"repo" + 0.031*"one"
Topic 1: 0.058*"github" + 0.045*"code" + 0.045*"questions" + 0.032*"repo" + 0.032*"plugin"
Topic 2: github, code, questions, repo, plugin
Topic 1: plugin, code, ask, repo, one
Top Topics for https://news.ycombinator.com/item?id=37129724: Topic 2: github, code, qu

  comment = BeautifulSoup(comment, "html.parser").get_text()


Topic 0: 0.008*"would" + 0.007*"energy" + 0.006*"like" + 0.004*"people" + 0.004*"dont"
Topic 1: 0.007*"people" + 0.007*"think" + 0.005*"would" + 0.005*"even" + 0.004*"black"
Topic 2: people, think, would, even, black
Topic 1: would, energy, like, people, dont
Top Topics for https://news.ycombinator.com/item?id=36994214: Topic 2: people, think, would, even, black; Topic 1: would, energy, like, people, dont
Processing URL: https://news.ycombinator.com/item?id=36992985 with 7 comments.
Topic 0: 0.049*"token" + 0.049*"chat" + 0.049*"anything" + 0.029*"generates" + 0.029*"interesting"
Topic 1: 0.054*"try" + 0.054*"mistake" + 0.054*"read" + 0.054*"wrong" + 0.033*"interesting"
Topic 1: token, chat, anything, generates, interesting
Topic 2: try, mistake, read, wrong, interesting
Top Topics for https://news.ycombinator.com/item?id=36992985: Topic 1: token, chat, anything, generates, interesting; Topic 2: try, mistake, read, wrong, interesting
Processing URL: https://news.ycombinator.com/item?id