In [None]:
from collections import defaultdict
from pprint import pprint
import json
import requests
import random
import openai
import time

In [None]:
def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as 4-space-indented JSON and write it to ``filepath``.

    The document is serialized completely before the file is opened. Opening
    with mode ``'w'`` truncates the target immediately, so serializing first
    means a value that json cannot encode raises while the previous contents
    of ``filepath`` are still intact.

    Args:
        data: Object to serialize; must be encodable by ``json.dumps``.
        filepath: Destination path. Overwritten if it already exists.

    Raises:
        TypeError: If ``data`` contains a value json cannot serialize.
    """
    serialized = json.dumps(data, indent=4)
    with open(filepath, 'w') as fp:
        fp.write(serialized)

# From Sam: 
# I believe you have a different query approach. Feel free to switch to yours.
def request_chatgpt_gpt4(messages, url="http://127.0.0.1:5000/event_hgraph", timeout=None):
    """POST chat ``messages`` to the GPT endpoint and return the reply text.

    Args:
        messages: List of chat message dicts; sent as the ``"messages"`` field
            of the JSON request body.
        url: Endpoint to POST to. Defaults to the local server address this
            notebook has always used.
        timeout: Seconds forwarded to ``requests.post``. ``None`` (the
            default) waits indefinitely, which matches the previous behavior.

    Returns:
        ``choices[0].message.content`` of the JSON response, with surrounding
        whitespace stripped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(url, json={"messages": messages}, timeout=timeout)
    # Surface HTTP failures directly; otherwise an error payload shows up
    # further down as an opaque KeyError or JSON decoding error.
    response.raise_for_status()
    payload = response.json()
    return payload['choices'][0]['message']['content'].strip()


### Below are the prompts used to generate topics

In [None]:
def clusterLabelToHyperedge(cluster_labels, partition, hyperedge_dict):
    """Collect the hyperedges that belong to the given clusters.

    Args:
        cluster_labels: Cluster keys made of at least three "-"-separated
            parts; the third part is the cluster id looked up in ``partition``.
        partition: Mapping of node id -> cluster label. Labels are compared
            by their ``str()`` form.
        hyperedge_dict: Mapping of node id -> hyperedge record.

    Returns:
        A list of hyperedge records, grouped in the order of
        ``cluster_labels`` and, within a cluster, in ``partition`` order.
        A cluster id with no members contributes nothing.
    """
    # Invert the partition: cluster id (as string) -> ids of its members.
    members_by_cluster = {}
    for member_id, label in partition.items():
        members_by_cluster.setdefault(str(label), []).append(member_id)

    return [
        hyperedge_dict[member_id]
        for full_label in cluster_labels
        for member_id in members_by_cluster.get(full_label.split("-")[2], [])
    ]

def query_leaf_topic(hyperedges, example_path=r'data/result/AllTheNews/cluster_summary/example.json'):
    """Ask GPT to condense a set of article summaries into one noun phrase.

    Args:
        hyperedges: Sequence of dicts, each with a string under ``'summary'``.
        example_path: JSON file holding the few-shot example; it must provide
            ``['leaf']['summaries']`` and ``['leaf']['topic']``.

    Returns:
        The topic string returned by ``request_chatgpt_gpt4``.
    """
    # Use a context manager so the example file is closed deterministically.
    with open(example_path) as fp:
        example = json.load(fp)

    # One numbered "Article N" section per summary, numbered from 1.
    summaries_message = "".join(
        "Article {}: \n".format(index + 1) + hyperedge['summary'] + '\n\n\n'
        for index, hyperedge in enumerate(hyperedges)
    )
    messages = [
        {
            "role": "system",
            "content": """
                You are a news article summarization system. 
                The user will provide you with a set of summarized news articles, your job is to further summarize them into one noun phrase.
                Use words that are already in the articles, and try to use as few words as possible.
            """
        },
        # Few-shot demonstration: an example input followed by its expected answer.
        {"role": "system", "name": "example_user", "content": example['leaf']['summaries']},
        {"role": "system", "name": "example_system", "content": example['leaf']['topic']},
        {"role": "user", "content": summaries_message}
    ]
    return request_chatgpt_gpt4(messages)

def query_cluster_topic(cluster_subtopics, cluster_samples, example_path=r'data/result/AllTheNews/cluster_summary/example.json'):
    """Ask GPT for one noun phrase that covers a group of sub-topics.

    The user message lists the sub-topics followed by the sample articles.
    Previously the article section was assembled but never appended, so the
    model only ever saw the sub-topic names even though the system prompt
    promises examples.

    Args:
        cluster_subtopics: Sequence of sub-topic strings.
        cluster_samples: Sequence of dicts, each with a string under
            ``'summary'``; sent as numbered example articles.
        example_path: JSON file holding the few-shot example; it must provide
            ``['non-leaf']['summaries']`` and ``['non-leaf']['topic']``.

    Returns:
        The topic string returned by ``request_chatgpt_gpt4``.
    """
    # Use a context manager so the example file is closed deterministically.
    with open(example_path) as fp:
        example = json.load(fp)

    query = "Sub-topics: " + ", ".join(cluster_subtopics) + '\n\n\n'
    # Append the sample articles, one numbered "Article N" section each.
    query += "".join(
        "Article {}: \n".format(index + 1) + cluster_sample['summary'] + '\n\n\n'
        for index, cluster_sample in enumerate(cluster_samples)
    )

    messages = [
        {
            "role": "system",
            "content": """
                You are a news article categorization system. 
                The user will provide you with a list of sub-topics of news articles and a few examples from the sub-topics.
                Your job is to further categorize the sub-topics into a single noun-phrase that best summarizes all the sub-topics.
                Try to reuse the words in the examples.
            """
        },
        # Few-shot demonstration: an example input followed by its expected answer.
        {"role": "system", "name": "example_user", "content": example['non-leaf']['summaries']},
        {"role": "system", "name": "example_system", "content": example['non-leaf']['topic']},
        {"role": "user", "content": query}
    ]
    return request_chatgpt_gpt4(messages)

def add_hierarchical_topic(hierarchy, partitions, hyperedge_dict, topic_dict):
    """Label the whole hierarchy and hand back the topic mapping.

    Runs ``dfs`` from ``hierarchy`` with the given arguments, then returns the
    very same ``topic_dict`` object that was passed in.
    """
    dfs(
        hierarchy=hierarchy,
        partitions=partitions,
        hyperedge_dict=hyperedge_dict,
        topic_dict=topic_dict,
    )
    return topic_dict

def dfs(hierarchy, partitions, hyperedge_dict, topic_dict,
        output_path='data/result/AllTheNews/hierarchical_topics.json'):
    """Label ``hierarchy`` and all of its descendants, children first.

    The level of a node is the second "-"-separated part of its ``'key'``.
    Level-1 nodes are labelled from the hyperedges of their children; higher
    nodes are labelled from their children's topics plus up to 20 randomly
    sampled hyperedges. After each new label, ``topic_dict`` is written to
    ``output_path`` and the key and topic are printed.

    Args:
        hierarchy: Node dict with ``'key'`` and ``'children'``.
        partitions: Indexable by level; ``partitions[k]`` is passed to
            ``clusterLabelToHyperedge`` as the partition for level ``k``.
        hyperedge_dict: Mapping of node id -> hyperedge record.
        topic_dict: Mapping of node key -> topic; updated in place.
        output_path: File that receives ``topic_dict`` after every new label.

    Returns:
        None. Results are delivered through ``topic_dict``.
    """
    key = hierarchy['key']
    level = int(key.split('-')[1])

    if level == 1:
        # Already labelled (e.g. by an earlier, interrupted run): skip before
        # doing the hyperedge lookup or spending a GPT call.
        if key in topic_dict:
            return
        children_labels = [child['key'] for child in hierarchy['children']]
        hyperedges = clusterLabelToHyperedge(children_labels, partitions[0], hyperedge_dict)
        gpt_topic = query_leaf_topic(hyperedges)
    else:
        all_hyperedges = []
        for child in hierarchy['children']:
            dfs(child, partitions, hyperedge_dict, topic_dict, output_path)
            child_level = int(child['key'].split('-')[1])
            # A child cluster may resolve to no hyperedges; it simply adds
            # nothing to the sampling pool.
            all_hyperedges += clusterLabelToHyperedge(
                [child['key']], partitions[child_level], hyperedge_dict
            )
        cluster_subtopics = [topic_dict[child['key']] for child in hierarchy['children']]
        sample_hyperedges = random.sample(all_hyperedges, min(20, len(all_hyperedges)))
        # Unlike level-1 nodes, higher levels are regenerated on every run
        # (there is no cache check on this branch).
        gpt_topic = query_cluster_topic(cluster_subtopics, sample_hyperedges)

    topic_dict[key] = gpt_topic
    # Checkpoint after every label so an interrupted run keeps its progress.
    save_json(topic_dict, output_path)
    print(key, gpt_topic)

### Below is where the main workflow begins

In [None]:
# 1. Read in hierarchy and partition
# Context managers close each file as soon as it has been parsed.
with open('data/result/AllTheNews/network/server/ravasz_hierarchies_entity.json') as fp:
    hierarchy = json.load(fp)
with open('data/result/AllTheNews/network/server/ravasz_partitions_entity.json') as fp:
    partitions = json.load(fp)

In [None]:
# 2. Read in hyperedges
with open('data/result/AllTheNews/network/articles.json') as fp:
    hyperedges_dict = json.load(fp)

# 3. Generate topics. hierarchical_topics.json holds the topics found so far;
#    on a first run it may be missing or empty, which means "no topics yet".
topics_path = 'data/result/AllTheNews/hierarchical_topics.json'
try:
    with open(topics_path) as fp:
        saved_topics = fp.read()
except FileNotFoundError:
    saved_topics = ''
# json.loads('') raises, so an empty file is mapped to an empty dict explicitly.
topic_dict = json.loads(saved_topics) if saved_topics.strip() else {}
topic_dict = add_hierarchical_topic(hierarchy, partitions, hyperedges_dict, topic_dict)

### Below are testing/debugging cells.

In [None]:
# Debug: group node ids by the cluster they map to in partitions[1], then
# request a topic for the hyperedges of cluster 2344.
clusters = defaultdict(list)
for node_id in partitions[1]:
    cluster_id = partitions[1][node_id]
    clusters[cluster_id].append(node_id)

hyperedges_2344 = []
for hyperedge_id in clusters[2344]:
    hyperedges_2344.append(hyperedges_dict[hyperedge_id])

# Last expression of the cell: the notebook displays the returned topic.
query_leaf_topic(hyperedges_2344)

In [None]:
# Debug: pick one cluster by following a fixed path of child indices from the
# root, then query a topic for each of its children.
target_cluster = hierarchy
for child_index in (0, 0, 0, 0, 1):
    target_cluster = target_cluster['children'][child_index]

cluster_children = list(map(lambda node: node['key'], target_cluster['children']))
sub_topic_samples = []
# NOTE: this rebinds the notebook-level topic_dict to a fresh, empty dict.
topic_dict = {}
for child in target_cluster['children']:
    child_key = child['key']
    level = int(child_key.split('-')[1])
    hyperedges = clusterLabelToHyperedge([child_key], partitions[level], hyperedges_dict)
    print(len(hyperedges), len(child['children']))
    sub_topic = query_leaf_topic(hyperedges)
    topic_dict[child_key] = sub_topic
    # Keep the first hyperedge of each child as a representative sample.
    sample = hyperedges[0]
    sub_topic_samples.append(sample)
pprint(topic_dict)

In [None]:
# Debug: query a parent topic for a hand-picked set of level-1 clusters.
children = [752, 1069, 1070, 1478]
# cluster_subtopics = [
#     'Increasing Gun Violence in Chicago',
#     'Crime Rates and Policing Tactics',
#     'Misconceptions about Crime in the United States',
#     'Global Events and Optimism',
# ]
cluster_subtopics = [topic_dict["L-1-{}".format(cluster_label)] for cluster_label in children]

# Group node ids by the cluster they map to in partitions[1].
clusters = defaultdict(list)
for node_id, cluster_label in partitions[1].items():
    clusters[cluster_label].append(node_id)

# Pool the node ids of all chosen clusters.
candidate_ids = []
for cluster_label in children:
    candidate_ids += clusters[cluster_label]

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the pool size (dfs caps its sample the same way).
sampled_ids = random.sample(candidate_ids, min(10, len(candidate_ids)))
cluster_samples = [hyperedges_dict[sample] for sample in sampled_ids]

topic = query_cluster_topic(cluster_subtopics, cluster_samples)
sample_summaries = [sample['summary'] for sample in cluster_samples]
print(topic)
pprint(cluster_subtopics)
for summary in sample_summaries:
    print(summary)


In [None]:
# Generate the few-shot example for the non-leaf prompt.
# Uses cluster_subtopics and cluster_samples as left in the namespace by
# earlier cells, and overwrites the 'non-leaf' entry of example.json.
example_path = r'data/result/AllTheNews/cluster_summary/example.json'

sample_summaries = "Sub-topics: "
sample_summaries += ", ".join(cluster_subtopics) + '\n\n\n'
for index, cluster_sample in enumerate(cluster_samples):
    sample_summaries += "Article {}: \n".format(index+1)
    sample_summaries += cluster_sample['summary'] + '\n\n\n'

# Close the file after reading; the same path is rewritten just below.
with open(example_path) as fp:
    example = json.load(fp)
example['non-leaf']['summaries'] = sample_summaries
example['non-leaf']['topic'] = 'Crimes in the United States'
save_json(example, example_path)