In [115]:
from collections import defaultdict
from pprint import pprint
import json
import requests
import random
import openai
import time

In [10]:
def save_json(data, filepath=r'new_data.json'):
    """Write `data` to `filepath` as JSON indented by 4 spaces.

    The payload is serialized to a string *before* the file is opened, so a
    serialization error (for example a non-JSON-serializable value) raises
    without truncating whatever `filepath` already contained.

    Args:
        data: Any object accepted by `json.dumps`.
        filepath: Destination path; an existing file is overwritten on success.

    Raises:
        TypeError, ValueError: If `data` cannot be serialized to JSON.
        OSError: If `filepath` cannot be opened for writing.
    """
    serialized = json.dumps(data, indent=4)
    with open(filepath, 'w') as fp:
        fp.write(serialized)

def request_chatgpt_gpt4(messages, timeout=300):
    """POST chat messages to the local endpoint and return the reply text.

    Args:
        messages: List of chat message dicts; sent as the `messages` field of
            the JSON request body.
        timeout: Seconds passed to `requests.post` as its timeout. Without a
            timeout the call can block indefinitely if the server never
            answers. Pass `None` to wait forever.

    Returns:
        The whitespace-stripped `content` of the first choice in the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = "http://127.0.0.1:5000/event_hgraph"
    body = {"messages": messages}
    response = requests.post(url, json=body, timeout=timeout)
    # Surface HTTP failures directly instead of as a KeyError / JSON decode
    # error when the error payload is indexed below.
    response.raise_for_status()
    payload = response.json()
    gpt_response = payload['choices'][0]['message']['content'].strip()
    return gpt_response


In [156]:
def clusterLabelToHyperedge(cluster_labels, partition, hyperedge_dict):
    """Collect the hyperedges that belong to the given cluster labels.

    Args:
        cluster_labels: Iterable of dash-separated labels; the third field
            (index 2 after splitting on "-") is used as the cluster id.
        partition: Mapping of member id -> cluster id. Cluster ids are
            compared by their `str()` form.
        hyperedge_dict: Mapping of member id -> hyperedge object.

    Returns:
        A list of `hyperedge_dict` values, ordered first by the order of
        `cluster_labels` and then by the iteration order of `partition`.
    """
    # Invert the partition: stringified cluster id -> member ids.
    members_by_cluster = defaultdict(list)
    for member_id, assigned_cluster in partition.items():
        members_by_cluster[str(assigned_cluster)].append(member_id)

    return [
        hyperedge_dict[member_id]
        for full_label in cluster_labels
        for member_id in members_by_cluster[full_label.split("-")[2]]
    ]

def query_leaf_topic(hyperedges):
    """Ask the chat model for a single noun-phrase topic covering `hyperedges`.

    The prompt consists of a system instruction, one few-shot example read
    from `example.json` (its `leaf` entry), and a user message listing every
    hyperedge summary as "Article N".

    Args:
        hyperedges: Sequence of dicts, each with a `'summary'` string.

    Returns:
        The topic string returned by `request_chatgpt_gpt4`.

    Raises:
        KeyError: If a hyperedge has no `'summary'` key or the example file
            lacks the expected `leaf` entries.
    """
    # Use a context manager so the example file handle is closed promptly.
    with open(r'data/result/AllTheNews/cluster_summary/example.json') as example_file:
        example = json.load(example_file)
    summaries_message = "".join(
        "Article {}: \n".format(index + 1) + hyperedge['summary'] + '\n\n\n'
        for index, hyperedge in enumerate(hyperedges)
    )
    messages = [
        {
            "role": "system",
            "content": """
                You are a news article summarization system. 
                The user will provide you with a set of summarized news articles, your job is to further summarize them into one noun phrase.
                Use words that are already in the articles, and try to use as few words as possible.
            """
        },
        # Few-shot example expressed as named system messages.
        { "role": "system", "name": "example_user", "content": example['leaf']['summaries']},
        { "role": "system", "name": "example_system", "content": example['leaf']['topic']},
        { "role": "user", "content": summaries_message}
    ]
    topic = request_chatgpt_gpt4(messages)
    return topic

def query_cluster_topic(cluster_subtopics, cluster_samples=()):
    """Ask the chat model for one noun phrase that summarizes a set of sub-topics.

    The user message starts with "Sub-topics: " followed by the comma-joined
    sub-topics, then lists each sample's summary as "Article N". Previously
    the article section was built but never sent, even though the system
    prompt tells the model that examples will be provided.

    Args:
        cluster_subtopics: Iterable of sub-topic strings.
        cluster_samples: Sequence of dicts with a `'summary'` string, used as
            example articles. Defaults to no samples, in which case only the
            sub-topic list is sent.

    Returns:
        The topic string returned by `request_chatgpt_gpt4`.

    Raises:
        KeyError: If a sample has no `'summary'` key or the example file
            lacks the expected `non-leaf` entries.
    """
    # Use a context manager so the example file handle is closed promptly.
    with open(r'data/result/AllTheNews/cluster_summary/example.json') as example_file:
        example = json.load(example_file)
    query = "Sub-topics: " + ", ".join(cluster_subtopics) + '\n\n\n'
    sample_summaries = "".join(
        "Article {}: \n".format(index + 1) + cluster_sample['summary'] + '\n\n\n'
        for index, cluster_sample in enumerate(cluster_samples)
    )
    # Actually include the example articles in the request.
    query += sample_summaries

    messages = [
        {
            "role": "system",
            "content": """
                You are a news article categorization system. 
                The user will provide you with a list of sub-topics of news articles and a few examples from the sub-topics.
                Your job is to further categorize the sub-topics into a single noun-phrase that best summarizes all the sub-topics.
                Try to reuse the words in the examples.
            """
        },
        # Few-shot example expressed as named system messages.
        { "role": "system", "name": "example_user", "content": example['non-leaf']['summaries']},
        { "role": "system", "name": "example_system", "content": example['non-leaf']['topic']},
        { "role": "user", "content": query}
    ]
    topic = request_chatgpt_gpt4(messages)
    return topic

def add_hierarchical_topic(hierarchy, partitions, hyperedge_dict, topic_dict):
    """Populate `topic_dict` for the tree rooted at `hierarchy` and return it.

    All work is delegated to `dfs`, which writes into `topic_dict` in place;
    the very same dict object is handed back to the caller.
    """
    dfs(
        hierarchy=hierarchy,
        partitions=partitions,
        hyperedge_dict=hyperedge_dict,
        topic_dict=topic_dict,
    )
    return topic_dict

def dfs(hierarchy, partitions, hyperedge_dict, topic_dict,
        save_path='data/result/AllTheNews/hierarchical_topics.json', max_samples=20):
    """Depth-first traversal that assigns a topic to every node of `hierarchy`.

    The node level is parsed from the second dash-separated field of
    `hierarchy['key']`. Level-1 nodes are summarized from the hyperedges of
    their children via `query_leaf_topic`; higher-level nodes are summarized
    from their children's topics plus a random sample of hyperedges via
    `query_cluster_topic`. After each node, `topic_dict` is written to
    `save_path` and the key/topic pair is printed.

    Args:
        hierarchy: Node dict with `'key'` and `'children'` entries.
        partitions: Indexable by level; each entry maps member id -> cluster id.
        hyperedge_dict: Mapping of member id -> hyperedge dict.
        topic_dict: Mapping of node key -> topic; updated in place.
        save_path: File that receives `topic_dict` after every new topic.
        max_samples: Upper bound on the hyperedges sampled for a non-leaf query.

    Returns:
        None. Results are stored in `topic_dict`.
    """
    node_key = hierarchy['key']
    level = int(node_key.split('-')[1])
    if level == 1:
        # Already summarized (e.g. loaded from a previous run): skip before
        # doing any hyperedge lookup.
        if node_key in topic_dict:
            return
        children_labels = [child['key'] for child in hierarchy['children']]
        hyperedges = clusterLabelToHyperedge(children_labels, partitions[0], hyperedge_dict)
        gpt_topic = query_leaf_topic(hyperedges)
    else:
        all_hyperedges = []
        for child in hierarchy['children']:
            dfs(child, partitions, hyperedge_dict, topic_dict, save_path, max_samples)
            # Separate name so the current node's `level` is not shadowed.
            child_level = int(child['key'].split('-')[1])
            all_hyperedges += clusterLabelToHyperedge(
                [child['key']], partitions[child_level], hyperedge_dict
            )
        cluster_subtopics = [topic_dict[child['key']] for child in hierarchy['children']]
        # Cap the sample at the population size so random.sample never raises.
        sample_hyperedges = random.sample(all_hyperedges, min(max_samples, len(all_hyperedges)))
        # Non-leaf topics are always re-queried, even when already present in topic_dict.
        gpt_topic = query_cluster_topic(cluster_subtopics, sample_hyperedges)
    topic_dict[node_key] = gpt_topic
    # Checkpoint after every node so an interrupted run can be resumed.
    save_json(topic_dict, save_path)
    print(node_key, gpt_topic)
    return

In [44]:
# hierarchical topics: load the cluster hierarchy, the per-level partitions and the hyperedges.
# Context managers make sure each input file is closed right after it is parsed.
with open('data/result/AllTheNews/network/server/ravasz_hierarchies.json') as fp:
    hierarchy = json.load(fp)
with open('data/result/AllTheNews/network/server/ravasz_partitions.json') as fp:
    partitions = json.load(fp)
with open('data/result/AllTheNews/network/hyperedges.json') as fp:
    hyperedges_dict = json.load(fp)

In [158]:
# Resume from the topics saved so far, then fill in the rest of the hierarchy.
# The checkpoint file is closed as soon as it has been parsed.
with open('data/result/AllTheNews/hierarchical_topics.json') as fp:
    topic_dict = json.load(fp)
topic_dict = add_hierarchical_topic(hierarchy, partitions, hyperedges_dict, topic_dict)

L-2-199 Crime in the United States
L-2-86 The Cannabis Industry
L-2-410 Recognition and Controversies in the Film Industry
L-3-3 Criminal Justice in the United States
L-2-7 Protests against the Trump Election
L-3-2 Political Protests against the Trump Election
L-2-230 Gun Control Debates and Controversies
L-2-305 Controversies in Higher Education and Campus Free Speech
L-2-179 Tragedies and Disaster Recovery in the United States
L-2-10 Controversial Events in the Trump Era
L-2-291 Water Contamination Crisis and Environmental Mismanagement in the United States
L-2-108 Misconceptions about Crime in New York City
L-2-160 Crime in the United States
L-2-56 Media Industry Consolidation in the United States
L-2-58 Political Campaigns and Controversies in the United States
L-2-220 Public Toilet Solutions
L-2-209 Ethics Concerns in Virginia's Government and Felon Voting Rights
L-2-41 Donald Trump's Golf Course Controversies
L-2-194 Efforts to combat Climate Change
L-2-70 Political Divisions in 

In [119]:
# Pretty-print the current contents of topic_dict.
pprint(topic_dict)

{}


In [11]:
# Group node ids by their cluster id in partitions[1], then test the
# leaf-topic query on the members of cluster 2344.
clusters = defaultdict(list)
for node_id, cluster_id in partitions[1].items():
    clusters[cluster_id].append(node_id)
hyperedges_2344 = [hyperedges_dict[hyperedge_id] for hyperedge_id in clusters[2344]]
query_leaf_topic(hyperedges_2344)

'Police Shootings and Racial Tensions'

In [13]:
# Leaf-topic query for the members of cluster 2.
hyperedges_2 = [hyperedges_dict[edge_id] for edge_id in clusters[2]]
query_leaf_topic(hyperedges_2)

'Increasing Violence in Chicago and Major Cities'

In [36]:
# Leaf-topic query for the members of cluster 2353.
hyperedges_2353 = [hyperedges_dict[edge_id] for edge_id in clusters[2353]]
query_leaf_topic(hyperedges_2353)

'Controversial Legal Cases and Their Consequences'

In [None]:
# Same query as the earlier cluster-2 cell (this cell has not been executed).
hyperedges_2 = [hyperedges_dict[edge_id] for edge_id in clusters[2]]
query_leaf_topic(hyperedges_2)

In [37]:
# Sub-topics obtained from the three leaf-topic test queries.
cluster = [
    'Police Shootings and Racial Tensions',
    'Increasing Violence in Chicago and Major Cities',
    'Controversial Legal Cases and Their Consequences'
]
# query_cluster_topic takes (cluster_subtopics, cluster_samples); pass an empty
# sample list so this sub-topics-only test does not raise TypeError.
query_cluster_topic(cluster, [])

'Topic: Criminal Justice System and Law Enforcement.'

In [145]:
# Pick one node by a fixed path of child indices and list the keys of its children.
target_cluster = hierarchy['children'][0]['children'][0]['children'][0]['children'][0]['children'][2]
cluster_children = list(map(lambda node: node['key'], target_cluster['children']))
print(target_cluster['key'], cluster_children)

L-2-410 ['L-1-752', 'L-1-1069', 'L-1-1070', 'L-1-1478']


In [98]:
# use this to trim tree for testing
# Picks one fixed subtree of `hierarchy`, queries a topic for each of its
# children, and keeps the first hyperedge of every child as a sample.
# NOTE: this resets `topic_dict` to an empty dict, discarding any topics held before.
target_cluster = hierarchy['children'][0]['children'][0]['children'][0]['children'][0]['children'][1]
cluster_children = [child['key'] for child in target_cluster['children']]
sub_topic_samples = []
topic_dict = {}
for child in target_cluster['children']:
    # The level is the second dash-separated field of the child's key.
    level = int(child['key'].split('-')[1])
    hyperedges = clusterLabelToHyperedge([child['key']], partitions[level], hyperedges_dict)
    # Sanity check: hyperedge count of the child next to its number of children.
    print(len(hyperedges), len(child['children']))
    sub_topic = query_leaf_topic(hyperedges)
    topic_dict[child['key']] = sub_topic
    sample = hyperedges[0]
    sub_topic_samples.append(sample)
pprint(topic_dict)

8 8
1 1
1 1
3 3
{'L-1-2': 'Rising Violence in Chicago',
 'L-1-278': 'Debate over Crime Statistics and Policing Tactics',
 'L-1-287': 'American Perception of Crime Rates',
 'L-1-470': 'Events of 2016 and perspective'}


In [99]:
# Summarize the collected per-child samples with the leaf-topic query and store
# the result under the key of target_cluster.
# NOTE(review): relies on `sub_topic_samples` and `target_cluster` left in the
# kernel by the tree-trimming cell — confirm the execution order.
gpt_topic = query_leaf_topic(sub_topic_samples)
topic_dict[target_cluster['key']] = gpt_topic

In [100]:
# Pretty-print topic_dict to inspect the topics stored so far.
pprint(topic_dict)

{'L-1-2': 'Rising Violence in Chicago',
 'L-1-278': 'Debate over Crime Statistics and Policing Tactics',
 'L-1-287': 'American Perception of Crime Rates',
 'L-1-470': 'Events of 2016 and perspective',
 'L-2-199': 'Violence and Crime Trends'}


In [148]:
# Cluster ids whose topics and members are used for this non-leaf test query.
children = [752, 1069, 1070, 1478]
# cluster_subtopics = [
#     'Increasing Gun Violence in Chicago',
#     'Crime Rates and Policing Tactics',
#     'Misconceptions about Crime in the United States',
#     'Global Events and Optimism',
# ]
cluster_subtopics = [topic_dict["L-1-{}".format(cluster_label)] for cluster_label in children]
# Group node ids by their cluster id in partitions[1].
clusters = defaultdict(list)
for node_id, cluster_label in partitions[1].items():
    clusters[cluster_label].append(node_id)
# Pool the member ids of all selected clusters.
cluster_samples = []
for cluster_label in children:
    cluster_samples += clusters[cluster_label]
# Cap the sample at the pool size: random.sample raises ValueError when asked
# for more items than are available.
cluster_samples = random.sample(cluster_samples, min(10, len(cluster_samples)))
cluster_samples = [hyperedges_dict[sample] for sample in cluster_samples]
topic = query_cluster_topic(cluster_subtopics, cluster_samples)
sample_summaries = [sample['summary'] for sample in cluster_samples]
print(topic)
pprint(cluster_subtopics)
for summary in sample_summaries:
    print(summary)


Oscar Controversies and Recognition
['Oscars Diversity Controversy and Backlash',
 'Diversity and Recognition in Hollywood',
 'Film Industry Recognition and Controversies',
 "Leonardo DiCaprio's Oscar Journey"]
The article discussed the backlash against the Academy for not nominating any person of color in the main acting categories at the Oscars. Rev. Al Sharpton announced that he will hold a demonstration near the Dolby Theater in Los Angeles, and the National Action Network is organizing rallies in multiple cities. Twitter users expressed their disappointment using the hashtag #OscarsSoWhite, specifically mentioning the snub of actors Will Smith for Concussion and Idris Elba for Beasts of No Nation.
The article discussed how Pope Francis honored actors George Clooney and Richard Gere, as well as actress Salma Hayek, by awarding them the "Olive Medal" for peace at a Vatican event. The event was organized to promote the work of a foundation called Scholas Occurrentes, which aims to pr

In [None]:

L-2-410 ['L-1-752', 'L-1-1069', 'L-1-1070', 'L-1-1478']

In [None]:
# Build the non-leaf few-shot example (sub-topic list followed by the sampled
# articles) and write it back into example.json with a hand-picked topic.
sample_summaries = "Sub-topics: "
sample_summaries += ", ".join(cluster_subtopics) + '\n\n\n'
for index, cluster_sample in enumerate(cluster_samples):
    sample_summaries += "Article {}: \n".format(index+1)
    sample_summaries += cluster_sample['summary'] + '\n\n\n'
# Close the example file before save_json reopens the same path for writing.
with open(r'data/result/AllTheNews/cluster_summary/example.json') as fp:
    example = json.load(fp)
example['non-leaf']['summaries'] = sample_summaries
example['non-leaf']['topic'] = 'Crimes in the United States'
save_json(example, r'data/result/AllTheNews/cluster_summary/example.json')

In [134]:
# Inspect a few saved topics; the file is closed as soon as it has been parsed.
# NOTE(review): this reads network/server/hierarchical_topics.json, a different
# path from the one dfs saves to — confirm this is the intended file.
with open('data/result/AllTheNews/network/server/hierarchical_topics.json') as fp:
    topic_dict = json.load(fp)
print(topic_dict['L-1-2'])
print(topic_dict['L-1-278'])
print(topic_dict['L-1-287'])
print(topic_dict['L-1-470'])
topic_dict['L-2-199']

Increasing Gun Violence in Chicago
Crime Rates and Policing Tactics
Misconceptions about Crime in the United States
Global Events and Optimism


'Violent crime rates and political narratives'