In [2]:
import json
import requests
import csv
import jsonlines
from flask import Flask, redirect, render_template, request, url_for
import sys
import numpy as np
from collections import defaultdict
import random
import math
csv.field_size_limit(sys.maxsize)

131072

In [3]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [4]:
def request_chatgpt(prompt):
    # input_filepath = r'./data/tweets/{}/'.format(case_name)
    original_url = "http://127.0.0.1:5000/event_extraction"
    body = {"prompt": prompt}
    response = requests.post(original_url, json=body).json()
    gpt_response = response['choices'][0]['text'].strip()
    return gpt_response
    
def request_chatgpt_gpt4(messages, functions=None):
    # input_filepath = r'./data/tweets/{}/'.format(case_name)
    original_url = "http://127.0.0.1:5000/event_extraction"
    if functions is None:
        body = {"messages": messages}
        response = requests.post(original_url, json=body).json()
        gpt_response = response['choices'][0]['message']['content'].strip()
        return gpt_response
    else:
        body = {"messages": messages, "functions": functions}
        response = requests.post(original_url, json=body).json()
        response_messages = response['choices'][0]['message']
        if response_messages.get("function_call"):
            function_args = json.loads(response_messages['function_call']['arguments'])
            print("function called!")
            return function_args
        else:
            return response_messages['content'].strip()



In [None]:
# RAMS
dev_reader = jsonlines.open(r'../data/raw/RAMS/dev.jsonlines')
dataset = [datum for datum in dev_reader]

In [None]:
# All The News 1.0
articles1_csv = csv.DictReader(open("../data/raw/AllTheNews/articles1.csv"))
articles2_csv = csv.DictReader(open("../data/raw/AllTheNews/articles2.csv"))
articles3_csv = csv.DictReader(open("../data/raw/AllTheNews/articles3.csv"))
dataset = [datum for datum in articles1_csv] + [datum for datum in articles2_csv] + [datum for datum in articles3_csv]

In [None]:
# clean AllTheNews
dataset_2016 = [datum for datum in dataset if datum['year'] == '2016.0']
articles_grouped = defaultdict(list)
for datum in dataset_2016:
    articles_grouped[datum['publication']].append(datum)
random_selected_dataset = []
for publication, articles in articles_grouped.items():
    total_articles = len(articles)
    random_samples = random.sample(articles, math.floor(total_articles/10))
    print(len(random_samples), total_articles)
    random_selected_dataset += random_samples
save_json(random_selected_dataset, r'../data/raw/AllTheNews/cleaned/2016_10p.json')

In [None]:
def merge_sentences(datum_sentences):
    sentence_list = [" ".join(sentence_word_list) for sentence_word_list in datum_sentences] # merge the words into sentences
    paragraph = " ".join(sentence_list)
    return paragraph


In [5]:
def get_arguments(article, model="gpt-3.5-turbo-0613"):
    # if model == "gpt-3.5-turbo-0613":
    #     functions = [
    #         {
    #             "name": "get_characters",
    #             "description": "Get the main characters of the news article, which can be any organization, person or location",
    #             "parameters": {
    #                 "type": "object",
    #                 "properties": {
    #                     "characters": { "type": "array", "items": { "type": "string" }}
    #                 },
    #             },
    #         }
    #     ]
    #     messages = [ 
    #         {
    #             "role": "system", 
    #             "content": """
    #                 You are an extraction system that extracts the main characters of a news article.
    #                 The main characters can be any organization, person or location.
    #                 The user will provide you with a news article to extract.
    #             """
    #         },
    #         {
    #             "role": "user", "content": article
    #         } 
    #     ]
    #     arguments = request_chatgpt_gpt4(messages, functions)
    if model == "gpt-3.5-turbo-0613":
        messages = [ 
            {
                "role": "system", 
                "content": """
                    You are an extraction system that extracts the main characters of a news article.
                    The main characters can be any organization, person or location that heavily involved in the event described by the news article.
                    The user will provide you with a news article to extract.
                    Reply in the format '[character 1] [character 2]...'
                """
            },
            {
                "role": "user", "content": article
            } 
        ]
        arguments = request_chatgpt_gpt4(messages)

    else:
        prompt = """
        Below is a news article of an event.
        Please describe the main characters that the news article discussed, the character can be any organization, person or location.
        It can have one or more characters.
        Reply in the format '[character 1] [character 2]...'
        Article: \n {article}
        """.format(article=article)
        arguments = request_chatgpt(prompt)
    return arguments


In [6]:
def summarize_sentence(article, arguments, model="gpt-3.5-turbo-0613"):
    if model == "gpt-3.5-turbo-0613":
        messages = [ 
            {
                "role": "system", 
                "content": """
                    You are an summarization system that summarizes the events that happened between the main characters of a news article.
                    The user will provide you with a list of main characters and a news article to summarize.
                    Try to summarize the article with no more than three sentences. 
                    Reply starts with 'The article discussed ...'
                """
            },
            {
                "role": "user", "content": "Main Characters:\n{arguments} \n\n\n Article: {article}".format(arguments=arguments, article=article)
            } 
        ]
        sentence = request_chatgpt_gpt4(messages)
    else:
        prompt = """
        Below is a news article of an event.
        The major participants in the articles are: {participants}.
        Please describe what the article discud about them in one sentence.
        Reply starts with 'The article discussed ...'
        Article: \n {article}
        """.format(participants=", ".join(arguments), article=article)
        sentence = request_chatgpt(prompt)
    return sentence


In [None]:
# All The News
dataset = json.load(open(r'../data/raw/AllTheNews/cleaned/2016_10p.json'))
saved_dataset = []
count = 0
for datum in dataset:
    try:
        count += 1
        if count == 6: break
        print("{}/{}".format(count, len(dataset)))
        saved_datum = {}
        article = datum['content']
        arguments = get_arguments(article)
        sentence = summarize_sentence(article, arguments)
        saved_datum['id'] = datum['id']
        saved_datum['content'] = datum['content']
        saved_datum['title'] = datum['title']
        saved_datum['publication'] = datum['publication']
        saved_datum['author'] = datum['author']
        saved_datum['url'] = datum['url']
        saved_datum['date'] = datum['date']
        saved_datum['summary'] = sentence
        saved_dataset.append(saved_datum)
    except Exception as e:
        print(e)
        continue
save_json(saved_dataset, r'../data/raw/AllTheNews/summarized/2016_10p.json')

In [15]:
partial_dataset = json.load(open(r'../data/raw/AllTheNews/summarized/2016_10p.json'))
print(len(partial_dataset))


7649


In [None]:
# RAMS
saved_dataset = []
for datum in dataset:
    saved_datum = {}
    article = merge_sentences(datum['sentences'])
    arguments = get_arguments(article)
    sentence = summarize_sentence(article, arguments)
    print(sentence)
    saved_datum['content'] = datum['sentences']
    saved_datum['url'] = datum['source_url']
    saved_datum['summary'] = sentence
    saved_dataset.append(saved_datum)
save_json(saved_dataset, r'../data/raw/RAMS/summarized/dev.json')

In [7]:
def strip_sentence(sentence):
    if sentence.startswith('The article discussed how'):
        stripped_sentence = sentence.replace('The article discussed how', '').strip()
    elif sentence.startswith('The article discussed'):
        stripped_sentence = sentence.replace('The article discussed', '').strip()
    else:
        print("!!!")
    return stripped_sentence

In [8]:
def extract_events(sentence):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are an event extraction system. Please extract the events from user provided sentence.
                An 'event' should contain one or more 'participants', which are the major participants in the event,
                and a 'trigger', which is a verb that describes what happens between the participants.
                The triggers and participants should be human-readable.
                Reply with each line being an event in the format:
                [trigger1], [participant 1], [participant 2], ...
            """
        },
        { "role": "system", "name": "example_user", "content": "Trump's inability to work with people beyond his base, as demonstrated by his comparison to Saddam Hussein's Iraq, is a major problem for the United States, as it requires the president to build bridges and form alliances in order to get things done."},
        { "role": "system", "name": "example_system", "content": "Problem, Trump, United States; \n Inable, Trump, work with, people beyond his base; \n Compare, Trump, Saddam Hussein's Iraq; \n Require, president, build bridges and form alliances;"},
        { "role": "user", "content": sentence}
    ]
    events = request_chatgpt_gpt4(messages)
    return events


In [None]:
# AllTheNews
AllTheNews_summarized = json.load(open(r'../data/raw/AllTheNews/summarized/2016_10p.json'))
res_events = []
error_datum = []
for index, datum in enumerate(AllTheNews_summarized):
    print('{}/{}'.format(index, len(AllTheNews_summarized)))
    sentence = strip_sentence(datum['summary'])
    events = extract_events(sentence)
    datum['events'] = events
    res_events.append(datum)
save_json(res_events, r'../data/raw/AllTheNews/events/2016_10p.json')

In [None]:
RAMS_summarized = json.load(open(r'../data/raw/RAMS/summarized/dev.json'))
res_events = []
error_datum = []
for index, datum in enumerate(RAMS_summarized):
    try:
        print('{}/{}'.format(index, len(RAMS_summarized)))
        sentence = strip_sentence(datum['summary'])
        events = extract_events(sentence)
        datum['events'] = events
        res_events.append(datum)
    except:
        error_datum.append(datum)
save_json(res_events, r'../data/raw/RAMS/events/dev.json')

In [12]:
from string import punctuation
def post_process_events(dataset):
    for index, datum in enumerate(dataset):
        datum['doc_id'] = index
        datum['events_raw'] = datum['events']
        events_str = datum['events'].split('\n')
        events = []
        for event_str in events_str:
            components = event_str.split(',')
            trigger = components[0].strip()
            arguments = [arg.strip().strip(punctuation) for arg in components[1:]]
            events.append({'trigger': trigger, 'arguments': arguments})

        datum['events'] = events
    return dataset

dataset = json.load(open(r'../data/raw/AllTheNews/events/2016_10p.json'))
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'../data/result/AllTheNews/2016_10p.json')



In [10]:
def explain_cluster(cluster_hyperedge_ids, hyperedges_dict, example_summaries, example_noun_phrase):
    summaries = [hyperedges_dict[hyperedge_id]['summary'] for hyperedge_id in cluster_hyperedge_ids]
    summaries_message = ""
    for index, summary in enumerate(summaries):
        summaries_message += "Article {}: \n".format(index+1)
        summaries_message += summary + '\n\n\n'
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a news article summarization system. 
                The user will provide you with a set of summarized news articles, your job is to further summarize them into one noun phrase.
            """
        },
        { "role": "system", "name": "example_user", "content": example_summaries},
        { "role": "system", "name": "example_system", "content": example_noun_phrase},
        { "role": "user", "content": summaries_message}
    ]
    cluster_summary = request_chatgpt_gpt4(messages)
    return cluster_summary

# def get_doc_id_partitions(partitions, hyperedges):
#     doc_id_partitions = []
#     for level in range(len(partitions)):
#         doc_id_labels = {}
#         for hyperedge_id, hyperedge in hyperedges.items():
#             doc_id = hyperedge['doc_id']
#             partition = partitions[level][hyperedge_id]
#             doc_id_labels[doc_id] = partition
#         doc_id_partitions.append(doc_id_labels)
#     return doc_id_partitions

def partition_to_cluster(partition):
    cluster_hyperedges = defaultdict(list)
    for hyperedge_id, cluster_label in partition.items():
        cluster_hyperedges[cluster_label].append(hyperedge_id)
    return cluster_hyperedges

def generate_summary_message(cluster_hyperedges, hyperedges):
    summaries = [hyperedges[hyperedge_id]['summary'] for hyperedge_id in cluster_hyperedges]
    summaries_message = ""
    for index, summary in enumerate(summaries):
        summaries_message += "Article {}: \n".format(index+1)
        summaries_message += summary + '\n\n\n'
    return summaries_message

In [6]:
partitions = json.load(open(r'../data/result/AllTheNews/network/ravasz_partitions.json'))
hyperedges = json.load(open(r'../data/result/AllTheNews/network/hyperedges.json'))
# doc_id_partitions = get_doc_id_partitions(partitions, hyperedges)
level_0_partition = partitions[0]
level_0_clusters = partition_to_cluster(level_0_partition)

In [36]:
level_1_partition = partitions[1]
level_1_clusters = partition_to_cluster(level_1_partition)

In [34]:
example_summaries_message = generate_summary_message(level_0_clusters[115], hyperedges)
example_topic = "Robotic Advancements and Concerns"
res = {}
count = 0
for cluster_label, hyperedge_ids in level_0_clusters.items():
    count += 1
    print("{}/{}".format(count, len(level_0_clusters)))
    try:
        explanation = explain_cluster(hyperedge_ids, hyperedges, example_summaries_message, example_topic)
        print(explanation)
        res[cluster_label] = {
            'cluster_label': cluster_label,
            'hyperedge_ids': hyperedge_ids,
            'explanation': explanation,
        }
    except:
        continue
save_json(res, r'../data/result/AllTheNews/cluster_summary/level0.json')

1/2390
Rising Violence in Chicago and Major Cities
2/2390
Uber's Coding Test in Seattle
3/2390
Uber Surge Pricing and User Experience
4/2390
Protests Against Trump's Election
5/2390
Robotic Advancements and Concerns
6/2390
The Orlando Shooting and Gun Control Debate
7/2390
Homelessness and Solutions
8/2390
Impact of GE's Move to Boston and Bernie Sanders' Criticism
9/2390
Swimming Pool Culture in Boston and Mumbai
10/2390
Gentrification and Housing Affordability
11/2390
Ongoing Conflicts in Yemen and South Sudan
12/2390
New York City's Presidential Connections
13/2390
Controversial Allegations Regarding Bill Clinton's Son
14/2390
Potential Threat in New York City
15/2390
MTA's L Train Shutdown and Property Impacts
16/2390
Public Transportation Issues in Major Cities
17/2390
Obama's final press conference and potential actions against Russia
18/2390
Protests against Milo Yiannopoulos and Free Speech at Universities
19/2390
Apple's Stock and Boeing's Production
20/2390
Illinois Credit Do

FileNotFoundError: [Errno 2] No such file or directory: '../data/result/AllTheNews/cluster_summary/level0.json'

In [35]:
save_json(res, r'../data/result/AllTheNews/cluster_summary/level0.json')

In [37]:
article_num_list = []
for cluster_label, hyperedge_ids in level_1_clusters.items():
    article_num_list.append(len(hyperedge_ids))
    print(cluster_label, len(hyperedge_ids))
print(max(article_num_list))

199 13
26 3
7 2
231 49
230 81
17 27
25 3
442 1
245 19
161 60
43 1
395 32
4 1
320 16
119 40
305 64
203 29
314 4
235 1
421 25
179 28
439 33
10 25
476 3
183 55
348 30
291 17
60 87
261 5
331 26
108 1
65 24
221 10
117 97
477 7
241 26
514 18
418 1
37 229
129 1
164 6
32 1
233 3
176 32
160 74
86 1
239 18
124 62
406 49
295 6
402 1
193 2
44 17
279 3
2 3
208 12
56 2
49 43
417 38
58 37
181 101
225 42
27 19
345 21
220 1
472 13
384 1
97 1
392 8
209 9
388 12
41 3
286 24
512 5
447 1
125 124
194 29
81 2
76 27
36 20
73 3
101 35
130 3
147 93
145 16
70 9
274 2
368 7
361 5
53 22
180 1
66 64
337 1
47 61
107 5
113 1
45 319
121 59
140 1
131 7
51 19
165 10
88 1
109 1
134 19
455 27
116 1
192 9
68 1
196 12
223 14
155 49
166 2
128 2
248 93
332 26
318 1
190 4
112 12
144 9
167 7
407 5
411 3
188 24
301 2
135 13
207 1
174 7
136 2
510 13
82 5
74 9
127 3
151 19
163 2
85 16
92 11
126 2
171 5
102 37
158 70
184 12
159 36
330 1
364 12
170 21
154 24
386 1
210 5
456 5
360 10
219 4
213 1
90 5
93 16
104 4
123 1
300 7
133 6
99 

In [45]:
# example_summaries_message = generate_summary_message(level_0_clusters[2], hyperedges)
# example_topic = "Gun Violence Incidents"
# example_summaries_message = generate_summary_message(level_0_clusters[115], hyperedges)
# example_topic = "Robotic Advancements and Concerns"
example_summaries_message = generate_summary_message(level_1_clusters[199], hyperedges)
example_topic = "Increase in Violent Crime in US Cities"

explanation = explain_cluster(level_1_clusters[349], hyperedges, example_summaries_message, example_topic)
print(explanation)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [43]:
example_summaries_message = generate_summary_message(level_1_clusters[245], hyperedges)
print(example_summaries_message)

Article 1: 
The article discussed the U.S. military's withdrawal of personnel from Saudi Arabia who were coordinating with the air campaign in Yemen, reducing Washington's involvement in advising a campaign that has caused civilian casualties. The Pentagon acknowledged concerns about the conflict and expressed the need to minimize civilian casualties. Additionally, the article mentioned the attack on a hospital operated by Medecins Sans Frontieres in Yemen, which led to the evacuation of staff and raised concerns about arms transfers to Saudi Arabia. In another news, government supporters in Venezuela attacked and besieged lawmakers in the country's congress.


Article 2: 
The article discussed the White House's immediate review of Washington's support for the Arab campaign against Yemen's Houthis after an air strike on a funeral gathering, which killed 140 people. The attack poses more trouble for the coalition and indirectly hands a propaganda win to Tehran, a Houthi ally. The articl

In [17]:
for cluster_label, ids in level_0_clusters.items():
    print(cluster_label, len(ids))

2 8
0 1
8 2
1304 2
115 8
542 10
1540 6
617 3
1098 1
441 4
513 7
235 1
14 1
2167 3
1 1
1861 8
64 12
604 8
1216 6
892 1
445 1
1505 8
485 3
1114 9
884 9
2018 2
111 7
839 3
1089 1
968 8
3 3
2380 8
378 15
34 1
851 3
167 1
2243 3
1788 5
48 8
1196 5
486 5
1269 3
1016 1
73 8
60 1
130 2
131 1
278 1
27 1
642 3
958 9
1346 5
2008 3
24 1
555 3
4 1
2263 12
109 3
1383 5
928 3
1894 5
1697 1
847 2
2204 2
420 1
1204 1
1536 3
1234 2
1502 2
1565 3
1253 2
646 2
175 2
814 1
2111 3
421 4
58 6
1379 3
2212 9
5 4
6 13
751 2
1864 1
7 1
72 1
1006 1
664 6
1038 4
1121 2
1871 3
9 2
322 5
1722 2
998 3
670 5
1461 1
1680 1
71 8
301 10
26 5
1027 2
360 1
57 3
283 18
10 2
295 3
1421 8
84 3
11 1
195 14
363 13
265 5
560 3
346 8
241 25
722 2
12 7
47 3
440 4
2118 4
49 1
1932 3
13 2
208 1
90 3
92 11
2215 3
2007 1
380 5
219 6
129 3
45 1
190 10
122 19
50 3
81 1
15 8
796 8
1317 6
144 1
249 9
797 3
140 7
369 1
414 3
33 5
20 7
16 3
44 1
117 1
399 5
125 19
362 6
403 1
999 1
402 2
543 8
831 7
541 1
832 8
1908 2
19 1
187 8
150 1
898 4

In [None]:
example_summaries_message = generate_summary_message(level_0_clusters[115], hyperedges)
example_topic = "Robotic Advancements and Concerns"
example = {
    "summaries": example_summaries_message,
    "topic": example_topic
}
save_json(example, r'../data/result/AllTheNews/cluster_summary/example.json')

