In [None]:
import json
import requests
import csv
import jsonlines
from flask import Flask, redirect, render_template, request, url_for
import sys
import numpy as np
from collections import defaultdict
import random
import math
# Raise the csv per-field size cap as high as the platform allows (article bodies
# exceed the default limit). sys.maxsize does not fit in a C long on platforms
# where long is 32-bit (e.g. Windows) and raises OverflowError there, so back off
# until the value is accepted. Where sys.maxsize fits, it is used unchanged.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 10

In [None]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path. Missing parent directories are created.

    Raises:
        TypeError: If ``data`` is not JSON-serializable. The destination file
            is left untouched in that case.
    """
    import os  # local import so the function does not depend on the import cell

    # Serialize before opening the file: a failure here must not truncate an
    # existing file or leave a half-written one behind.
    serialized = json.dumps(data, indent=4)

    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as fp:
        fp.write(serialized)

In [None]:
def request_chatgpt(prompt):
    """Send a completion-style prompt to the local event-extraction endpoint.

    The prompt is posted as ``{"prompt": prompt}`` and the reply is parsed as
    JSON.

    Returns:
        The ``text`` of the first entry in the reply's ``choices`` list, with
        surrounding whitespace removed.
    """
    endpoint = "http://127.0.0.1:5000/event_extraction"
    payload = requests.post(endpoint, json={"prompt": prompt}).json()
    first_choice = payload['choices'][0]
    return first_choice['text'].strip()
    
def request_chatgpt_gpt4(messages, functions=None):
    """Send chat messages to the local event-extraction endpoint.

    Args:
        messages: List of chat message dicts forwarded under ``"messages"``.
        functions: Optional function specifications forwarded under
            ``"functions"``; omitted from the request body when ``None``.

    Returns:
        The stripped ``content`` of the first choice's message. When
        ``functions`` was supplied and the message carries a ``function_call``,
        the decoded JSON ``arguments`` of that call are returned instead.
    """
    endpoint = "http://127.0.0.1:5000/event_extraction"

    payload = {"messages": messages}
    if functions is not None:
        payload["functions"] = functions

    reply = requests.post(endpoint, json=payload).json()['choices'][0]['message']

    # Without function specs the reply is always treated as plain text.
    if functions is None:
        return reply['content'].strip()

    if reply.get("function_call"):
        call_arguments = json.loads(reply['function_call']['arguments'])
        print("function called!")
        return call_arguments

    return reply['content'].strip()



In [None]:
# RAMS
# Read the whole dev split into memory; the context manager closes the
# underlying file as soon as the records have been consumed.
with jsonlines.open(r'../data/raw/RAMS/dev.jsonlines') as dev_reader:
    dataset = list(dev_reader)

In [None]:
# All The News 1.0
# Read the three CSV shards into one list of row dicts. The files are opened
# with newline='' (as the csv module requires, so quoted fields that contain
# line breaks are parsed correctly) and are closed when the block exits.
with open("../data/raw/AllTheNews/articles1.csv", newline='') as shard1, \
        open("../data/raw/AllTheNews/articles2.csv", newline='') as shard2, \
        open("../data/raw/AllTheNews/articles3.csv", newline='') as shard3:
    articles1_csv = csv.DictReader(shard1)
    articles2_csv = csv.DictReader(shard2)
    articles3_csv = csv.DictReader(shard3)
    dataset = list(articles1_csv) + list(articles2_csv) + list(articles3_csv)

In [None]:
# clean AllTheNews
# Keep the rows whose 'year' field is '2016.0', group them by publication and
# draw a 10% sample (rounded down) from every publication.
SAMPLE_SEED = 42
# Dedicated, seeded generator: re-running the cell yields the same sample and
# the global `random` state is left alone.
sampler = random.Random(SAMPLE_SEED)

dataset_2016 = [datum for datum in dataset if datum['year'] == '2016.0']
articles_grouped = defaultdict(list)
for datum in dataset_2016:
    articles_grouped[datum['publication']].append(datum)

random_selected_dataset = []
for publication, articles in articles_grouped.items():
    total_articles = len(articles)
    random_samples = sampler.sample(articles, total_articles // 10)
    print(len(random_samples), total_articles)
    random_selected_dataset += random_samples
save_json(random_selected_dataset, r'../data/raw/AllTheNews/cleaned/2016_10p.json')

In [None]:
def merge_sentences(datum_sentences):
    """Flatten tokenized sentences into a single space-separated string.

    Args:
        datum_sentences: Iterable of sentences, each an iterable of word
            strings.

    Returns:
        Each sentence's words joined by a space, and the resulting sentences
        joined by a space in turn.
    """
    return " ".join(map(" ".join, datum_sentences))


In [None]:
def get_arguments(article, model="gpt-3.5-turbo-0613"):
    """Ask the language model for the main characters of a news article.

    Args:
        article: Full text of the article.
        model: Selects the request style. ``"gpt-3.5-turbo-0613"`` sends a
            system + user chat exchange through ``request_chatgpt_gpt4``; any
            other value sends one formatted prompt through ``request_chatgpt``.

    Returns:
        Whatever the selected request helper returns for the prompt.
    """
    if model != "gpt-3.5-turbo-0613":
        # Completion-style request: the article is embedded in a single prompt.
        completion_prompt = """
        Below is a news article of an event.
        Please describe the main characters that the news article discussed, the character can be any organization, person or location.
        It can have one or more characters.
        Reply in the format '[character 1] [character 2]...'
        Article: \n {article}
        """.format(article=article)
        return request_chatgpt(completion_prompt)

    # Chat-style request. The whitespace inside the triple-quoted text is part
    # of the prompt that gets sent, so it is kept exactly as is.
    extraction_instruction = """
                    You are an extraction system that extracts the main characters of a news article.
                    The main characters can be any organization, person or location that heavily involved in the event described by the news article.
                    The user will provide you with a news article to extract.
                    Reply in the format '[character 1] [character 2]...'
                """
    chat_history = [
        {"role": "system", "content": extraction_instruction},
        {"role": "user", "content": article},
    ]
    return request_chatgpt_gpt4(chat_history)


In [77]:
def summarize_sentence(article, arguments, model="gpt-3.5-turbo-0613"):
    """Ask the language model to summarize what happened between the main characters.

    Args:
        article: Full text of the article.
        arguments: The main characters, either as one string (the form
            ``get_arguments`` returns) or as an iterable of strings.
        model: Selects the request style. ``"gpt-3.5-turbo-0613"`` sends a
            system + user chat exchange through ``request_chatgpt_gpt4``; any
            other value sends one formatted prompt through ``request_chatgpt``.

    Returns:
        Whatever the selected request helper returns for the prompt.
    """
    if model == "gpt-3.5-turbo-0613":
        messages = [ 
            {
                "role": "system", 
                "content": """
                    You are an summarization system that summarizes the events that happened between the main characters of a news article.
                    The user will provide you with a list of main characters and a news article to summarize.
                    Try to summarize the article with no more than three sentences. 
                    Reply starts with 'The article discussed ...'
                """
            },
            {
                "role": "user", "content": "Main Characters:\n{arguments} \n\n\n Article: {article}".format(arguments=arguments, article=article)
            } 
        ]
        sentence = request_chatgpt_gpt4(messages)
    else:
        # A str is itself iterable, so ", ".join() on it would interleave
        # ", " between its characters. Only join real collections.
        if isinstance(arguments, str):
            participants = arguments
        else:
            participants = ", ".join(arguments)
        # NOTE(review): 'discud' below looks like a typo for 'discussed'; it is
        # left untouched so the prompt text sent to the model does not change.
        prompt = """
        Below is a news article of an event.
        The major participants in the articles are: {participants}.
        Please describe what the article discud about them in one sentence.
        Reply starts with 'The article discussed ...'
        Article: \n {article}
        """.format(participants=participants, article=article)
        sentence = request_chatgpt(prompt)
    return sentence


In [79]:
# All The News
# Summarize the sampled articles and store each summary next to the article's
# metadata.
MAX_ARTICLES = 5  # trial-run cap; set to None to process every article
COPIED_FIELDS = ('id', 'content', 'title', 'publication', 'author', 'url', 'date')

with open(r'../data/raw/AllTheNews/cleaned/2016_10p.json') as fp:
    dataset = json.load(fp)

saved_dataset = []
for count, datum in enumerate(dataset[:MAX_ARTICLES], start=1):
    print("{}/{}".format(count, len(dataset)))
    try:
        article = datum['content']
        arguments = get_arguments(article)
        sentence = summarize_sentence(article, arguments)
        saved_datum = {field: datum[field] for field in COPIED_FIELDS}
        saved_datum['summary'] = sentence
        saved_dataset.append(saved_datum)
    except Exception as exc:
        # Best effort: skip the failing article but say why. Catching Exception
        # (not a bare except) lets KeyboardInterrupt still stop the run.
        print("skipped article {}: {!r}".format(count, exc))
save_json(saved_dataset, r'../data/raw/AllTheNews/summarized/2016_10p.json')

1/8534
2/8534
3/8534
4/8534
5/8534


In [None]:
# RAMS
# The name `dataset` is rebound to All The News articles by other cells, so the
# RAMS dev split is loaded here under its own name. This keeps the cell valid
# on a top-to-bottom run.
with jsonlines.open(r'../data/raw/RAMS/dev.jsonlines') as rams_reader:
    rams_dataset = list(rams_reader)

saved_dataset = []
for datum in rams_dataset:
    saved_datum = {}
    # RAMS stores each document as a list of tokenized sentences.
    article = merge_sentences(datum['sentences'])
    arguments = get_arguments(article)
    sentence = summarize_sentence(article, arguments)
    print(sentence)
    saved_datum['content'] = datum['sentences']
    saved_datum['url'] = datum['source_url']
    saved_datum['summary'] = sentence
    saved_dataset.append(saved_datum)
save_json(saved_dataset, r'../data/raw/RAMS/summarized/dev.json')

In [75]:
def strip_sentence(sentence):
    """Remove the leading 'The article discussed [how]' boilerplate from a summary.

    Only the prefix at the start of the string is removed; later occurrences of
    the same phrase are kept.

    Args:
        sentence: Summary text.

    Returns:
        The text after the prefix, stripped of surrounding whitespace. When the
        summary does not start with the expected prefix, "!!!" is printed as a
        warning and the stripped summary is returned unchanged.
    """
    # The longer prefix must be tried first: the shorter one is its prefix.
    for prefix in ('The article discussed how', 'The article discussed'):
        if sentence.startswith(prefix):
            return sentence[len(prefix):].strip()
    print("!!!")
    return sentence.strip()

In [70]:
def extract_events(sentence):
    """Ask the language model to list the events contained in a sentence.

    The request consists of a system instruction, one worked example (given as
    two named system messages) and the user's sentence.

    Args:
        sentence: Text to extract events from.

    Returns:
        Whatever ``request_chatgpt_gpt4`` returns for the assembled messages.
    """
    # The whitespace inside the triple-quoted text is part of the prompt that
    # gets sent, so it is kept exactly as is.
    system_instruction = """
                You are an event extraction system. Please extract the events from user provided sentence.
                An 'event' should contain one or more 'participants', which are the major participants in the event,
                and a 'trigger', which is a verb that describes what happens between the participants.
                The triggers and participants should be human-readable.
                Reply with each line being an event in the format:
                [trigger1], [participant 1], [participant 2], ...
            """
    example_input = "Trump's inability to work with people beyond his base, as demonstrated by his comparison to Saddam Hussein's Iraq, is a major problem for the United States, as it requires the president to build bridges and form alliances in order to get things done."
    example_output = "Problem, Trump, United States; \n Inable, Trump, work with, people beyond his base; \n Compare, Trump, Saddam Hussein's Iraq; \n Require, president, build bridges and form alliances;"

    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "system", "name": "example_user", "content": example_input},
        {"role": "system", "name": "example_system", "content": example_output},
        {"role": "user", "content": sentence},
    ]
    return request_chatgpt_gpt4(conversation)


In [80]:
# AllTheNews
# Extract events from every stored summary. Items that fail are collected in
# `error_datum` instead of aborting the run, so results gathered so far are
# still saved.
with open(r'../data/raw/AllTheNews/summarized/2016_10p.json') as fp:
    AllTheNews_summarized = json.load(fp)

res_events = []
error_datum = []
for index, datum in enumerate(AllTheNews_summarized):
    print('{}/{}'.format(index, len(AllTheNews_summarized)))
    try:
        sentence = strip_sentence(datum['summary'])
        events = extract_events(sentence)
    except Exception as exc:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
        print('failed on item {}: {!r}'.format(index, exc))
        error_datum.append(datum)
        continue
    datum['events'] = events
    res_events.append(datum)
save_json(res_events, r'../data/raw/AllTheNews/events/2016_10p.json')

0/5
1/5
2/5
3/5
4/5


In [None]:
# Extract events from every stored RAMS summary. Items that fail are collected
# in `error_datum` and reported; the remaining results are still saved.
with open(r'../data/raw/RAMS/summarized/dev.json') as fp:
    RAMS_summarized = json.load(fp)

res_events = []
error_datum = []
for index, datum in enumerate(RAMS_summarized):
    print('{}/{}'.format(index, len(RAMS_summarized)))
    try:
        sentence = strip_sentence(datum['summary'])
        events = extract_events(sentence)
        datum['events'] = events
        res_events.append(datum)
    except Exception as exc:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the
        # run, and printing the error keeps failures visible.
        print('failed on item {}: {!r}'.format(index, exc))
        error_datum.append(datum)
save_json(res_events, r'../data/raw/RAMS/events/dev.json')

In [None]:
from string import punctuation
def post_process_events(dataset):
    """Parse each datum's raw event text into structured events, in place.

    For every datum, ``doc_id`` is set to its position in ``dataset``, the
    original text is kept under ``events_raw`` and ``events`` is replaced by a
    list of ``{'trigger': str, 'arguments': [str, ...]}`` dicts. Each
    non-blank line of the raw text is one event: comma-separated, trigger
    first, then the arguments.

    Args:
        dataset: List of dicts whose ``'events'`` value is a string.

    Returns:
        The same list object, with its dicts updated.
    """
    for index, datum in enumerate(dataset):
        datum['doc_id'] = index
        datum['events_raw'] = datum['events']
        events = []
        for event_str in datum['events'].split('\n'):
            # Blank lines (e.g. a trailing newline) carry no event.
            if not event_str.strip():
                continue
            components = event_str.split(',')
            trigger = components[0].strip()
            # Trim whitespace and surrounding punctuation (such as a closing
            # ';'); drop pieces that end up empty, e.g. after a trailing comma.
            cleaned = (arg.strip().strip(punctuation) for arg in components[1:])
            arguments = [arg for arg in cleaned if arg]
            events.append({'trigger': trigger, 'arguments': arguments})
        datum['events'] = events
    return dataset

# Turn the raw RAMS event text into structured events and save the result.
# The input file is closed as soon as it has been parsed.
with open(r'../data/raw/RAMS/events/dev.json') as fp:
    dataset = json.load(fp)
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'../data/result/RAMS/gpt_events_dev.json')

