In [2]:
import json
import numpy as np
from collections import defaultdict
import openai
import re
#csv.field_size_limit(sys.maxsize)

In [3]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    The file is opened in text write mode, so any existing content is replaced.
    """
    with open(filepath, 'w') as out_file:
        serialized = json.dumps(data, indent=4)
        out_file.write(serialized)

In [4]:
# Load the OpenAI API key from the local "api_key" file.
# The file is closed deterministically, and surrounding whitespace (typically a
# trailing newline added by editors) is removed so the key is sent verbatim.
with open("api_key") as key_file:
    api_key = key_file.read().strip()
openai.api_key = api_key

In [22]:
def call_gpt(messages, model="gpt-3.5-turbo-16k-0613"):
    """Send ``messages`` to ``openai.ChatCompletion.create`` and return the reply text.

    Args:
        messages: list of chat message dicts passed through unchanged.
        model: model name forwarded to the API call.

    Returns:
        The ``content`` of the first choice's message, with surrounding
        whitespace stripped.
    """
    # The full prompt is echoed to stdout on every call.
    print(messages)
    response = openai.ChatCompletion.create(model=model, messages=messages)
    first_choice = response['choices'][0]
    return first_choice['message']['content'].strip()

In [16]:
def merge_sentences(datum_sentences):
    """Flatten tokenized sentences into one space-separated paragraph string.

    Args:
        datum_sentences: iterable of sentences, each an iterable of word strings.

    Returns:
        Every sentence's words joined by single spaces, with the sentences
        themselves joined by single spaces.
    """
    sentences = []
    for words in datum_sentences:
        sentences.append(" ".join(words))
    return " ".join(sentences)

In [32]:
def get_arguments(article):
    """Ask the chat model for the main characters of a news article.

    Args:
        article: the article text, sent as the user message.

    Returns:
        The model's reply as returned by ``call_gpt``; the system prompt asks
        for the format '[character 1] [character 2]...'.
    """
    messages = [ 
        {
            "role": "system", 
            "content": """
                You are an extraction system that extracts the main characters of a news article.
                The main characters can be any organization, person or location that are heavily involved in the event described by the news article.
                The user will provide you with a news article to extract.
                Reply in the format '[character 1] [character 2]...'
            """
        },
        {
            "role": "user", "content": article
        } 
    ]
    arguments = call_gpt(messages)
    # Previously the reply was computed and then dropped, so callers always got None.
    return arguments

In [17]:
def extract_event(sentence):
    """Ask the chat model for the single most important event in ``sentence``.

    The prompt consists of a system instruction, one worked example (given as
    two named system messages) and the user's text.

    Args:
        sentence: the article or summary text to extract from.

    Returns:
        The model's reply as returned by ``call_gpt``; the prompt requests a
        single line in the form '[event - category]'.
    """
    system_prompt = """
                You are an event extraction system. Your task is to extract only the most important event from news articles. 
                Strictly extract only one event. This event should be the most important event in the article.
                Also extract the category of the event.
                The event and category should be human-readable. 
                Reply in this format: 
                [event - category]
                Do not reply more than a line. The news article will be provided by the user.
            """
    example_article = "The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history."
    example_reply = "Russia's doping scandal - sports scandal"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "system", "name": "example_user", "content": example_article},
        {"role": "system", "name": "example_system", "content": example_reply},
        {"role": "user", "content": sentence},
    ]
    return call_gpt(messages)

In [7]:
def extract_participants(sentence, event):
    """Ask the chat model for the (at most two) main participants of ``event``.

    Args:
        sentence: the article or summary text.
        event: the event string previously extracted for that text.

    Returns:
        The model's reply as returned by ``call_gpt``; the prompt requests the
        form '[main participant 1],[main participant 2];'.
    """
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a named entity recognition model. You will be given an article and the event recognized in that article by the user.
                The format of the input defining event in article will be:
                [event - category];
                Extract the main participants involved in that event.
                The number of main participants should be 2 or less strictly. 
                Ignore any other participants other than the two main participants.
                Be as concise as possible.
                Reply with the following format:
                [main participant 1],[main participant 2];
            """
        },   
        { "role": "system", "name": "example_user", "content": """
        Article: \n The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history.
        Event: \n [Russia's doping scandal - sports scandal]
        """},
        { "role": "system", "name": "example_system", "content": "[Russia],[Dr. Sergei Portugalov];"},   
        # The f-string already interpolates both values. The former trailing
        # .format(sentence, event) re-parsed the *result*, so any '{' or '}' in
        # the article/event text raised or was silently substituted.
        { "role": "user", "content": f"Article: \n {sentence} \n Event: \n {event}"},
    ]
    participants = call_gpt(messages)
    return participants

In [29]:
def merged_all(sentence, event, participants):
    """Ask the chat model for the arguments and roles of one event in an article.

    Args:
        sentence: the article or summary text.
        event: the extracted event string (e.g. '[event - trigger]').
        participants: the main participants, either as one already formatted
            string or as an iterable of participant strings.

    Returns:
        The model's reply as returned by ``call_gpt``.
    """
    # The original body interpolated an undefined name (`cl`) and ignored the
    # `event` and `participants` parameters, so it raised NameError on a fresh
    # kernel. The event graph is now built from the parameters, following the
    # '[event - trigger],[main participant 1],[main participant 2]' layout the
    # system prompt describes.
    # NOTE(review): the exact text `cl` used to hold is not visible here - confirm.
    if isinstance(participants, str):
        participants_text = participants
    else:
        participants_text = ",".join(str(participant) for participant in participants)
    event_graph = f"{event},{participants_text}"
    messages = [
        { 
            "role": "system", 
            "content": """
                You will be given an article and the event graph of one event in that article by the user.
                The format of the event graph will be:
                [event - trigger],[main participant 1],[main participant 2];
                Arguments are participants or elements that play specific roles within the context of an event.
                Extract the main arguments and their roles involved in each event.                
                Reply in JSON format with each line being an event in the format:
                [event - trigger],[main participant 1],[main participant 2]:[argument type 1 - argument 1],[argument type 2 - argument 2],...; 
            """
        },   
        { "role": "system", "name": "example_user", "content": "The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history."},
        { "role": "system", "name": "example_system", "content": "[scandal-Russia's current doping scandal]: [Russia],[Dr. Sergei Portugalov]:[country - Russia],[mastermind - Dr. Sergei Portugalov],[affected team - track and field team],[event - Rio Games];\n"},   
        { "role": "user", "content": f"This is the input defining the extracted event and the main participants, strictly use only this event for further tasks  :{event_graph}"},  
        { "role": "user", "content": f"This is the news article:{sentence}"},
    ]
    events = call_gpt(messages)
    return events

In [21]:
def summarize_sentence(article, arguments, model="gpt-3.5-turbo-0613"):
    """Ask the chat model to summarize an article around its main characters.

    Args:
        article: the article text.
        arguments: the main characters, inserted verbatim into the user message.
        model: model name forwarded to ``call_gpt``.

    Returns:
        The model's reply as returned by ``call_gpt``; the prompt asks for at
        most three sentences starting with 'The article discussed ...'.
    """
    messages = [ 
        {
            "role": "system", 
            "content": """
                You are an summarization system that summarizes the events that happened between the main characters of a news article.
                The user will provide you with a list of main characters and a news article to summarize.
                Try to summarize the article with no more than three sentences. 
                Reply starts with 'The article discussed ...'
            """
        },
        {
            "role": "user", "content": "Main Characters:\n{arguments} \n\n\n Article: {article}".format(arguments=arguments, article=article)
        } 
    ]
    # `model` used to be accepted but silently ignored (call_gpt always fell
    # back to its own default); forward it so the caller's choice takes effect.
    sentence = call_gpt(messages, model=model)
    return sentence

In [29]:
from string import punctuation
def strip_sentence(sentence):
    """Remove the summarizer's lead-in phrase from the start of ``sentence``.

    Only a leading 'The article discussed how' / 'The article discussed' is
    removed. Later occurrences of the phrase inside the text are preserved
    (the previous ``str.replace`` call deleted every occurrence).

    Args:
        sentence: summary text, possibly starting with the lead-in phrase.

    Returns:
        The text without the lead-in and without surrounding whitespace.
    """
    text = sentence.strip()
    # Longest prefix first so that the trailing 'how' is removed when present.
    for prefix in ('The article discussed how', 'The article discussed'):
        if text.startswith(prefix):
            return text[len(prefix):].strip()
    return text
def strip_gpt_response(res):
    """Trim leading and trailing ASCII punctuation characters from ``res``.

    Whitespace is not in ``string.punctuation``, so it is left untouched and
    stops the trimming on whichever side it appears.
    """
    without_leading = res.lstrip(punctuation)
    return without_leading.rstrip(punctuation)

In [9]:
# Load the summarized articles and the events extracted so far, closing each
# file as soon as it has been parsed, then report how many records each holds.
with open(r'data/raw/AllTheNews/summarized/2016_10p.json') as summarized_file:
    AllTheNews_summarized = json.load(summarized_file)
with open(r'data/raw/AllTheNews/events/2016_10p_0819.json') as extracted_file:
    AllTheNews_extracted = json.load(extracted_file)
print(len(AllTheNews_summarized), len(AllTheNews_extracted))

7649 7604


In [33]:
import string
# Resumable event extraction: every summarized article whose id is not yet in
# the extracted file is sent through extract_event / extract_participants, and
# the extracted file is rewritten after each article as a checkpoint.
SUMMARIZED_PATH = r'data/raw/AllTheNews/summarized/2016_10p.json'
EXTRACTED_PATH = r'data/raw/AllTheNews/events/2016_10p_0819.json'

with open(SUMMARIZED_PATH) as summarized_file:
    AllTheNews_summarized = json.load(summarized_file)
with open(EXTRACTED_PATH) as extracted_file:
    AllTheNews_extracted = json.load(extracted_file)

# A set gives constant-time membership tests (the list made the loop quadratic).
extracted_ids = {datum['id'] for datum in AllTheNews_extracted}
error_datum = []
for index, datum in enumerate(AllTheNews_summarized):
    if datum['id'] in extracted_ids:
        continue
    print('{}/{}: {}'.format(index, len(AllTheNews_summarized), datum['id']))
    sentence = strip_sentence(datum['summary'])
    # gpt
    # Keep only the first line in case the model answered with several.
    event = extract_event(sentence).split("\n")[0]
    print(event)
    raw_participants = extract_participants(sentence, event).split(",")
    # strip
    stripped = strip_gpt_response(event.strip())
    # Split on the LAST ' - ' only: a title that itself contains ' - ' used to
    # make the two-name unpacking raise ValueError.
    title, separator, category = stripped.rpartition(" - ")
    if separator:
        event, event_type = title, category
    else:
        event, event_type = stripped, "None"
    # Trim whitespace first; otherwise a leading space (as in "[A], [B]")
    # shields the opening bracket from the punctuation strip.
    participants = [strip_gpt_response(name.strip()) for name in raw_participants]
    datum['events'] = {
        "title": event.strip(),
        "type": event_type.strip(),
        "participants": participants
    }
    print("------------------------------------------")
    AllTheNews_extracted.append(datum)
    extracted_ids.add(datum['id'])
    save_json(AllTheNews_extracted, EXTRACTED_PATH)

124/7649: 25675
[{'role': 'system', 'content': '\n                You are an event extraction system. Your task is to extract only the most important event from news articles. \n                Strictly extract only one event. This event should be the most important event in the article.\n                Also extract the category of the event.\n                The event and category should be human-readable. \n                Reply in this format: \n                [event - category]\n                Do not reply more than a line. The news article will be provided by the user.\n            '}, {'role': 'system', 'name': 'example_user', 'content': "The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the dop

In [34]:
# Sanity check: after the extraction loop both collections should be the same size.
summarized_count = len(AllTheNews_summarized)
extracted_count = len(AllTheNews_extracted)
print(summarized_count, extracted_count)

7649 7649


In [31]:
from string import punctuation
def _split_label_value(text):
    """Split 'label - value' (or 'label-value') at the first separator only."""
    separator = ' - ' if ' - ' in text else '-'
    label, _, value = text.partition(separator)
    return label, value


def _parse_event_line(event_str):
    """Parse one '[type - trigger]:[p1],[p2]:[role - value],...;' line into a dict."""
    # At most three sections; colons inside the argument section are kept.
    components = event_str.split(':', 2)

    raw_type, raw_trigger = _split_label_value(components[0])
    event_type = raw_type.strip().strip(punctuation).strip()
    # Fall back to the event type when the header carries no trigger.
    trigger = raw_trigger.strip().strip(punctuation) or event_type

    main_characters = []
    if len(components) > 1:
        for raw_name in components[1].split(','):
            name = raw_name.strip().strip(punctuation).strip()
            if name:
                main_characters.append(name)

    arguments = []
    if len(components) > 2:
        # Tolerate optional whitespace around the comma between '[..]' groups.
        for chunk in re.split(r'\]\s*,\s*\[', components[2]):
            raw_role, raw_value = _split_label_value(chunk)
            arg_type = raw_role.strip().replace('[', '').replace(']', '')
            arg_value = raw_value.strip().strip(punctuation)
            if arg_type or arg_value:
                arguments.append({arg_type: arg_value})

    return {'event_type': event_type, 'trigger': trigger,
            'Main Participants': main_characters, 'Arguments': arguments}


def post_process_events(dataset):
    """Turn each datum's raw event-graph text into a list of structured events.

    Every datum is modified in place:
      * ``doc_id`` is set to its position in ``dataset``;
      * ``events_raw`` keeps the original ``events`` string;
      * ``events`` becomes a list of dicts with the keys ``event_type``,
        ``trigger``, ``Main Participants`` and ``Arguments`` (a list of
        single-entry ``{role: value}`` dicts), one per non-blank line.

    Compared with the previous implementation:
      * blank lines are skipped and lines without ':' sections produce an
        event with empty participants/arguments instead of raising IndexError;
      * label/value pairs are split at the first separator only, so hyphens
        inside a trigger or argument value are preserved rather than being
        truncated or turned into spaces;
      * participants are stripped of surrounding whitespace and punctuation,
        like triggers and argument values already were.

    Args:
        dataset: list of dicts, each with a string under ``'events'``.

    Returns:
        The same ``dataset`` list.
    """
    for index, datum in enumerate(dataset):
        datum['doc_id'] = index
        datum['events_raw'] = datum['events']
        events = []
        for event_str in datum['events'].split('\n'):
            event_str = event_str.strip()
            if not event_str:
                continue
            events.append(_parse_event_line(event_str))
        datum['events'] = events
    return dataset

# Load the merged event graphs, structure them, and write the result.
# The input file is closed as soon as it has been parsed.
with open(r'../All the News/events/merged_test_1.json') as dataset_file:
    dataset = json.load(dataset_file)
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'../All the News/result/merged_test_1.json')


['[admitted killings - four people],[possible link to deaths - seven deaths],[charged with kidnapping - Kala V. Brown],[discovered body - Charles D. Carver];']
["[history - extensive history of doping in Russia],[instructions - detailed instructions to inject top athletes with anabolic steroids],[purpose - ensure dominance at the Los Angeles Olympics],[mastermind - Dr. Sergei Portugalov],[penalty - banning of Russia's track and field team from the Rio Games];"]
["[participant 1 - young gay actor in California],[participant 2 - father in Kentucky],[conflicting views - election],[struggle - acceptance of son's sexuality];"]
['[residents - Palestinians],[responsible party - Hamas],[concern - potential targets for Israeli strikes],[criticism - Hamas];']
['[music manager - Jerry Heller],[music group - N. W. A],[record label - Ruthless Records],[album - "Straight Outta Compton"],[controversy - N. W. A\'s lyrics],[acrimony - group members and Mr. Heller];']
['[officers killed - Brent Thompson