In [39]:
import csv
import json
import math
import os
import random
import sys
from collections import defaultdict

import jsonlines
import numpy as np
import openai
import requests
from flask import Flask, redirect, render_template, request, url_for
# Raise the csv module's per-field size limit to 2147483647 (2**31 - 1).
# NOTE(review): presumably needed because some article bodies exceed the default limit -- confirm.
csv.field_size_limit(2147483647)

2147483647

In [40]:
def save_json(data, filepath=r'new_data.json'):
    """Serialize `data` as JSON (4-space indentation) into the file at `filepath`.

    The target file is opened in text write mode, so existing content is overwritten.
    """
    with open(filepath, mode='w') as json_file:
        json.dump(obj=data, fp=json_file, indent=4)

In [41]:
# Read the API key from the environment instead of embedding it in the notebook.
# Falls back to the empty string (the previous literal value) when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [42]:
def _read_csv_rows(csv_path):
    """Return every row of the CSV at `csv_path` as a list of dicts, closing the file afterwards."""
    # newline='' is what the csv module documents for files passed to its readers,
    # so line breaks embedded in quoted fields are interpreted by csv itself.
    with open(csv_path, newline='') as csv_file:
        return list(csv.DictReader(csv_file))


ARTICLE_CSV_PATHS = [
    "D:/Projects/Test/All the News/data/articles1.csv",
    "D:/Projects/Test/All the News/data/articles2.csv",
    "D:/Projects/Test/All the News/data/articles3.csv",
]

# Concatenate the rows of all three files, in file order, into one list of dicts.
dataset = [datum for csv_path in ARTICLE_CSV_PATHS for datum in _read_csv_rows(csv_path)]

In [43]:
# clean AllTheNews
SAMPLE_SEED = 2016          # fixed seed so the per-publication sample is reproducible
SAMPLE_DIVISOR = 10         # keep len(articles) // SAMPLE_DIVISOR articles per publication
MAX_SELECTED_ARTICLES = 20  # cap on how many sampled articles are written out

# Keep only rows whose 'year' column is the string '2016.0'.
dataset_2016 = [datum for datum in dataset if datum['year'] == '2016.0']

# Group the 2016 articles by their 'publication' column.
articles_grouped = defaultdict(list)
for datum in dataset_2016:
    articles_grouped[datum['publication']].append(datum)

# A dedicated, seeded generator keeps the sample stable without touching the global random state.
sampler = random.Random(SAMPLE_SEED)
random_selected_dataset = []
for publication, articles in articles_grouped.items():
    total_articles = len(articles)
    random_samples = sampler.sample(articles, total_articles // SAMPLE_DIVISOR)
    print(len(random_samples), total_articles)
    random_selected_dataset += random_samples

# NOTE(review): this slice keeps only the first MAX_SELECTED_ARTICLES sampled articles, so when the
# first publication contributes at least that many samples, every saved article comes from it.
# Looks like a debugging cap -- confirm before a full run.
random_selected_dataset = random_selected_dataset[0:MAX_SELECTED_ARTICLES]
save_json(random_selected_dataset, r'D:/Projects/Test/All the News/data/processed_data/2016_10p.json')

362 3628
1610 16100
410 4104
459 4596
515 5150
432 4322
248 2489
325 3254
419 4190
898 8985
428 4284
764 7647
616 6165
325 3257
723 7234


In [44]:
def call_gpt(messages, model="gpt-3.5-turbo-0613"):
    """Send `messages` to `openai.ChatCompletion.create` and return the reply text.

    A single completion is requested (n=1) at temperature 0.0 with no stop sequence.
    The returned string is the first choice's message content with surrounding
    whitespace stripped.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": 0.0,
        "n": 1,
        "stop": None,
    }
    response = openai.ChatCompletion.create(**request_options)
    first_choice = response['choices'][0]
    return first_choice['message']['content'].strip()

In [45]:
def merge_sentences(datum_sentences):
    """Flatten a list of tokenized sentences into one space-separated string.

    Each element of `datum_sentences` is an iterable of word strings; words are joined
    with single spaces into sentences, and the sentences are joined with single spaces.
    """
    return " ".join(" ".join(words) for words in datum_sentences)


In [48]:
def get_arguments(article, model="gpt-3.5-turbo-0613"):
    """Ask the chat model to list the main characters of a news article.

    Args:
        article: Text of the article, sent as the user message.
        model: Chat model name forwarded to `call_gpt`.

    Returns:
        Whatever `call_gpt` returns for the prompt; the system prompt requests the
        format '[character 1] [character 2]...'.
    """
    messages = [
        {
            "role": "system",
            "content": """
                You are an extraction system that extracts the main characters of a news article.
                The main characters can be any organization, person or location that are heavily involved in the event described by the news article.
                The user will provide you with a news article to extract.
                Reply in the format '[character 1] [character 2]...'
            """
        },
        {
            "role": "user", "content": article
        }
    ]
    # Forward `model`: it used to be accepted but silently ignored.
    return call_gpt(messages, model=model)

In [49]:
def extract_events(sentence, model="gpt-3.5-turbo-0613"):
    """Ask the chat model to extract events (a trigger plus participants) from a sentence.

    The prompt consists of a system instruction, one worked example (an input and its
    expected answer, both sent as named system messages) and finally `sentence` as the
    user message.

    Args:
        sentence: Text to extract events from.
        model: Chat model name forwarded to `call_gpt`; defaults to the model that was
            previously always used.

    Returns:
        Whatever `call_gpt` returns; the prompt requests one event per line in the form
        '[trigger1], [participant 1], [participant 2], ...'.
    """
    example_input = "Trump's inability to work with people beyond his base, as demonstrated by his comparison to Saddam Hussein's Iraq, is a major problem for the United States, as it requires the president to build bridges and form alliances in order to get things done."
    example_output = "Problem, Trump, United States; \n Inable, Trump, work with, people beyond his base; \n Compare, Trump, Saddam Hussein's Iraq; \n Require, president, build bridges and form alliances;"
    messages = [
        {
            "role": "system",
            "content": """
                You are an event extraction system. Please extract the events from user provided sentence.
                An 'event' should contain one or more 'participants', which are the major participants in the event,
                and a 'trigger', which is a verb that describes what happens between the participants.
                The triggers and participants should be human-readable.
                Reply with each line being an event in the format:
                [trigger1], [participant 1], [participant 2], ...
            """
        },
        { "role": "system", "name": "example_user", "content": example_input},
        { "role": "system", "name": "example_system", "content": example_output},
        { "role": "user", "content": sentence}
    ]
    return call_gpt(messages, model=model)

In [50]:
def extract_events1(sentence,event_role, model="gpt-3.5-turbo-0613"):
    """Ask the chat model to extract events and classify them against `event_role`.

    The prompt is built from: two system instructions, a user message embedding
    `event_role` via an f-string, a system message describing the reply format, one
    worked example (input and expected answer as named system messages), and finally
    `sentence` as the user message.

    Args:
        sentence: Text to extract events from.
        event_role: Object interpolated into the prompt as the "input dictionary".
            NOTE(review): the prompt describes it as event categories mapped to
            participant types -- confirm against the file it is loaded from.
        model: Chat model name forwarded to `call_gpt`; defaults to the model that was
            previously always used.

    Returns:
        Whatever `call_gpt` returns for the prompt.
    """
    example_input = "The fixture anticipated by fans as the marquee game of the second day of Euro 2016 ended in a 1 - 1 tie . In chaotic scenes involving hundreds of fans throngs of rival supporters rushed at one another hurling bottles chairs and other objects and forcing police in riot gear to fire tear gas in response . Fans skirmish ahead of the match in the French port city of Marseille on Saturday . Distressing footage emerged of men kicking and stomping on another person lying on the street in broad daylight as the rivalry turned violent prior to kick - off . Then at the end of the game Russian fans charged at the section of the stadium containing England supporters and more chaos ensued."
    example_output = "[conflict:attack - hurling], [attacker-fans],[target-fans],[instrument- bottles chairs and other objects],[place- Marseille]; \n [conflict:attack-fire] , [attacker-police],[target-fans],[instrument-tear gas],[place- Marseille];"
    messages = [
        {
            "role": "system",
            "content": """
                You are an event extraction system. Your task is to extract events from news articles provided by the user. 
                An 'event' should contain one or more 'participants', which are the major participants in the event,
                and a 'trigger' is a verb that describes what kind of event happens between the participants. There are categories of events, and each trigger recognized is classified as one of the categories.
            """
        },
        {
            "role": "system",
            "content": """ The user will provide a dictionary defining 'event category' as 'keys' and the 'types of participants' that can be involved in that event by 'values' of those keys.
                The format of the input dictionary will be like "['event type 1':['participant type 1','participant type 2', ...]], ['event type 2':['participant type 1','participant type 2', ...]], ...".
                Use this input to classify triggers and participants. 
                Classify the triggers into one of the categories specified by the dictionary strictly, if none of them apply answer unknown. 
                Also classify the type of participants strictly according to the user input.
            """
        },
        { "role": "user", "content": f"This is the input dictionary:{event_role}"},
        {
            "role": "system",
            "content": """Now extract the triggers involved in the articles provided by the user and classify them into their event types.
                The triggers and participants should be human-readable. 
                Reply with each line being an event in the format:
                [event type 1 - trigger 1],[participant type 1 : participant 1], [participant type 2 : participant 2], ...
            """
        },
        { "role": "system", "name": "example_user", "content": example_input},
        { "role": "system", "name": "example_system", "content": example_output},
        { "role": "user", "content": sentence}
    ]
    return call_gpt(messages, model=model)


In [52]:
def summarize_sentence(article, arguments, model="gpt-3.5-turbo-0613"):
    """Ask the chat model for a short summary of an article focused on its main characters.

    Args:
        article: Text of the article.
        arguments: Main characters, inserted verbatim under "Main Characters:" in the
            user message.
        model: Chat model name forwarded to `call_gpt`.

    Returns:
        Whatever `call_gpt` returns; the system prompt asks for at most three sentences
        starting with 'The article discussed ...'.
    """
    messages = [
        {
            "role": "system",
            "content": """
                You are an summarization system that summarizes the events that happened between the main characters of a news article.
                The user will provide you with a list of main characters and a news article to summarize.
                Try to summarize the article with no more than three sentences. 
                Reply starts with 'The article discussed ...'
            """
        },
        {
            "role": "user", "content": "Main Characters:\n{arguments} \n\n\n Article: {article}".format(arguments=arguments, article=article)
        }
    ]
    # Forward `model`: it used to be accepted but silently ignored.
    return call_gpt(messages, model=model)

In [53]:
def strip_sentence(sentence):
    """Remove the leading 'The article discussed [how]' boilerplate from a summary.

    Only the leading occurrence is removed; later repetitions of the phrase inside the
    text are kept. If `sentence` does not start with the expected phrase, a warning is
    printed and the sentence is returned with surrounding whitespace stripped (this
    case used to raise UnboundLocalError).

    Args:
        sentence: Summary text.

    Returns:
        The summary without the leading phrase, stripped of surrounding whitespace.
    """
    # The longer prefix must be tested first because the shorter one is its prefix.
    for prefix in ('The article discussed how', 'The article discussed'):
        if sentence.startswith(prefix):
            return sentence[len(prefix):].strip()
    print("!!! summary does not start with 'The article discussed': {!r}".format(sentence))
    return sentence.strip()

In [54]:
# Summarise every sampled article: extract its main characters, then a short summary.
with open(r'D:/Projects/Test/All the News/data/processed_data/2016_10p.json') as sampled_file:
    dataset = json.load(sampled_file)

# Columns copied unchanged from each input record, in output order.
COPIED_FIELDS = ('id', 'content', 'title', 'publication', 'author', 'url', 'date')

saved_dataset = []
count = 0
for datum in dataset:
    count += 1
    print("{}/{}".format(count, len(dataset)))
    try:
        article = datum['content']
        arguments = get_arguments(article)
        sentence = summarize_sentence(article, arguments)
        saved_datum = {field: datum[field] for field in COPIED_FIELDS}
        saved_datum['summary'] = sentence
    except Exception as error:
        # Best effort: skip this article, but say why instead of hiding the failure.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("skipping article {!r}: {!r}".format(datum.get('id'), error))
        continue
    saved_dataset.append(saved_datum)
save_json(saved_dataset, r'D:/Projects/Test/All the News/data/summarized/summary.json')

1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
20/20


In [59]:
# Load the event-role definitions; the context manager closes the file once it is parsed.
with open('D:/Projects/OneIE/resource/valid_patterns_rams/event_role.json') as f:
    event_role = json.load(f)

In [60]:
# AllTheNews
# AllTheNews
with open(r'D:/Projects/Test/All the News/data/summarized/summary.json') as summary_file:
    AllTheNews_summarized = json.load(summary_file)

res_events = []
error_datum = []  # records whose event extraction failed
for index, datum in enumerate(AllTheNews_summarized):
    print('{}/{}'.format(index, len(AllTheNews_summarized)))
    try:
        sentence = strip_sentence(datum['summary'])
        events = extract_events1(sentence,event_role)
    except Exception as error:
        # Keep going so one failed request does not discard the results gathered so far;
        # the failing record is kept in `error_datum` for inspection.
        print('event extraction failed for record {}: {!r}'.format(index, error))
        error_datum.append(datum)
        continue
    datum['events'] = events
    res_events.append(datum)
save_json(res_events, r'D:/Projects/Test/All the News/events/events1.json')

0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20


In [61]:
from string import punctuation
def _split_typed_argument(argument):
    """Split one cleaned 'role-value' (or 'role : value') argument into (role, value)."""
    if '-' in argument:
        role, *value_parts = argument.split('-')
        # Hyphens inside the value are turned into spaces.
        return role.strip(), ' '.join(value_parts).strip()
    if ':' in argument:
        # Fallback for arguments written with ':' instead of '-'.
        role, _, value = argument.partition(':')
        return role.strip(), value.strip()
    return argument.strip(), ''


def _parse_event_line(event_line):
    """Parse one reply line into an event dict, or return None when the line holds no content."""
    # Drop surrounding whitespace and punctuation (brackets, ';', ...) from every
    # comma-separated component instead of slicing characters off blindly.
    components = [part.strip().strip(punctuation).strip() for part in event_line.split(',')]
    if not any(components):
        return None
    # The first component is 'event type - trigger'; split on the first hyphen only so
    # hyphenated triggers stay intact and a missing trigger yields ''.
    event_type, _, trigger = components[0].partition('-')
    arguments = []
    for argument in components[1:]:
        if not argument:
            continue
        role, value = _split_typed_argument(argument)
        arguments.append({role: value})
    return {'event_type': event_type.strip(), 'trigger': trigger.strip(), 'arguments': arguments}


def post_process_events(dataset):
    """Turn each record's raw event text into a list of structured events, in place.

    For every record, `doc_id` is set to its position in `dataset`, the original
    `events` string is preserved under `events_raw`, and `events` is replaced by a list
    of dicts with keys 'event_type', 'trigger' and 'arguments' (a list of single-entry
    {role: value} dicts). Each line of the raw text is read as
    '[event type - trigger], [role-value], [role-value], ...;'.

    Blank lines are skipped and lines without a '-' in their first component produce an
    empty trigger; both cases used to raise IndexError.

    Args:
        dataset: List of dicts, each holding the raw event text under 'events'.

    Returns:
        The same `dataset` list, with its records mutated.
    """
    for index, datum in enumerate(dataset):
        datum['doc_id'] = index
        datum['events_raw'] = datum['events']
        parsed_events = (_parse_event_line(line) for line in datum['events'].split('\n'))
        datum['events'] = [event for event in parsed_events if event is not None]
    return dataset

# Load the extracted events, structure them, and write the final result file.
with open(r'D:/Projects/Test/All the News/events/events1.json') as events_file:
    dataset = json.load(events_file)
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'D:/Projects/Test/All the News/result/result.json')
