In [2]:
import json
import numpy as np
from collections import defaultdict
import openai
import re
#csv.field_size_limit(sys.maxsize)

In [3]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [4]:
api_key = open("api_key").read()
openai.api_key = api_key

In [5]:
def call_gpt(messages, model="gpt-3.5-turbo-16k-0613"):
    print(messages)
    completions = openai.ChatCompletion.create(
        model=model,
        messages=messages)
    gpt_response = completions['choices'][0]['message']['content'].strip() 
    return gpt_response   

In [None]:
def merge_sentences(datum_sentences):
    sentence_list = [" ".join(sentence_word_list) for sentence_word_list in datum_sentences] # merge the words into sentences
    paragraph = " ".join(sentence_list)
    return paragraph

In [None]:
def get_arguments(article):
    messages = [ 
        {
            "role": "system", 
            "content": """
                You are an extraction system that extracts the main characters of a news article.
                The main characters can be any organization, person or location that are heavily involved in the event described by the news article.
                The user will provide you with a news article to extract.
                Reply in the format '[character 1] [character 2]...'
            """
        },
        {
            "role": "user", "content": article
        } 
    ]
    arguments = call_gpt(messages)

In [None]:
def extract_event(sentence):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are an event extraction system. Your task is to extract only the most important event from news articles. 
                Strictly extract only one event. This event should be the most important event in the article.
                Also extract the category of the event.
                The event and category should be human-readable. 
                Reply in this format: 
                [event - category]
                Do not reply more than a line. The news article will be provided by the user.
            """
        },
        { "role": "system", "name": "example_user", "content": "The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history."},
        { "role": "system", "name": "example_system", "content": "Russia's doping scandal - sports scandal"},        
        { "role": "user", "content": sentence}
    ]
    events = call_gpt(messages)
    return events

In [None]:
def extract_participants(sentence, event):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a named entity recognition model. You will be given an article and the event recognized in that article by the user.
                The format of the input defining event in article will be:
                [event - category];
                Extract the main participants involved in that event.
                The number of main participants should be 2 or less strictly. 
                Ignore any other participants other than the two main participants.
                Be as concise as possible.
                Reply with the following format:
                [main participant 1],[main participant 2];
            """
        },   
        { "role": "system", "name": "example_user", "content": """
        Article: \n The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history.
        Event: \n [Russia's doping scandal - sports scandal]
        """},
        { "role": "system", "name": "example_system", "content": "[Russia],[Dr. Sergei Portugalov];"},   
        { "role": "user", "content": f"Article: \n {sentence} \n Event: \n {event}".format(sentence, event)},
    ]
    participants = call_gpt(messages)
    return participants

In [None]:
def merged_all(sentence, event, participants):
    messages = [
        { 
            "role": "system", 
            "content": """
                You will be given an article and the event graph of one event in that article by the user.
                The format of the event graph will be:
                [event - trigger],[main participant 1],[main participant 2];
                Arguments are participants or elements that play specific roles within the context of an event.
                Extract the main arguments and their roles involved in each event.                
                Reply in JSON format with each line being an event in the format:
                [event - trigger],[main participant 1],[main participant 2]:[argument type 1 - argument 1],[argument type 2 - argument 2],...; 
            """
        },   
        { "role": "system", "name": "example_user", "content": "The article discussed the extensive history of doping in Russia, dating back to the 1983 Soviet Union's detailed instructions to inject top athletes with anabolic steroids in order to ensure dominance at the Los Angeles Olympics. Dr. Sergei Portugalov, a key figure in Russia's current doping scandal, was named as the mastermind behind the doping program. The revelations of these schemes led to the banning of Russia's track and field team from the Rio Games, the most severe doping penalty in Olympic history."},
        { "role": "system", "name": "example_system", "content": "[scandal-Russia's current doping scandal]: [Russia],[Dr. Sergei Portugalov]:[country - Russia],[mastermind - Dr. Sergei Portugalov],[affected team - track and field team],[event - Rio Games];\n"},   
        { "role": "user", "content": f"This is the input defining the extracted event and the main participants, strictly use only this event for further tasks  :{cl}"},  
        { "role": "user", "content": f"This is the news article:{sentence}"},
    ]
    events = call_gpt(messages)
    return events

In [None]:
def summarize_sentence(article, arguments, model="gpt-3.5-turbo-0613"):
    messages = [ 
        {
            "role": "system", 
            "content": """
                You are an summarization system that summarizes the events that happened between the main characters of a news article.
                The user will provide you with a list of main characters and a news article to summarize.
                Try to summarize the article with no more than three sentences. 
                Reply starts with 'The article discussed ...'
            """
        },
        {
            "role": "user", "content": "Main Characters:\n{arguments} \n\n\n Article: {article}".format(arguments=arguments, article=article)
        } 
    ]
    sentence = call_gpt(messages)
    return sentence

In [None]:
from string import punctuation
def strip_sentence(sentence):
    if sentence.startswith('The article discussed how'):
        stripped_sentence = sentence.replace('The article discussed how', '').strip()
        # stripped_sentence = re.sub(",","",stripped_sentence)
    elif sentence.startswith('The article discussed'):
        stripped_sentence = sentence.replace('The article discussed', '').strip()
        # stripped_sentence = re.sub(",","",stripped_sentence)
    else:
        stripped_sentence = sentence.strip()
    return stripped_sentence
def strip_gpt_response(res):
    return res.strip(punctuation)

In [None]:
AllTheNews_summarized = json.load(open(r'data/raw/AllTheNews/summarized/2016_10p.json'))
AllTheNews_extracted = json.load(open(r'data/raw/AllTheNews/events/2016_10p_0819.json'))
print(len(AllTheNews_summarized), len(AllTheNews_extracted))

In [None]:
import string
AllTheNews_summarized = json.load(open(r'data/raw/AllTheNews/summarized/2016_10p.json'))
AllTheNews_extracted = json.load(open(r'data/raw/AllTheNews/events/2016_10p_0819.json'))
extracted_ids = [datum['id'] for datum in AllTheNews_extracted]
error_datum = []
for index, datum in enumerate(AllTheNews_summarized):
    if datum['id'] in extracted_ids: continue
    print('{}/{}: {}'.format(index, len(AllTheNews_summarized), datum['id']))
    sentence = strip_sentence(datum['summary'])
    # gpt 
    event = extract_event(sentence)
    if "\n" in event:
        event = event.split("\n")[0]
    print(event)
    participants = extract_participants(sentence, event).split(",")
    # strip
    stripped = strip_gpt_response(event)
    if len(stripped.split(" - ")) == 1:
        event = stripped  
        event_type = "None"
    else:
        event, event_type = stripped.split(" - ")
    participants = list(map(lambda x: strip_gpt_response(x), participants))
    datum['events'] = {
        "title": event.strip(),
        "type": event_type.strip(),
        "participants": participants
    }
    print("------------------------------------------")
    AllTheNews_extracted.append(datum)
    save_json(AllTheNews_extracted, r'data/raw/AllTheNews//events/2016_10p_0819.json')

In [None]:
print(len(AllTheNews_summarized), len(AllTheNews_extracted))

In [None]:
from string import punctuation
def post_process_events(dataset):
    for index, datum in enumerate(dataset):
        datum['doc_id'] = index
        datum['events_raw'] = datum['events']
        events_str = datum['events'].split('\n')
        events = []
        for event_str in events_str:
            arguments=[]
            main_characters = []
            event_str = event_str.strip()
            components = event_str.split(':')
            event_type_raw = components[0].split('-')
            # event_type_p=event_type_raw[0].split('-')
            event_type = event_type_raw[0][1:]
            trigger = event_type_raw[1:2]
            trigger=' '.join([str(elem) for elem in trigger]).strip().strip(punctuation)
            if(trigger==""):
                trigger=event_type
            chars = components[1].split(',')
            # arguments_raw = [arg.strip().strip(punctuation) for arg in components[1:]]
            arguments_raw = components[2:]
            # chars = re.sub(",","",chars)
            for dat in chars:
                dat=re.sub(",","",dat)
                # print(dat)
                main_characters.append(dat)
            for args in arguments_raw:
                # args = re.sub(",","",args)
                temp = args.split('],[')
                for arg in temp:
                    # print(arg)
                    # arg=re.sub(",","",arg)
                    arg_raw=arg.split('-')
                    arg_type=arg_raw[0].strip()
                    # arg_type=re.sub(",","",arg_type)
                    args = arg_raw[1:]
                    args_final=' '.join([str(elem) for elem in args]).strip().strip(punctuation)
                    # args_final=re.sub(",","",args_final)
                    arg_type = str(arg_type).replace('[', '').replace(']', '')
                    arguments.append({arg_type:args_final})
            events.append({'event_type':event_type,'trigger': trigger, 'Main Participants': main_characters, 'Arguments': arguments})
        datum['events'] = events
    return dataset

dataset = json.load(open(r'../All the News/events/merged_test_1.json'))
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'../All the News/result/merged_test_1.json')


In [1]:
# VisPub
import csv
raw_vispub =  csv.DictReader(open('data/raw/VisPub/IEEE VIS papers 1990-2022 - Main dataset-2.csv'))
articles = [article for article in raw_vispub]

In [2]:
from collections import defaultdict
keyword_count = defaultdict(list)
filtered_articles = []
for index, article in enumerate(articles):
    abstract = article['Abstract']
    keywords = article['AuthorKeywords']
    if abstract == "" or keywords == "": continue
    article['id'] = index
    for keyword in keywords.split(","):
        if keyword == "": continue
        keyword_count[keyword.strip().lower()].append(index) 
    filtered_articles.append(article)
# save_json(filtered_articles, r'data/raw/VisPub/filtered_articles.json')

In [3]:
print(len(filtered_articles))

2632


In [17]:
# filtered_keywords = {k: v for k, v in keyword_count.items() if len(v) > 1}
sorted_keywords = {k: v for k, v in sorted(keyword_count.items(), key=lambda item: len(item[1]))}


In [18]:
print(len(sorted_keywords))
print(sorted_keywords)


6043
{'photosensitive epilepsy': [0], 'photosensitivity': [0], 'federated learning': [1], 'data heterogeneity': [1], 'self-service data transformation': [2], 'programming by example': [2], 'aesthetic pleasure': [3], 'validated scale': [3], 'scale development': [3], 'visual representations': [3], 'neural network architecture search': [4], 'theoretical and empirical research': [5], 'confidence intervals': [6], 'hierarchical tabular data': [7], 'music mood classification': [8], 'physicalization': [9], 'kirigami': [9], 'visual representation design': [9], 'art & graphic design': [9], 'deficit thinking': [11], 'fundamental attribution error': [11], 'correspondence bias': [11], 'equity': [11], 'augmented merge tree': [12], 'beliefs': [14], 'motivated perception': [14], 'tracking & transformation': [15], 'dendrograms': [16], 'cyber-physical networks': [16], 'node-based visualization': [17], 'basketball tracking data': [18], 'off-ball movement analysis': [18], 'high dimensional data visualizat

In [15]:
save_json(sorted_keywords, r'data/raw/VisPub/keywords.json')


In [8]:
keyword_w_explanation = json.load(open(r'data/raw/VisPub/keywords_w_explanation.json'))
keywords = json.load(open(r'data/raw/VisPub/keywords.json'))
print(len(keyword_w_explanation), len(keywords))
res = {}
for k1, v1 in keyword_w_explanation.items():
    if k1 in keywords:
        res[k1] = v1
save_json(res, r'data/raw/VisPub/keywords_w_explanation_filtered.json')

1436 1434


In [9]:
len(res)

1434

In [16]:
articles = json.load(open(r'data/raw/VisPub/filtered_articles.json'))
article_dict = {article['id']: article for article in articles}
print(len(sorted_keywords), sorted_keywords)
for keyword, article_ids in sorted_keywords.items():
    for article_id in article_ids:
        if 'keyword' not in article_dict[article_id]: article_dict[article_id]['keyword'] = []
        article_dict[article_id]['keyword'].append(keyword)
save_json(list(article_dict.values()), r'data/raw/VisPub/articles_w_keywords.json')



1434 {'visual analytics': [4, 5, 27, 36, 37, 55, 56, 68, 96, 100, 102, 117, 138, 140, 149, 152, 154, 157, 175, 193, 197, 204, 226, 228, 243, 245, 246, 256, 261, 280, 290, 299, 301, 307, 308, 317, 326, 328, 331, 335, 350, 354, 364, 369, 372, 375, 386, 387, 402, 403, 406, 410, 420, 421, 435, 438, 439, 441, 443, 448, 464, 474, 483, 484, 486, 488, 489, 490, 493, 497, 511, 518, 519, 521, 522, 525, 528, 549, 555, 564, 582, 605, 612, 619, 620, 639, 642, 645, 650, 654, 655, 656, 662, 663, 664, 665, 666, 667, 668, 673, 678, 681, 688, 709, 724, 726, 728, 731, 740, 746, 753, 873, 874, 875, 885, 888, 891, 905, 909, 923, 929, 961, 977, 983, 989, 991, 1009, 1032, 1041, 1042, 1043, 1048, 1062, 1067, 1069, 1070, 1074, 1087, 1089, 1098, 1099, 1105, 1112, 1116, 1118, 1122, 1125, 1126, 1134, 1139, 1171, 1177, 1181, 1185, 1188, 1205, 1208, 1216, 1221, 1231, 1241, 1246, 1258, 1271, 1279, 1280, 1306, 1320, 1323, 1329, 1332, 1350, 1360, 1401, 1418, 1440, 1472, 1479, 1497, 1498, 1504, 1508, 1511, 1536, 1556, 

In [11]:
article_w_keywords = json.load(open('data/raw/VisPub/articles_w_keywords.json'))
count = 0
for article in article_w_keywords:
    # if article['AuthorKeywords'] == "" or article['Abstract'] == "":
    #     count += 1
    if 'keyword' not in article:
        count += 1
        print(article['id'], article['AuthorKeywords'])
print(count, len(article_w_keywords))

32 Data visualization literacy,children,constructionism,informal learning
34 pedagogy,final project,game interfaces
46 Color-concept association,colorization,EMD
47 Guidance Theory,Guidance Implementation
75 Interaction Recommendation,Visualization for public education,Mixed-initiative Exploration
92 High-dimensional filtering,multivariate filtering,output-sensitivity,multivariate attribute queries,progressive culling
98 Haptic Feedback,Human Centred Interaction,Robotic Arm
105 Pictorial visualization,data-driven design
124 accessible visualization,assistive technologies,alternative text for graphics
135 Visualization Tools,Visualization Recommendation Algorithms
136 Visual causal analysis,urban time series,causal graph analysis
156 Data comics,Non-linear narrative,interactive storytelling
159 Explanatory Visualisation,Administrative Justice,Law,Law Visualisation
183 Disease Progression,State Identification,Sequence Visualization
206 Analyzing Counterexamples,Hyperproperties,Multiple C