In [1]:
import json
import requests
import csv
import jsonlines
from flask import Flask, redirect, render_template, request, url_for
import sys
import numpy as np
from collections import defaultdict
import random
import math
import openai
import re
import pandas as pd
import os
from numpy.linalg import norm
import itertools
import string
import torch
from transformers import BertTokenizer, BertModel
import logging

#csv.field_size_limit(sys.maxsize)

In [2]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [3]:
openai.api_key = os.environ['openai_api_key']

In [4]:
data = csv.DictReader(open("../IEEE_papers/Raw_data/IEEE VIS papers 1990-2022 - Main dataset.csv"))

In [5]:
dataset = [datum for datum in data]
# dataset = dataset[0:20]

In [6]:
save_json(dataset, r'../IEEE_papers/processed_data/processed_data.json')

Checking frequency of keywords in Author keywords section of research papers

In [7]:
dat = json.load(open(r'../IEEE_papers/processed_data/processed_data.json'))
freq = {}
for datum in dat:
    keywords_raw = datum['AuthorKeywords'].strip()
    keywords = keywords_raw.split(',')
    for keys in keywords:
        keys.strip()
        if (keys in freq):
            freq[keys] += 1
        else:
            freq[keys] = 1

In [9]:
def call_gpt(messages, model="gpt-3.5-turbo-0613"):
    completions = openai.ChatCompletion.create(
        model=model,
        n=1,
        stop=None,
        temperature = 0.0,
        messages=messages)
    gpt_response = completions['choices'][0]['message']['content'].strip() 
    return gpt_response   

Prompt for extracting triggers from abstract

In [10]:
def extract_events(sentence):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a state of the art event extraction system. 
                Your task is to extract only the most important event that describes the main research idea from Research paper abstract.
                Strictly extract only one event. This event should be the major research focus of the abstract.
                The events should be human-readable. 
                Reply in JSON format with each line being an event in the format:
                [event];
                The abstract of research papers will be provided by the user.
            """
        },
        { "role": "system", "name": "example_user", "content": "The success of DL can be attributed to hours of parameter and architecture tuning by human experts. Neural Architecture Search (NAS) techniques aim to solve this problem by automating the search procedure for DNN architectures making it possible for non-experts to work with DNNs. Specifically, One-shot NAS techniques have recently gained popularity as they are known to reduce the search time for NAS techniques. One-Shot NAS works by training a large template network through parameter sharing which includes all the candidate NNs. This is followed by applying a procedure to rank its components through evaluating the possible candidate architectures chosen randomly. However, as these search models become increasingly powerful and diverse, they become harder to understand. Consequently, even though the search results work well, it is hard to identify search biases and control the search progression, hence a need for explainability and human-in-the-loop (HIL) One-Shot NAS. To alleviate these problems, we present NAS-Navigator, a visual analytics (VA) system aiming to solve three problems with One-Shot NAS; explainability, HIL design, and performance improvements compared to existing state-of-the-art (SOTA) techniques. NAS-Navigator gives full control of NAS back in the hands of the users while still keeping the perks of automated search, thus assisting non-expert users. Analysts can use their domain knowledge aided by cues from the interface to guide the search. Evaluation results confirm the performance of our improved One-Shot NAS algorithm is comparable to other SOTA techniques. While adding Visual Analytics (VA) using NAS-Navigator shows further improvements in search time and performance. We designed our interface in collaboration with several deep learning researchers and evaluated NAS-Navigator through a control experiment and expert interviews."},
        { "role": "system", "name": "example_system", "content": "[automating the search procedure for DNN architectures using Neural Architecture Search (NAS) techniques];"},         
        { "role": "user", "content": f"This is the research paper abstract:{sentence}"},
    ]
    events = call_gpt(messages)
    return events

In [11]:
def map_entities_abstract(word1, word2,abs):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a state of the art classification model. 
                You will be given a pair of 2 keywords and the research paper abstract containing the keywords as user input.
                Your task is to assign a broad and generic research topic that can represent both the words.
                The topic should not be too specific and should be less than 3 words.
                The topic should be very similar to the concept related to the given keywords.
                Strictly assign one topic to a pair of keywords.
                Remember what type was assigned to each keyword for future references.
                Multiple keywords can belong to one topic. 
                Reply in the format:
                research topic.
            """
        },
        # { "role": "system", "name": "example_user", "content": "[Theory , Theoretical and Empirical Research]"},
        # { "role": "system", "name": "example_system", "content": "[Theoretical Research - Theory , Theoretical and Empirical Research];"},    
        { "role": "user", "content": f"This is the first keyword:{word1}"},   
        { "role": "user", "content": f"This is the second keyword:{word2}"},
        { "role": "user", "content": f"This is the abstract:{abs}"}    
    ]
    events = call_gpt(messages)
    return events

In [12]:
def remove_keywords(keywords):
    messages = [
        { 
            "role": "system", 
            "content": """
                User will provide a list of keywords from research paper abstracts majorly talking about the same concept.
                Remove the keywords from the list that do not belong to the same research area or concept.
                Reply in the format:
                "key word 1, key word 2, ...";
            """
        },
        { "role": "system", "name": "example_user", "content": "[Data Storytelling,Deep Learning,ensemble learning,Tracking   Transformation,Motivated Perception]"},
        { "role": "system", "name": "example_system", "content": "Deep Learning,ensemble learning"},    
        { "role": "user", "content": f"This is the list of keywords:{keywords}"},   
        # { "role": "user", "content": f"This is the abstract:{abs}"}    
    ]
    events = call_gpt(messages)
    return events

Prompt used for openAI embeddings

In [13]:
def map_listEntities(keywords):
    messages = [
        { 
            "role": "system", 
            "content": """ 
                User will provide a list of keywords from research paper abstracts.
                The input will be in the format:
                key word 1, key word 2,...
                Which phrase would best describe the list of keywords.
                The phrase should be very specific and similar to the keywords and less than 5 words.
                If the words are too similar to each other, simply assign one of the words as the topic.
                Strictly assign one topic to a list of keywords.
                Reply in the format:
                research topic;
            """
        },
        { "role": "system", "name": "example_user", "content": "[dashboards,dashboard]"},
        { "role": "system", "name": "example_system", "content": "dashboards"},    
        { "role": "user", "content": f"This is the list of keyword:{keywords}"},      
    ]
    events = call_gpt(messages)
    return events

In [14]:
def map_pairEntities(word1, word2):
    messages = [
        { 
            "role": "system", 
            "content": """
                You are a state of the art classification model. 
                You will be given 2 keywords from a research paper abstract as user input.
                Your task is to assign a generic research topic that can represent both the words.
                The topic should not be too specific and should be less than 3 words.
                The topic should be majorly similar to the 2 given keywords.
                Strictly assign one topic to a pair of keywords.
                Remember what type was assigned to each keyword for future references.
                Multiple keywords can belong to one topic. 
                Reply in the format:
                research topic
            """
        },
        # { "role": "system", "name": "example_user", "content": "[Theory , Theoretical and Empirical Research]"},
        # { "role": "system", "name": "example_system", "content": "[Theoretical Research - Theory , Theoretical and Empirical Research];"},    
        { "role": "user", "content": f"This is the first keyword:{word1}"},   
        { "role": "user", "content": f"This is the second keyword:{word2}"},
        # { "role": "user", "content": f"This is the abstract:{abs}"}    
    ]
    events = call_gpt(messages)
    return events

In [15]:
def merge_sentences(datum_sentences):
    sentence_list = [" ".join(sentence_word_list) for sentence_word_list in datum_sentences] # merge the words into sentences
    paragraph = " ".join(sentence_list)
    return paragraph

In [16]:
def strip_sentence(sentence):
    if sentence.startswith('The article discussed how'):
        stripped_sentence = sentence.replace('The article discussed how', '').strip()
        stripped_sentence = re.sub(",","",stripped_sentence)
    elif sentence.startswith('The article discussed'):
        stripped_sentence = sentence.replace('The article discussed', '').strip()
        stripped_sentence = re.sub(",","",stripped_sentence)
    else:
        print("!!!")
    return stripped_sentence

Extract Triggers using gpt api

In [18]:
dat = json.load(open(r'../IEEE_papers/processed_data/processed_data.json'))
res_events = []
error_datum = []
for index, datum in enumerate(dat):
    print('{}/{}'.format(index, len(dat)))
    sentence = datum['Abstract'].strip()
    sentence = re.sub("-"," ",sentence)
    cl = extract_events(sentence)
    datum['events'] = cl
    res_events.append(datum)
save_json(res_events, r'../IEEE_papers/Events/events_merged2.json')

0/3620


1/3620


KeyboardInterrupt: 

Mapping

In [46]:
dat = json.load(open(r'../IEEE_papers/processed_data/processed_data.json'))
res_events = []
error_datum = []
for index, datum in enumerate(keys):
    print('{}/{}'.format(index, len(keys)))
    sentence = datum['Abstract'].strip()
    keywords = datum['AuthorKeywords'].strip()
    sentence = re.sub("-"," ",sentence)
    cl = map_entities_abstract(keys)
    # datum['events'] = cl
res_events.append(cl)
save_json(res_events, r'../IEEE_papers/Events/mapped.json')

In [17]:
def replace_punctuation_with_whitespace(input_string):
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return input_string.translate(translator)

In [18]:
def get_top_n_values(dictionary, n):
    sorted_items = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    top_n_items = sorted_items[:n]
    return dict(top_n_items)

In [19]:
def calculate_cosine(a,b):
    cosine = np.dot(a,b)/(norm(a)*norm(b))
    return cosine

Generating openAI embeddings for keywords

In [26]:
chars = json.load(open(r'../IEEE_papers/Result/main_participants.json'))
chars = chars[0:100]
flat_list = [item.strip() for sublist in chars for item in sublist]
flat_list = [replace_punctuation_with_whitespace(flat_list[i]) for i in range(len(flat_list))]
flat_list = [i.lower().strip() for i in flat_list]
flat_list = [re.sub(' +',' ',i) for i in flat_list]
while("" in flat_list):
    flat_list.remove("")
save_json(flat_list, r'../IEEE_papers/Result/flat_list.json')


In [21]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [56]:
openAI_embedds = {}
for i in flat_list:
    embed = get_embedding(i)
    openAI_embedds[i] = embed

save_json(openAI_embedds, r'../IEEE_papers/Result/openAI_embeddings_2.json')

photosensitivity
$$
photosensitive epilepsy
$$
accessibility
$$
federated learning
$$
data heterogeneity
$$
cluster analysis
$$
declarative specification
$$
self service data transformation
$$
programming by example
$$
aesthetic pleasure
$$
aesthetics
$$
validated scale
$$
explainability
$$
neural network architecture search
$$
deep learning
$$
theory
$$
theoretical and empirical research
$$
qualitative study
$$
confidence intervals
$$
bar charts
$$
uncertainty
$$
hierarchical tabular data
$$
tabular visualization
$$
tabular data
$$
music mood classification
$$
ensemble learning
$$
time series visualization
$$
kirigami
$$
physicalization
$$
aesthetics
$$
traces
$$
parallel computing
$$
event sequence visualization
$$
equity
$$
deficit thinking
$$
storytelling
$$
augmented merge tree
$$
scalar field visualization
$$
pixel based visualization
$$
gaussian mixture models
$$
ray casting
$$
scientific visualization
$$
beliefs
$$
cognition
$$
motivated perception
$$
mathematics
$$
physical en

Generate similar words mapping fop openAI embeddings

In [78]:
openAI_embedds = json.load(open(r'../IEEE_papers/Result/openAI_embeddings.json'))
cosine = json.load(open(r'../IEEE_papers/Result/cosine.json'))
similarity_dict = {}
check=[]
for i in range(len(flat_list)):
    similar = []
    res= []
    if (flat_list[i] not in check):
        similar.append(flat_list[i])
        check.append(flat_list[i])
        avg_score = 0.0
    for j in range(i+1,len(flat_list)):
        score = calculate_cosine(openAI_embedds[flat_list[i]],openAI_embedds[flat_list[j]])
        if (score>0.941 and flat_list[j] not in check or flat_list[i] not in check):
            similar.append(flat_list[j])
            check.append(flat_list[j])
            # print([similar,"^^^^^^^^^^^"])
            res = [calculate_cosine(openAI_embedds[a], openAI_embedds[b]) for idx, a in enumerate(similar) for b in similar[idx + 1:]]
            print([res,"@@@"])
            avg_score = np.mean(res,dtype=np.float64)
            if(avg_score<0.941):
                print([flat_list[j],"()()()()()"])
                similar.remove(flat_list[j])
            # print(similar)

    similar = list(set(similar))
    similar = ",".join(similar)
    check = list(set(check))
    new_list = list(similar.split(","))
    if(len(new_list)>1):
            mapping = map_listEntities(new_list)
            similarity_dict[similar] = mapping.lower()
save_json(similarity_dict, r'../IEEE_papers/Result/similarity_dict_2.json')

[['declarative specification', 'declarative grammar'], '^^^^^^^^^^^']
[[0.9453816242137059], '@@@']
0.9453816242137059
['declarative specification', 'declarative grammar']
$$$
[['hierarchical tabular data', 'tabular data'], '^^^^^^^^^^^']
[[0.94971870809219], '@@@']
0.94971870809219
['hierarchical tabular data', 'tabular data']
$$$
[['time series visualization', 'multivariate time series'], '^^^^^^^^^^^']
[[0.9570334227312306], '@@@']
0.9570334227312306
['time series visualization', 'multivariate time series']
$$$
[['storytelling', 'visual storytelling'], '^^^^^^^^^^^']
[[0.944345009798009], '@@@']
0.944345009798009
['storytelling', 'visual storytelling']
$$$
[['sports visualization', 'sports analytics'], '^^^^^^^^^^^']
[[0.953217904653702], '@@@']
0.953217904653702
['sports visualization', 'sports analytics']
$$$
[['data storytelling', 'data driven storytelling'], '^^^^^^^^^^^']
[[0.9672197344399888], '@@@']
0.9672197344399888
['data storytelling', 'data driven storytelling']
$$$
[['u

BERT Embeddings

In [50]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mp = json.load(open(r'../IEEE_papers/Result/main_participants.json'))
mp=mp[0:100]
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = True)
embedds = {}
serialized_dict = {}
for dat in mp:
    for i in dat:
        input_text =  re.sub("\s{2,}", " ", i)
        input_text = input_text.lower()
        print(input_text)
        # print(type(i))
        marked_text = "[CLS] " + input_text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        # sentence_embedding = sentence_embedding.tolist()
        embed_list = list(sentence_embedding)
        # print(type(embed_list))
        embedds[input_text]=embed_list
for key, value in embedds.items():
    ar = np.array(value)
    serialized_dict[key] = ar.tolist()
# embedds = [len(t) for t in embedds]
save_json(serialized_dict, r'../IEEE_papers/Result/embed.json')

photosensitivity
photosensitive epilepsy
accessibility
federated learning
data heterogeneity
cluster analysis
declarative specification
self service data transformation
programming by example
aesthetic pleasure
aesthetics
validated scale
explainability
neural network architecture search
deep learning
theory
theoretical and empirical research
qualitative study
confidence intervals
bar charts
uncertainty
hierarchical tabular data
tabular visualization
tabular data
music mood classification
ensemble learning
time series visualization
kirigami
physicalization
aesthetics
traces
parallel computing
event sequence visualization
equity
deficit thinking
storytelling
augmented merge tree
scalar field visualization
pixel based visualization
gaussian mixture models
ray casting
scientific visualization
beliefs
cognition
motivated perception
mathematics
physical environmental sciences
tracking transformation
dendrograms
cyber physical networks
human centered computing
molecular dynamics
progressive a

Calculating tfdif scores for all author defined keywords.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string 
import itertools

dat = json.load(open(r'../IEEE_papers/processed_data/processed_data.json'))
embedds = json.load(open(r'../IEEE_papers/Result/embed.json'))
abstracts = []
keywords = []
scores = {}
for index,datum in enumerate(dat):
    abstract = datum['Abstract'].strip()
    # print(abstract)
    keyword= datum['AuthorKeywords'].strip()
    keyword = keyword.replace("-"," ")
    abstracts.append(abstract)
    keywords.append(keyword)
# save_json(abstracts, r'../IEEE_papers/Result/abs.json')
save_json(keywords, r'../IEEE_papers/Result/keys.json')


tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(abstracts)
scores = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

keyword_scores = {}
for datum in dat:
    abstract = datum['Abstract'].strip()
    keywords= datum['AuthorKeywords'].split(',')
    keys_score = {} 
    for keys in keywords:
        keys = replace_punctuation_with_whitespace(keys)
        key = keys.split()
        score=0.0
        cnt=0
        for k in key:
            k = k.lower()
            if k in scores.keys():
                score += scores[k]
                cnt+=1
            else:
                score = 10
                cnt+=1
        if(cnt!=0) :
            score = score/cnt
        keys_score[keys] = score
    keyword_scores[abstract] = keys_score
test = dict(itertools.islice(keyword_scores.items(), 10))
save_json(keyword_scores, r'../IEEE_papers/Result/final_dict.json')



Weighted average using tdidf score, only required when using BERT embeddings.

In [22]:
dat = dat[0:20]
temp=""
for datum in dat:
    abstract = datum['Abstract'].strip()
    keywords= datum['AuthorKeywords'].split(',')
    for keys in keywords:
        count = 0
        for k in key:
            count+=1
            temp = " ".join(k)
            k = k.lower()
            if k in scores.keys():
                score += scores[k]
                cnt+=1
            else:
                score = 10
                cnt+=1
        if(temp):
            if(cnt!=0) :
                score = score/cnt
                embedds[temp]=embedds[temp]*score
                embedds[temp]=np.mean(embedds[temp], axis=0)


Cosine score dictionary and getting top 'n' main participants using tfidf score.

In [27]:
chars = json.load(open(r'../IEEE_papers/Result/main_participants.json'))
abs = json.load(open(r'../IEEE_papers/Result/abs.json'))
embedds = json.load(open(r'../IEEE_papers/Result/embed.json'))
chars = chars[0:100]
top_n_values_per_key = {}
cosine_scores = {}
main_participants = []
main = []
for key, sub_dict in keyword_scores.items():
    top_n_values = get_top_n_values(sub_dict, 3) # change number to change required main participants 
    main_participants.append(list(top_n_values.keys()))
    top_n_values_per_key[key] = top_n_values
for i in range(len(flat_list)):
    for j in range(i+1,len(flat_list)):
        rx = r'(?<=\b[^\W\d_])\s(?=[^\W\d_]\b)'
        A = flat_list[i].casefold()
        A=re.sub(r'\s+', ' ', A)
        B = flat_list[j].casefold()
        B=re.sub(r'\s+', ' ', B)
        tup = ",".join([A,B])
        cosine = calculate_cosine(embedds[A],embedds[B])
        cosine_scores[tup] = cosine
for i,datum in enumerate(chars):
    res = [(a, b) for idx, a in enumerate(datum) for b in datum[idx + 1:]]
    for index, word in enumerate(res):
        if(index<len(datum)):
            A = "".join(word[0])
            A=A.lower().strip()
            B = "".join(word[1])
            B=B.lower().strip()
            A=re.sub(r'\s+', ' ', A)
            B=re.sub(r'\s+', ' ', B)
            tupp = ",".join([A,B])
            cosine_sc = cosine_scores[tupp]
            if(cosine_sc>0.821):
                abst = abstracts[i]
                ent_type = map_pairEntities(word[0],word[1])
                if(word[0] in datum):
                    datum.remove(word[0])
                if(word[1] in datum):
                    datum.remove(word[1])
                datum.append(ent_type)
    main.append(list(datum))
save_json(main, r'../IEEE_papers/Result/test2.json')
save_json(cosine_scores, r'../IEEE_papers/Result/cosine.json')

KeyError: 'electronic health record ehr'

Generate similar words mapping using BERT Embeddings

In [28]:
embedds = json.load(open(r'../IEEE_papers/Result/embed.json'))
cosine_scores = json.load(open(r'../IEEE_papers/Result/cosine.json'))
cosine_dict = {}
check = []
for i,datum in enumerate(flat_list):
        similar = []
        if (flat_list[i] not in check):
                similar.append(flat_list[i])
                check.append(flat_list[i])
        # print(flat_list[i])
        for j in range(i+1,len(flat_list)):
                # print(type(flat_list[i]))
                # print(flat_list[j])
                tup = ','.join([flat_list[i],flat_list[j]])
                # print(type(tup))
                score = cosine_scores[tup]
                if (score>0.83 and flat_list[j] not in check or flat_list[i] not in check):
                                # print("case 1")
                                similar.append(flat_list[j])
                                check.append(flat_list[j])
                                # print("case 2")
                                # similar.append(flat_list[i])
                                # check.append(flat_list[i])
                                # # print("case 3")
                                # similar.append(flat_list[j])
                                # check.append(flat_list[j])
        similar = list(set(similar))
        check = list(set(check))
        print(similar)
        print("$$$")
        post_removed = remove_keywords(similar)
        post_removed.replace(";","")
        new_list = list(post_removed.split(","))

        if(len(new_list)>1):
                mapping = map_listEntities(new_list)
                cosine_dict[post_removed] = mapping.lower()
save_json(cosine_dict, r'../IEEE_papers/Result/map_dict.json')

KeyError: 'photosensitivity,openalex'

Function to get final event graph

In [None]:
from string import punctuation
def post_process_events(dataset):
    for index, datum in enumerate(dataset):
        events = []
        main_characters = []
        datum['doc_id'] = index
        # datum['events_raw'] = datum['events']
        events_str = datum['events'].split('\n')
        for chars in main_participants[index+1]:
            print(chars)
            main_characters.append(chars)
        for event_str in events_str:
            arguments=[]
            event_str = event_str.strip()
            components = event_str.split(':')
            event_type = components[0].strip()
            events.append({'Trigger':event_type, 'Main Participants': main_characters})
        datum['events'] = events
    return dataset

dataset = json.load(open(r'../IEEE_papers/Events/events_merged.json'))
processed_dataset = post_process_events(dataset)
save_json(processed_dataset, r'../IEEE_papers/Result/final_participants.json')


photosensitivity
photosensitive epilepsy
accessibility
Federated learning
data heterogeneity
cluster analysis
declarative specification
self service data transformation
programming by example
aesthetic pleasure
Aesthetics
validated scale
Explainability
Neural Network Architecture Search
Deep Learning
Theory
Theoretical and Empirical Research
Qualitative Study
Confidence intervals
Bar charts
Uncertainty
hierarchical tabular data
tabular visualization
tabular data
music mood classification
ensemble learning
time series visualization
kirigami
physicalization
aesthetics
traces
parallel computing
event sequence visualization
Equity
Deficit Thinking
Storytelling
augmented merge tree
Scalar field visualization
pixel based visualization
Gaussian mixture models
ray casting
Scientific visualization
Beliefs
Cognition
Motivated Perception
Mathematics
Physical   Environmental Sciences
Tracking   Transformation
Dendrograms
Cyber physical networks
Human centered computing
Molecular dynamics
progressi