In [263]:
import numpy as np
import pandas as pd 
import json
import os
import pickle
import time
import copy
from tqdm import tqdm_notebook
from random import randint
from collections import Counter

import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

In [27]:
def read_json_data(data_list, input_path): 
    '''
    Inputs: 
        - data_list: json file paths
        - input_path: input_path
        
    Output:
        - dataframe containing: 
              'paper_id', 
              'titles', 
              'abstracts', 
              'introductions', 
              'conclusions', 
              'full_bodytext', 
              'bodysections',
              'body_text_citations', 
              'context_title_list', 
              'cite_start', 
              'cite_end', 
              'cite_mark'
    This function is used to parse json files to return the output elements
    '''
    
    bibentries_title = []
    bibentries_token = []
    for json_file in range(0, len(data_list)):
        bibentries_token.append(list(data_list[json_file]['bib_entries'].keys()))

    for token_num, token_list in enumerate(bibentries_token):
        bibentry_title = []
        for token_len, token in enumerate(token_list):
            bibentry_title.append(data_list[token_num]['bib_entries'][token]['title'])
        bibentries_title.append(bibentry_title)
        
    titles = []
    all_info = []
    paper_id = []
    search_abstracts = []
    for json_file in range(0, len(data_list)):
        paper_id.append(data_list[json_file]['paper_id'])
        titles.append(data_list[json_file]['metadata']['title'])
        all_info.append(data_list[json_file]['body_text'])
        try:
            search_abstracts.append(data_list[json_file]['abstract'])
        except IndexError:
            search_abstracts.append(None)
        except KeyError:
            search_abstracts.append(None)

    abstracts = []
    for texts in search_abstracts:
        local_abstract = []
        if texts is not None:
            for num in range(0, len(texts)):
                local_abstract.append(texts[num]['text'])
        abstracts.append(' '.join(local_abstract))

    bodysections = []
    full_bodytext = []
    introductions = []
    conclusions = []
    cite_tokens = []
    cite_start = []
    cite_end = []
    cite_mark = []
    
    for text_info in all_info:
        local_info = []
        local_cite_token = []
        local_cite_start = []
        local_cite_end = []
        local_cite_mark = []
        local_introduction = []
        local_conclusion = []

        for info_len in range(0, len(text_info)):
            if text_info[info_len]['section'] == 'Introduction':
                local_introduction.append(text_info[info_len]['text'])
            elif text_info[info_len]['section'] == 'Conclusion':
                local_conclusion.append(text_info[info_len]['text'])
            local_info.append(text_info[info_len]['text'])
        for indices in text_info:
            for cite_spans in indices['cite_spans']:
                local_cite_token.append(cite_spans['ref_id'])
                local_cite_start.append(cite_spans['start'])
                local_cite_end.append(cite_spans['end'])
                try:
                    local_cite_mark.append(cite_spans['text'])
                except KeyError:
                    local_cite_mark.append(None)
        introductions.append(''.join(local_introduction))
        conclusions.append(''.join(local_conclusion))
        full_bodytext.append(' '.join(local_info))
        bodysections.append(local_info)
        cite_tokens.append(local_cite_token)
        cite_start.append(local_cite_start)
        cite_end.append(local_cite_end)
        cite_mark.append(local_cite_mark)

    bib_dict_list = []
    for bib_ref, bib_ttl in (zip(bibentries_token, bibentries_title)):
        bib_dict = {}
        for bib_bib_ref, bib_bib_ttl in zip(bib_ref, bib_ttl):
            bib_dict[bib_bib_ref] = bib_bib_ttl
        bib_dict_list.append(bib_dict)

    context_title_list = []
    for cite_val, bib_val in (zip(cite_tokens, bib_dict_list)):
        cite_set = cite_val
        bib_set = set(bib_val)
        context_title_temp = []
        for value in cite_set:
            for val in bib_set:
                if value == val:
                    context_title_temp.append(bib_val[value])
                elif value == None:
                    context_title_temp.append(None)
                    break
        context_title_list.append(context_title_temp)
        
    
    fields = {
              'paper_id': paper_id[0], 
              'titles': titles[0], 
              'abstracts': abstracts[0], 
              'introductions': introductions[0], 
              'conclusions': conclusions[0], 
              'full_bodytext': full_bodytext[0], 
              'bodysections': bodysections[0],
              'context_title_list': context_title_list[0], 
              'cite_start': cite_start[0], 
              'cite_end': cite_end[0], 
              'cite_mark': cite_mark[0]
            }
    return fields

In [52]:
flat_context_title_list = []
for i in cord_file['context_title_list']:
    for j in i:
        flat_context_title_list.append(j)

print(len(set(flat_context_title_list)))  # todo: scrape pubmed for full articles

877120


In [117]:
rule_based_titles = []
rule_based_abstracts = []
rule_based_conclusions = []

wh_titles = []
q_titles = []
do_does_titles = []

wh_abstracts = []
q_abstracts = []
do_does_abstracts = []

wh_conclusions = []
q_conclusions = []
do_does_conclusions = []

for i, j, k in tqdm_notebook(zip(cord_file['titles'], cord_file['conclusions'], cord_file['abstracts'])):
    if i.startswith('Who') or i.startswith('What') or i.startswith('Where') or i.startswith('When') or i.startswith('Why') or i.startswith('How') or i.startswith('Can ') or i.startswith('Could '):
        wh_titles.append(i)
        wh_abstracts.append(k)
        wh_conclusions.append(j)
        
    if i.endswith('?'):
        q_titles.append(i)
        q_abstracts.append(k)
        q_conclusions.append(j)
        
    if i.startswith('Do ') or i.startswith('Does '):
        do_does_titles.append(i)
        do_does_abstracts.append(k)
        do_does_conclusions.append(j)

rule_based_titles.extend(wh_titles)
rule_based_titles.extend(do_does_titles)
rule_based_titles.extend(q_titles)

rule_based_abstracts.extend(wh_abstracts)
rule_based_abstracts.extend(do_does_abstracts)
rule_based_abstracts.extend(q_abstracts)

rule_based_conclusions.extend(wh_conclusions)
rule_based_conclusions.extend(do_does_conclusions)
rule_based_conclusions.extend(q_conclusions)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [80]:
rule_based_context_titles = []
context_wh_titles = []
context_q_titles = []
context_do_does_titles = []
for i in tqdm_notebook(list(set(flat_context_title_list))):
    try:
        if i.startswith('Who') or i.startswith('What') or i.startswith('Where') or i.startswith('When') or i.startswith('Why') or i.startswith('How'):
            context_wh_titles.append(i)

        if i.endswith('?'):
            context_q_titles.append(i)

        if i.startswith('Do ') or i.startswith('Does '):
            context_do_does_titles.append(i)
    except AttributeError:
        continue

rule_based_context_titles.extend(context_wh_titles)
rule_based_context_titles.extend(context_q_titles)
rule_based_context_titles.extend(context_do_does_titles)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=877120.0), HTML(value='')))




In [68]:
str_list = list(filter(None, rule_based_conclusions))
print(len(set(str_list)))

str_list = list(filter(None, rule_based_abstracts))
print(len(set(str_list)))


94
380


In [118]:
print(len(set(rule_based_titles)))

553


In [113]:
sme_data = pd.DataFrame()
sme_data['questions'] = rule_based_titles
sme_data['abstracts'] = rule_based_abstracts
sme_data['conclusions'] = rule_based_conclusions

sme_data_cite = pd.DataFrame()
sme_data_cite['questions'] = list(set(rule_based_context_titles))

# POS Tagging

In [129]:
from nltk.tag import StanfordPOSTagger
stanford_dir = '/stanford-postagger-2018-10-16/'
modelfile = stanford_dir + 'models/english-bidirectional-distsim.tagger'
jarfile = stanford_dir + 'stanford-postagger-3.9.2.jar'

st = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

In [309]:
non_rule_based_titles = list(set(cord_file['titles']).difference(set(rule_based_titles)))

In [171]:
tagged_list = []
for sent in tqdm_notebook(non_rule_based_titles):
    try:
        tagged_list.append(st.tag(sent.split()))
    except:
        tagged_list.append('')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=28204.0), HTML(value='')))

Loading default properties from tagger /h/ /stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger
Loading POS tagger from /h/ /stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger ... done [0.6 sec].
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
	at edu.stanford.nlp.sequences.ExactBestSequenceFinder.bestSequence(ExactBestSequenceFinder.java:129)
	at edu.stanford.nlp.sequences.ExactBestSequenceFinder.bestSequence(ExactBestSequenceFinder.java:37)
	at edu.stanford.nlp.tagger.maxent.TestSentence.runTagInference(TestSentence.java:341)
	at edu.stanford.nlp.tagger.maxent.TestSentence.testTagInference(TestSentence.java:328)
	at edu.stanford.nlp.tagger.maxent.TestSentence.tagSentence(TestSentence.java:151)
	at edu.stanford.nlp.tagger.maxent.MaxentTagger.tagSentence(MaxentTagger.java:1052)
	at edu.stanford.nlp.tagger.maxent.MaxentTagger.tagCoreLabelsOrHasWords(MaxentTagger.java:1843)
	at edu.stanford.nlp.tagger.maxent.MaxentTagger

# Find NP-(VBP/VBZ) structured statement titles and convert by adding copulas ("is", "are") or auxiliary verbs ("does", "do")

### generate "yes"/"no" answer according to negation status of VB

In [229]:
pos_lengths = []
for i in pos_tagged:
    pos_lengths.append(len(i))

In [244]:
entire_pos = []
for val, length in zip(pos_tagged, pos_lengths):
    local_pos = []
    if length > 0:
        for lengths in range(0, length):
            local_pos.append(val[lengths][1])
    entire_pos.append(local_pos)

In [336]:
pos_df = pd.DataFrame()
pos_df['pos_tags'] = pos_tagged
pos_df['titles'] = non_rule_based_titles
pos_df['pos_vals'] = entire_pos
pos_df['pos_lengths'] = pos_lengths

In [337]:
merged_df = pos_df.merge(cord_file, how = 'inner', on = 'titles')

In [339]:
print(merged_df.columns.values)

['pos_tags' 'titles' 'pos_vals' 'pos_lengths' 'paper_id' 'abstracts'
 'introductions' 'conclusions' 'full_bodytext' 'bodysections'
 'context_title_list' 'cite_start' 'cite_end' 'cite_mark']


In [355]:
merged_df = merged_df[merged_df.pos_lengths != 0].drop_duplicates(subset = 'titles' ,keep=False)

In [357]:
df_artificial = merged_df[['titles', 'pos_vals', 'pos_lengths', 'abstracts', 'conclusions']]

In [358]:
flat_pos = []
for pos in df_artificial['pos_vals']:
    for val in pos:
        flat_pos.append(val)
print(Counter(flat_pos))

Counter({'NN': 116747, 'JJ': 56875, 'IN': 56712, 'NNP': 48998, 'NNS': 30362, 'DT': 18705, 'CC': 15581, 'FW': 9601, 'CD': 6044, 'VBG': 5904, 'VBN': 4424, 'VBZ': 3596, 'TO': 3158, 'NNPS': 2234, 'VB': 1621, 'RB': 1554, 'VBP': 1227, 'VBD': 683, 'PRP$': 677, 'SYM': 401, 'WDT': 202, 'JJR': 191, 'MD': 150, ':': 142, 'PRP': 116, 'WRB': 90, ',': 86, 'WP': 86, '.': 45, 'LS': 43, 'RBR': 33, 'JJS': 30, 'RP': 14, '#': 11, '$': 11, 'UH': 9, "''": 7, 'RBS': 5, 'EX': 4, 'WP$': 2, 'PDT': 2, 'POS': 2, '``': 1})


In [359]:
vb_tags = []
vb_titles = []
for tag, val in zip(df_artificial['pos_vals'], df_artificial['titles']):
    if ('VBG' or 'VBP' or 'VBZ' or 'VBD' or 'VBN') in tag:
        vb_tags.append(tag)
        vb_titles.append(val)

In [362]:
import random

# improve with better set of rules

prepend_list = ['Do', 'Does', 'What', 'Where', 'When', 'Why', 'How', 'Can', 'Could']

lower_case = lambda s: s[:1].lower() + s[1:] if s else ''

vb_questions = []
for title in vb_titles:
    vb_questions.append(random.choice(prepend_list) + ' ' + lower_case(title) + '?')

In [365]:
vb_df = pd.DataFrame()
vb_df['questions'] = vb_questions
vb_df['titles'] = vb_titles

In [366]:
df_artificial_final = df_artificial.merge(vb_df, how = 'inner', on = 'titles')

In [368]:
df_artificial_final = df_artificial_final[['titles', 'questions', 'abstracts', 'conclusions']]

In [301]:
rand_num = randint(0, len(vb_questions))
for question in vb_questions[rand_num: rand_num+20]:
    print(question)
    print('---')

Where pandemic H1N1 in Canada and the use of evidence in developing public health policies e A policy analysis?
---
When characterizing the Transmission Potential of Zoonotic Infections from Minor Outbreaks?
---
What open Access A quality assessment of genetic association studies supporting susceptibility and outcome in acute lung injury?
---
Does health behavior education, e-research and a (H1N1) influenza (Swine Flu): bridging the gap between intentions and health behavior change?
---
Do distinguishing Molecular Features and Clinical Characteristics of a Putative New Rhinovirus Species, Human Rhinovirus C (HRV C)?
---
Could supporting on-line material?
---
What development of a duplex real-time RT-qPCR assay to monitor genome replication, gene expression and gene insert stability during in vivo replication of a prototype live attenuated canine distemper virus vector encoding SIV gag?
---
Can connectivity analyses of bioenergetic changes in schizophrenia: Identification of novel treat

In [303]:
vb_df = pd.DataFrame()
vb_df['questions'] = vb_questions