In [1]:
import os
import json
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=2)
from collections import Counter
import re
import numpy as np

In [2]:
# Relative locations of the raw input data and of the intermediate outputs.
dir_data_raw = os.path.join("..", "data", "raw")
data_dir_interim = os.path.join("..", "data", "interim")

# Sub-directories of the raw data folder that are searched for article JSON files.
datasets = [
    'biorxiv_medrxiv',
    'comm_use_subset',
    'noncomm_use_subset',
    'pmc_custom_license',
]

## Formatting
These formatting helper functions are courtesy of [xhlulu](https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv)

In [12]:
def format_name(author):
    """Build a display name of the form "first [middle ...] last".

    ``author`` is indexed by the keys ``first``, ``middle`` and ``last``;
    ``middle`` holds a sequence of strings. The middle names are inserted
    (space-separated) only when that sequence is non-empty.
    """
    middle_names = author['middle']
    joined_middle = " ".join(middle_names)

    if middle_names:
        parts = [author['first'], joined_middle, author['last']]
    else:
        parts = [author['first'], author['last']]
    return " ".join(parts)


def format_affiliation(affiliation):
    """Render an affiliation as "institution, location value, ...".

    Reads the optional ``location`` entry (a mapping whose values are joined in
    order) and the optional ``institution`` entry, which is placed first when
    present. Returns an empty string when neither is available.
    """
    location = affiliation.get('location')
    location_parts = list(location.values()) if location else []

    institution = affiliation.get('institution')
    if institution:
        return ", ".join([institution] + location_parts)
    return ", ".join(location_parts)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of ``authors`` into one comma-separated string.

    When ``with_affiliation`` is true, each name is followed by its formatted
    affiliation in parentheses, provided that affiliation is non-empty.
    """
    def render(author):
        # One author -> "Name" or "Name (Affiliation)".
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join([render(author) for author in authors])

def format_body(body_text):
    """Merge body paragraphs into a single string grouped by section.

    ``body_text`` is a sequence of mappings with ``section`` and ``text`` keys.
    Texts sharing a section are concatenated (no separator) in input order, and
    sections appear in order of first occurrence. Each section is emitted as the
    heading, a blank line, the merged text and another blank line.
    """
    merged = {}
    for paragraph in body_text:
        heading, content = paragraph['section'], paragraph['text']
        merged[heading] = merged.get(heading, "") + content

    chunks = [heading + "\n\n" + content + "\n\n" for heading, content in merged.items()]
    return "".join(chunks)

def format_bib(bibs):
    """Render bibliography entries as one "; "-separated string.

    Parameters
    ----------
    bibs : dict or iterable
        Either a mapping whose values are bibliography entries, or an iterable
        of entries. Each entry is indexed by ``title``, ``authors``, ``venue``
        and ``year``.

    Returns
    -------
    str
        One "title, authors, venue, year" item per entry, joined with "; ".
        Empty input yields an empty string.

    Notes
    -----
    The entries are never modified: the author string is built in a local
    variable instead of being written back into the entry, so no defensive
    copy of the input is required.
    """
    entries = list(bibs.values()) if isinstance(bibs, dict) else bibs
    formatted = []

    for bib in entries:
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

In [13]:
# Load the article metadata and add the (initially empty) columns that the
# parsing step fills in.
metadata = pd.read_csv(
    os.path.join(dir_data_raw, "all_sources_metadata_2020-03-13.csv")
).assign(
    full_text="",
    file_path=None,
    results="",
    conclusion="",
)

In [14]:
def parse_article(full_path, file_path):
    """Parse one article JSON file and store its text in ``metadata``.

    Reads the ``body_text`` list from the JSON file at ``full_path`` and writes
    into the row of the global ``metadata`` DataFrame selected by the global
    ``index``:

    * ``full_text`` -- the whole body as rendered by ``format_body``;
    * one column per key of the global ``section_headings`` -- the concatenated
      text of every paragraph whose normalised section heading is listed for
      that key. Columns without any matching paragraph are left untouched.

    Parameters
    ----------
    full_path : str
        Path of the JSON file to read.
    file_path : str
        Path of the same file relative to the raw-data directory. It is not
        used here and is accepted only so existing calls keep working.

    Notes
    -----
    Relies on the module-level names ``metadata``, ``index``,
    ``section_headings`` and ``format_body`` being defined at call time.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(full_path, encoding="utf-8") as file:
        body_paragraphs = json.load(file)["body_text"]

    # Accumulate in local strings. Building on the row yielded by
    # DataFrame.iterrows does not work: that row is a copy taken before any
    # update, so every "append" would start again from the initial value.
    section_texts = {section: "" for section in section_headings}
    for paragraph in body_paragraphs:
        # Normalise the heading: drop punctuation, lower-case, trim.
        heading = re.sub(r'[^a-zA-Z0-9 ]', '', paragraph["section"]).lower().strip()
        for section, headings in section_headings.items():
            if heading in headings:
                section_texts[section] += paragraph["text"]

    metadata.loc[index, 'full_text'] = format_body(body_paragraphs)
    for section, text in section_texts.items():
        if text:
            metadata.loc[index, section] = text

In [15]:
# Maps each output column of `metadata` to the normalised section headings
# whose paragraphs are collected into that column.
section_headings = {
    "results": ["results and discussion", "results"],
    "conclusion": ["conclusion", "conclusions", "discussion and conclusions"]
}

# NOTE: parse_article looks up the current row through the global loop
# variables, so they must keep the names `index` and `article`.
for index, article in metadata.iterrows():
    # We only need to update if there's a full text
    if not article["has_full_text"]:
        continue

    for dataset in datasets:
        file_path = os.path.join(dataset, dataset, str(article["sha"]) + ".json")
        full_path = os.path.join(dir_data_raw, file_path)
        if os.path.exists(full_path):
            # Record the path only for the dataset that actually contains the
            # file; writing it for every candidate would always leave the last
            # dataset's path behind, whether or not the file exists there.
            metadata.loc[index, "file_path"] = file_path
            parse_article(full_path, file_path)
            # The article has been found; no need to probe the other datasets.
            break

In [16]:
# Save the enriched metadata table to the interim data directory.
metadata.to_csv(
    path_or_buf=os.path.join(data_dir_interim, "1_full_data.csv"),
)