# BERT-fy CORD-19 data

Original code from https://www.kaggle.com/theamrzaki/covid-19-bert-researchpapers-semantic-search#Data-Processing

In [None]:
import glob
import json
import pandas as pd
from tqdm import tqdm

# Directory holding the dataset (a relative path).
root_path = './../data/'

# Gather every *.json file found at any depth below the data directory.
json_pattern = f'{root_path}/**/*.json'
all_json = glob.glob(json_pattern, recursive=True)

len(all_json)

In [None]:
import os

# Keep only the metadata rows that have an attached PMC JSON parse.

metadata_path = f'{root_path}/metadata.csv'
stripped_metadata_path = f'{root_path}/stripped_metadata.csv'

# The stripped CSV doubles as a cache: it is only rebuilt when it is missing.
stripped_file_missing = not os.path.exists(stripped_metadata_path)
if stripped_file_missing:
    # Read these identifier columns as text.
    id_columns_as_text = {
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str,
    }
    meta_df = pd.read_csv(metadata_path, dtype=id_columns_as_text)

    # Drop every row whose 'pmc_json_files' cell is empty, then persist.
    stripped_meta_df = meta_df.dropna(subset=['pmc_json_files'])
    stripped_meta_df.to_csv(stripped_metadata_path)

    # Preview of the first rows; the result is not assigned or printed.
    stripped_meta_df.head()

    # Release the filtered copy once it has been written to disk.
    del stripped_meta_df

In [None]:
import subprocess
import os

# Build small_metadata.csv: a random sample of at most 12500 records from
# stripped_metadata.csv. The sample is only drawn when the file is missing,
# so re-running the cell keeps the existing sample.
small_metadata_path = f'{root_path}/small_metadata.csv'
if not os.path.exists(small_metadata_path):
    # Sample with pandas instead of piping the file through `head`/`shuf`:
    #  * `shuf` drew from every line, so the header line could end up among
    #    the data rows;
    #  * sampling by line splits quoted fields that contain newlines;
    #  * no external binaries are needed.
    # Every cell is read as text and nothing is converted to NaN, so the
    # sampled values are written back exactly as they were read.
    stripped_df = pd.read_csv(
        f'{root_path}/stripped_metadata.csv',
        dtype=str,
        keep_default_na=False,
    )

    # Never ask for more rows than exist (sample() would raise otherwise).
    sample_size = min(12500, len(stripped_df))

    # index=False: do not add yet another row-number column to the output.
    stripped_df.sample(n=sample_size).to_csv(small_metadata_path, index=False)

    del stripped_df

In [None]:
# Load the sampled metadata that the remaining cells work on.
small_metadata_path = f'{root_path}/small_metadata.csv'

# Read these identifier columns as text.
sample_id_dtypes = {
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str,
}
meta_df = pd.read_csv(small_metadata_path, dtype=sample_id_dtypes)

meta_df.head()

In [None]:
import math

class Article:
    """One paper: its id, abstract, body text and metadata rows.

    The id and body text are read from the JSON parse at
    ``{root_path}/document_parses/pmc_json/{pmcid}.xml.json``; the abstract
    and the metadata come from the rows of the module-level ``meta_df``
    whose ``pmcid`` column equals the requested id.

    Attributes:
        paper_id: ``paper_id`` field of the JSON parse ('' when no id given).
        abstract: Abstract text(s) of the matching metadata rows, joined
            with newlines ('' when there is none).
        body_text: ``text`` of every ``body_text`` entry of the JSON parse,
            joined with newlines.
        metadata: The matching rows of ``meta_df`` (possibly zero rows), or
            an empty dict when no id was given.
    """

    def __init__(self, pmcid):
        """Load the paper identified by *pmcid*.

        Args:
            pmcid: Identifier used both as the JSON file stem and as the
                lookup key in ``meta_df['pmcid']``. ``None`` or NaN produces
                an empty article without touching the disk.

        Raises:
            OSError: If the JSON parse for *pmcid* cannot be opened.
        """
        # Set every attribute up front so that an article built from a
        # missing id is still fully usable (text attributes are always str,
        # and `metadata` always exists and has length 0).
        self.paper_id = ''
        self.abstract = ''
        self.body_text = ''
        self.metadata = {}

        # A missing id shows up as None or as a float NaN.
        if pmcid is None or (not isinstance(pmcid, str) and math.isnan(pmcid)):
            return

        parse_path = f"{root_path}/document_parses/pmc_json/{pmcid}.xml.json"
        with open(parse_path, encoding='utf-8') as file:
            content = json.load(file)

        # Boolean-mask selection always returns a frame (possibly empty).
        content_metadata = meta_df.loc[meta_df['pmcid'] == pmcid]

        self.paper_id = content['paper_id']
        self.metadata = content_metadata

        # Abstract: a missing abstract is a float NaN in the frame, which
        # str.join cannot handle, so only real strings are kept.
        abstract_parts = []
        if 'abstract' in content_metadata:
            abstract_parts = [
                entry for entry in content_metadata['abstract']
                if isinstance(entry, str)
            ]

        # Body text
        body_parts = []
        if 'body_text' in content:
            body_parts = [entry['text'] for entry in content['body_text']]

        self.abstract = '\n'.join(abstract_parts)
        self.body_text = '\n'.join(body_parts)

    def __repr__(self):
        """Return the id followed by the first 200 characters of each text."""
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

# Smoke test: load the paper at index label 0 of the metadata and show it.
first_pmcid = meta_df['pmcid'][0]
first_row = Article(first_pmcid)
print(first_row)
# meta_df.iloc[0]

In [None]:
def get_breaks(content, length):
    """Return *content* with ``<br>`` inserted roughly every *length* characters.

    The text is split on single spaces and a running total of the word
    lengths (spaces are not counted) is kept. The word that pushes the total
    above *length* is prefixed with ``<br>`` and the total restarts at zero;
    every other word is prefixed with one space. As a result the returned
    string starts with a space unless the first word alone exceeds *length*.

    Args:
        content: Text to break up.
        length: Number of word characters allowed before a break is inserted.

    Returns:
        The re-joined text with ``<br>`` markers.
    """
    pieces = []
    running_total = 0

    for word in content.split(' '):
        running_total += len(word)
        if running_total > length:
            # Limit passed: break before this word and restart the count.
            pieces.append("<br>")
            running_total = 0
        else:
            pieces.append(" ")
        pieces.append(word)

    return "".join(pieces)

In [None]:
# Build df_covid: one row per paper of meta_df that has metadata, holding
# the full texts plus shortened, <br>-wrapped fields meant for plotting.
covid_columns = ['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary']
dict_ = {column: [] for column in covid_columns}

# Report progress about every 10% of the rows. max() keeps the step at 1 or
# more: with fewer than 10 rows `len(meta_df) // 10` is 0 and the modulo
# below would raise ZeroDivisionError.
progress_step = max(1, len(meta_df) // 10)

for idx, entry in enumerate(meta_df['pmcid']):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(meta_df)}')
    content = Article(entry)

    # No metadata (attribute absent or empty): skip this paper.
    meta_data = getattr(content, 'metadata', None)
    if meta_data is None or len(meta_data) == 0:
        continue

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)

    # Also create a column for the summary of the abstract to be used in a plot.
    if not content.abstract:
        # no abstract provided
        summary = "Not provided."
    else:
        abstract_words = content.abstract.split(' ')
        if len(abstract_words) > 100:
            # Too long for the plot: keep the first 100 words and mark the cut.
            summary = get_breaks(' '.join(abstract_words[:100]), 40) + "..."
        else:
            # abstract is short enough
            summary = get_breaks(content.abstract, 40)
    dict_['abstract_summary'].append(summary)

    # Authors are a ';'-separated string; a missing value is not a string
    # and is stored unchanged.
    authors_raw = meta_data['authors'].values[0]
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            # More than 2 authors may not fit in the plot: keep 2, mark the cut.
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    else:
        dict_['authors'].append(authors_raw)

    # Title, with breaks added when needed; a missing title is stored unchanged.
    title_raw = meta_data['title'].values[0]
    if isinstance(title_raw, str):
        dict_['title'].append(get_breaks(title_raw, 40))
    else:
        dict_['title'].append(title_raw)

    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])

df_covid = pd.DataFrame(dict_, columns=covid_columns)
df_covid.head()