Some exploration and preprocessing to create one dataframe and export it as .csv.

Adjusted to data from 2020-05-01.

# Load Packages

In [None]:
import numpy as np 
import pandas as pd

import glob
import json

# Load and Prepare Data

To read the JSON files we follow [COVID EDA: Initial Exploration Tool](https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool).

In [None]:
# Location of the CORD-19 dataset and of its metadata table.
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'

# Identifier-like columns are read as strings rather than left to dtype inference.
string_columns = ['pubmed_id', 'Microsoft Academic Paper ID', 'doi']
meta_df = pd.read_csv(metadata_path, dtype=dict.fromkeys(string_columns, str))
meta_df.head(2)

In [None]:
len(meta_df) - meta_df.has_pdf_parse.sum()

In [None]:
meta_df.isnull().sum()

In [None]:
meta_df.cord_uid.nunique()

In [None]:
meta_df.sha.nunique()

In [None]:
meta_df.title.nunique()

In [None]:
# glob returns its matches in arbitrary order; sorting makes the processing order
# (and therefore all_json[0] and the row order of the resulting frame) reproducible.
all_json = sorted(glob.glob(f'{root_path}/**/pdf_json/*.json', recursive=True))
len(all_json)

In [None]:
# glob returns its matches in arbitrary order; sorting makes the processing order
# (and therefore all_json_pmc[0] and the row order of the resulting frame) reproducible.
all_json_pmc = sorted(glob.glob(f'{root_path}/**/pmc_json/*.json', recursive=True))
len(all_json_pmc)

# pdf_json

In [None]:
# Section-title phrases that mark a "methods"-like section of a paper.
# NOTE(review): the FileReader classes in this notebook build their own local copy of
# this list and do not appear to read this module-level variable - confirm whether it
# is still needed.
methods = ['methods','method','statistical methods','materials','materials and methods',
                'data collection','the study','study design','experimental design','objective',
                'objectives','procedures','data collection and analysis', 'methodology',
                'material and methods','the model','experimental procedures','main text']

In [None]:
# [''.join(x.lower() for x in m if x.isalpha()) for m in methods]

# for m in methods:
#     print(''.join(x.lower() for x in m if x.isalpha()))

In [None]:
class FileReader:
    """Parse one ``pdf_json`` file into plain-text fields.

    Attributes set by the constructor:
        paper_id:  value of the ``paper_id`` key of the JSON document.
        abstract:  abstract paragraphs joined by newlines ('' if there are none).
        body_text: all body paragraphs joined by newlines.
        methods:   body paragraphs whose section title contains a methods phrase.
        results:   body paragraphs whose section title contains 'result'.
    """

    # Section-title phrases that mark a methods-like section. They are compared after
    # normalisation (see _normalize), as substrings of the normalised section title.
    _METHOD_TITLES = (
        'methods', 'method', 'statistical methods', 'materials', 'materials and methods',
        'data collection', 'the study', 'study design', 'experimental design', 'objective',
        'objectives', 'procedures', 'data collection and analysis', 'methodology',
        'material and methods', 'the model', 'experimental procedures', 'main text',
    )
    # Already in normalised form (lower-case letters only).
    _RESULT_TITLES = ('result',)

    @staticmethod
    def _normalize(title):
        """Lower-case *title* and keep only its alphabetic characters."""
        return ''.join(ch.lower() for ch in title if ch.isalpha())

    def __init__(self, file_path):
        """Read and parse the JSON document at *file_path*.

        Raises:
            KeyError: if the document has no ``paper_id`` or ``body_text`` key, or a
                paragraph has no ``text`` key.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        # Normalise the phrases once per file instead of once per paragraph.
        method_keys = [self._normalize(title) for title in self._METHOD_TITLES]

        # A missing or null abstract is treated as an empty one.
        abstract = [entry['text'] for entry in content.get('abstract') or []]

        body_text, methods, results = [], [], []
        # Single pass over the body: every paragraph goes to body_text and, depending
        # on its section title, additionally to methods and/or results.
        for entry in content['body_text']:
            text = entry['text']
            body_text.append(text)
            section_title = self._normalize(entry.get('section') or '')
            if any(key in section_title for key in method_keys):
                methods.append(text)
            if any(key in section_title for key in self._RESULT_TITLES):
                results.append(text)

        self.paper_id = content['paper_id']
        self.abstract = '\n'.join(abstract)
        self.body_text = '\n'.join(body_text)
        self.methods = '\n'.join(methods)
        self.results = '\n'.join(results)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
print(first_row)

In [None]:
# Parse every pdf_json file and collect the extracted fields column-wise.
pdf_fields = ['paper_id', 'abstract', 'body_text', 'methods', 'results']
dict_ = {field: [] for field in pdf_fields}

# Report progress roughly every 10 % of the files. max(1, ...) keeps the modulo
# valid when fewer than 10 files were found (len // 10 would be 0).
progress_step = max(1, len(all_json) // 10)
for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    for field in pdf_fields:
        dict_[field].append(getattr(content, field))

In [None]:
papers = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'methods', 'results'])
papers.head()

In [None]:
papers[(papers.results.str.len() != 0) | (papers.methods.str.len() != 0)].shape

In [None]:
df = pd.merge(papers, meta_df, left_on='paper_id', right_on='sha', how='left').drop('sha', axis=1)

In [None]:
df.columns

# pmc_json

This only contains the full text - no abstracts!

In [None]:
class FileReader:
    """Parse one ``pmc_json`` file into plain-text fields (no abstract is read).

    Attributes set by the constructor:
        paper_id:  value of the ``paper_id`` key of the JSON document.
        body_text: all body paragraphs joined by newlines.
        methods:   body paragraphs whose section title contains a methods phrase.
        results:   body paragraphs whose section title contains 'result'.
    """

    # Section-title phrases that mark a methods-like section. They are compared after
    # normalisation (see _normalize), as substrings of the normalised section title.
    _METHOD_TITLES = (
        'methods', 'method', 'statistical methods', 'materials', 'materials and methods',
        'data collection', 'the study', 'study design', 'experimental design', 'objective',
        'objectives', 'procedures', 'data collection and analysis', 'methodology',
        'material and methods', 'the model', 'experimental procedures', 'main text',
    )
    # Already in normalised form (lower-case letters only).
    _RESULT_TITLES = ('result',)

    @staticmethod
    def _normalize(title):
        """Lower-case *title* and keep only its alphabetic characters."""
        return ''.join(ch.lower() for ch in title if ch.isalpha())

    def __init__(self, file_path):
        """Read and parse the JSON document at *file_path*.

        Raises:
            KeyError: if the document has no ``paper_id`` or ``body_text`` key, or a
                paragraph has no ``text`` key.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        # Normalise the phrases once per file instead of once per paragraph.
        method_keys = [self._normalize(title) for title in self._METHOD_TITLES]

        body_text, methods, results = [], [], []
        # Single pass over the body: every paragraph goes to body_text and, depending
        # on its section title, additionally to methods and/or results.
        for entry in content['body_text']:
            text = entry['text']
            body_text.append(text)
            section_title = self._normalize(entry.get('section') or '')
            if any(key in section_title for key in method_keys):
                methods.append(text)
            if any(key in section_title for key in self._RESULT_TITLES):
                results.append(text)

        self.paper_id = content['paper_id']
        self.body_text = '\n'.join(body_text)
        self.methods = '\n'.join(methods)
        self.results = '\n'.join(results)

    def __repr__(self):
        return f'{self.paper_id}: {self.body_text[:200]}...'
first_row = FileReader(all_json_pmc[0])
print(first_row)

In [None]:
# Parse every pmc_json file and collect the extracted fields column-wise.
pmc_fields = ['paper_id', 'body_text', 'methods', 'results']
dict_ = {field: [] for field in pmc_fields}

# Report progress roughly every 10 % of the files. max(1, ...) keeps the modulo
# valid when fewer than 10 files were found (len // 10 would be 0).
progress_step = max(1, len(all_json_pmc) // 10)
for idx, entry in enumerate(all_json_pmc):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json_pmc)}')
    content = FileReader(entry)
    for field in pmc_fields:
        dict_[field].append(getattr(content, field))

In [None]:
pmc_text = pd.DataFrame(dict_, columns=['paper_id', 'body_text', 'methods', 'results'])
pmc_text.head()

In [None]:
pmc_text.shape

Careful, some of the new texts are empty strings!

In [None]:
pmc_text[pmc_text.body_text == '']

In [None]:
pmc_text = pmc_text[pmc_text.body_text != '']

In [None]:
pmc_text.shape

In [None]:
df.head()

In [None]:
df = pd.merge(df, pmc_text, left_on='pmcid', right_on='paper_id', how='left').drop('paper_id_y', axis=1)

In [None]:
df.head(3)

In [None]:
df.columns

In [None]:
df.drop(columns=['has_pdf_parse', 'has_pmc_xml_parse', 'Microsoft Academic Paper ID', 'WHO #Covidence'], inplace=True)

In [None]:
df.head(2)

# Exploration/Cleaning

### Different Abstract in Metadata and JSON files

abstract_x from json, abstract_y from metadata

In [None]:
df[df.abstract_x != df.abstract_y].shape

In [None]:
df[df.abstract_x != df.abstract_y][['abstract_x', 'abstract_y', 'url']].tail(10)

In [None]:
df[df.abstract_x != df.abstract_y][['abstract_x', 'abstract_y', 'url']].url.iloc[-1]

Checking some of the files online, it seems that where the abstract is missing in the metadata, the abstract in the JSON file is simply the beginning of the text.

In [None]:
# Rows where the JSON abstract differs from the metadata abstract, restricted to papers
# that have a URL, a non-empty JSON abstract and no metadata abstract.
# Both conditions are combined into one mask over the full frame; applying a full-frame
# mask to an already filtered frame makes pandas reindex the mask and emit a warning.
abstract_mismatch = df.abstract_x != df.abstract_y
json_abstract_only = df.abstract_y.isnull() & (df.abstract_x != '') & ~df.url.isnull()
df.loc[abstract_mismatch & json_abstract_only,
       ['abstract_x', 'abstract_y', 'url', 'body_text_x', 'body_text_y']]

body_text_x is from pdf, body_text_y from pmc

In [None]:
df.shape

In [None]:
df.abstract_x.isnull().sum(), (df.abstract_x =='').sum() # missing abstracts in json files

In [None]:
df.abstract_y.isnull().sum(), (df.abstract_y=='').sum() # missing abstracts in metadata

Since the abstracts from the metadata seem more reliable, we generally use them and fill in missing values with the abstract extracted from the JSON file.

In [None]:
# Where the metadata has no abstract but the JSON file has a non-empty one,
# copy the JSON abstract into the metadata column.
fill_from_json = df.abstract_y.isnull() & (df.abstract_x != '')
df.loc[fill_from_json, 'abstract_y'] = df.loc[fill_from_json, 'abstract_x']

In [None]:
df.abstract_y.isnull().sum()

The remaining missing values are also empty in the JSON files.

In [None]:
(df.abstract_y.isnull() & (df.abstract_x!='')).sum()

In [None]:
df.rename(columns = {'abstract_y': 'abstract'}, inplace=True)
df.drop('abstract_x', axis=1, inplace=True)

In [None]:
df.columns

We still have to compare the text body from pdf and pmc files.

In [None]:
df.shape

# Quick comparison of both texts

In [None]:
df.shape

In [None]:
(df.body_text_x != df.body_text_y).sum()

In [None]:
df[(df.body_text_x != df.body_text_y) & df.body_text_y.notnull()][['body_text_x', 'body_text_y']].head(10).iloc[2].values[0][:500]

In [None]:
df[(df.body_text_x != df.body_text_y) & df.body_text_y.notnull()][['body_text_x', 'body_text_y']].head(10).iloc[2].values[1][:500]

In [None]:
df[df.body_text_x != df.body_text_y].head()

In [None]:
df.iloc[34885].body_text_x[:500]

In [None]:
df.iloc[34885].body_text_y[:500]

In [None]:
df.iloc[34885].url

In [None]:
df.iloc[34888].body_text_x[:500]

In [None]:
df.iloc[34888].body_text_y[:500]

In [None]:
df.iloc[34888].url

In [None]:
df.iloc[1337].body_text_x[:500]

In [None]:
df.iloc[1337].body_text_y[:500]

In [None]:
df.iloc[1337].url

In [None]:
df.iloc[1242].body_text_x[:500]

In [None]:
df.iloc[1242].body_text_y[:500]

In [None]:
df.iloc[1242].url

Where available we use the text from the pmc file (body_text_y), trusting the statement that it is of higher quality.

In [None]:
df.body_text_x.isnull().sum(), df.body_text_y.isnull().sum()

In [None]:
(df.body_text_x == '').sum(), (df.body_text_y == '').sum()

In [None]:
df.loc[df.body_text_y.notnull(), 'body_text_x'] = df.loc[df.body_text_y.notnull(), 'body_text_y']

In [None]:
df.body_text_x.isnull().sum()

In [None]:
df.rename(columns = {'body_text_x': 'body_text'}, inplace=True)
df.drop('body_text_y', axis=1, inplace=True)

In [None]:
df.columns

In [None]:
df[['methods_x', 'methods_y', 'url']][df.methods_y.notnull()]

In [None]:
(df.methods_x == '').sum(), df.methods_x.isnull().sum()

In [None]:
(df.methods_y == '').sum(), df.methods_y.isnull().sum()

In [None]:
# For methods and results alike: where the pmc column (_y) holds a non-empty text,
# it replaces the value in the pdf column (_x).
for section in ('methods', 'results'):
    pmc_values = df[f'{section}_y']
    mask = pmc_values.notnull() & (pmc_values != '')
    df.loc[mask, f'{section}_x'] = df.loc[mask, f'{section}_y']

In [None]:
(df.results_x == '').sum(), df.results_x.isnull().sum()

In [None]:
(df.results_y == '').sum(), df.results_y.isnull().sum()

In [None]:
df.rename(columns = {'methods_x': 'methods', 'results_x': 'results'}, inplace=True)
df.drop(columns=['methods_y', 'results_y'], inplace=True)

In [None]:
df.rename(columns = {'paper_id_x': 'paper_id', 'source_x': 'source'}, inplace=True)

In [None]:
df.columns

In [None]:
df.head()

# Duplicates

Some paper ids are duplicated

In [None]:
len(df)

In [None]:
df.paper_id.nunique()

In [None]:
df[df.duplicated(subset=['paper_id'], keep=False)][['paper_id', 'body_text']]

But luckily they also have the same text body. So we will just keep one article per paper_id.
Check for example [https://www.sciencedirect.com/science/article/pii/S1386653209701295?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S1386653209701295?via%3Dihub) and [https://www.sciencedirect.com/science/article/pii/S1386653209701325?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S1386653209701325?via%3Dihub) - they have the same content.

In [None]:
df[df.duplicated(subset=['paper_id', 'body_text'], keep=False)].shape

In [None]:
df.drop_duplicates(['paper_id', 'body_text'], inplace=True)

In [None]:
len(df)

In [None]:
df[df.duplicated(['paper_id'], keep=False)].head(2)

In [None]:
df.drop_duplicates(['paper_id'], inplace=True)

In [None]:
df.paper_id.nunique()

In [None]:
df.shape

Now the paper_id is unique.

In [None]:
df.isnull().sum()

# Some new columns for convenience

In [None]:
# some new columns for convenience
df['publish_year'] = df.publish_time.str[:4].fillna(-1).astype(int) # 360 times None
# df['link'] = 'http://dx.doi.org/' + df.doi #dataset now has url column

In [None]:
# Flag papers whose body text mentions COVID-19 under one of several names. The terms are
# joined into one regular expression and matched case-insensitively.
# NOTE(review): 'sar cov 2' looks like a typo for 'sars cov 2', and ' Wuhan coronavirus'
# carries a leading space - confirm before changing, since that alters which papers match.
covid19_terms = [
    'COVID-19', 'covid', 'sar cov 2', 'SARS-CoV-2', '2019-nCov', '2019 ncov',
    'SARS Coronavirus 2', '2019 Novel Coronavirus', 'coronavirus 2019',
    ' Wuhan coronavirus', 'wuhan pneumonia', 'wuhan virus',
]
# na=False: a missing body text counts as "no mention", so the column is strictly boolean
# and can be combined with other boolean masks.
df['is_covid19'] = df.body_text.str.contains('|'.join(covid19_terms), case=False, na=False)

In [None]:
df.is_covid19.sum()

# Language Detection to remove non-English articles and abstracts

In [None]:
from IPython.utils import io

# Install the NLP dependencies. Everything pip prints is captured into `captured`
# instead of being shown, so a failing install is not visible unless `captured` is inspected.
with io.capture_output() as captured:
    !pip install scispacy
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
    !pip install spacy-langdetect
    # NOTE(review): `spac` looks like a typo for `spacy`, and this line requests the
    # v0.2.3 model although v0.2.4 is installed above - confirm which is intended and
    # whether this line is still needed.
    !pip install spac scispacy spacy_langdetect https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_core_sci_lg-0.2.3.tar.gz

In [None]:
import scispacy
import spacy
import en_core_sci_lg
from spacy_langdetect import LanguageDetector

In [None]:
# Load the en_core_sci_lg model (the "lg" package imported above, not a medium model)
# with its "tagger" and "ner" components disabled.
nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
# Allow documents of up to 2,000,000 characters.
nlp.max_length = 2000000
# Append the language detector as the last pipeline component; the cells below read its
# result through the `._.language` extension.
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [None]:
doc = nlp('This is some English text. Das ist ein Haus. This is a house.')
doc._.language

In [None]:
for s in doc.sents:
    print(s._.language)

In [None]:
doc = nlp(df[df.paper_id == '1a8a4dbbaa94ced4ef6af69ec7a09d3fa4c0eece'].body_text.iloc[0])

In [None]:
doc[:500]

In [None]:
# Concatenate (without separator) the sentences that the detector labels as English.
doc_engl = ''.join(
    sentence.text
    for sentence in doc.sents
    if sentence._.language['language'] == 'en'
)

In [None]:
doc_engl[:2000]

Check language of each text body (only use the first 2000 characters).

In [None]:
def _leading_language(text, n_chars=2000):
    """Return the language code detected for the first *n_chars* characters of *text*.

    The value is converted to ``str`` before slicing, so a non-string value (for
    example a missing text) does not raise on the slice.
    """
    return nlp(str(text)[:n_chars])._.language['language']


df['text_language'] = df.body_text.apply(_leading_language)

df.text_language.value_counts()

## Number of non-English texts to drop.

In [None]:
df.loc[df[df.text_language != 'en'].index].shape

In [None]:
# Remove every row whose detected language is anything other than 'en'.
non_english_index = df.index[df.text_language != 'en']
df = df.drop(index=non_english_index)

In [None]:
# Check language of all abstracts

# df['abstract_lang'] = df.abstract.apply(lambda x: nlp(str(x))._.language['language'])

#  df[df.abstract.isnull()]

In [None]:
# Number of non-english abstracts

# df[(df.abstract_lang != 'en') & (df.abstract.notnull())].abstract_lang.value_counts()

# Keep all english abstracts and those without abstract

# df = df[(df.abstract_lang == 'en') | (df.abstract.isnull())]

# df.shape

# df.paper_id.nunique()

# Analyze title/text body of the papers without abstract

# temp = df[df.abstract.isnull()].copy()

# def remove_non_english_sentences(doc):
#     doc = nlp(doc)
#     doc_engl = ''
#     for s in doc.sents:
#         if (s._.language['language'] == 'en'):
#             doc_engl += s.text 
#     return doc_engl

# remove_non_english_sentences(df[df.paper_id == '1a8a4dbbaa94ced4ef6af69ec7a09d3fa4c0eece'].body_text.iloc[0])

# temp['text_length'] = temp.body_text.apply(lambda x: len(x))

# temp['english_text'] = temp.body_text.apply(remove_non_english_sentences)

# temp['english_length'] = temp.english_text.apply(lambda x: len(x))

# temp.to_csv('df_english.csv', index=False)

# (temp.english_length/temp.text_length).hist()

# ((temp.english_length/temp.text_length)<0.8).sum()

# temp[((temp.english_length/temp.text_length)<0.8)].head()

# temp[temp.paper_id == '7925057cfe0cb75ae6079879cb2d22d23e42dfa5'].body_text.values[0][:500]

# temp[temp.paper_id == '617197cc751a9208cb0af1b4e31baeddc8d2e985'].body_text.values[0]

# temp[temp.paper_id == 'ca51b53fa512085e1aa166d5308602ff1666a90c'].body_text.values[0][:500]

# df = df.drop(temp[((temp.english_length/temp.text_length)<0.8)].index)

In [None]:
# temp['title_lang'] = df.title.apply(lambda x: nlp(str(x))._.language['language'])

# temp.title_lang.value_counts()

# Too many false positives.

# temp[temp.paper_id == '6f6b7b1efffae7f3765f29fe801ab63dd35110bb'].body_text.values[0]

# temp[temp.title_lang == 'de']

# We check the beginning of each text body instead.

# temp['text_lang'] = df.body_text.apply(lambda x: nlp(str(x[:2000]))._.language['language'])

# temp.text_lang.value_counts()

# Number of non-english texts to drop.

# df.loc[temp[temp.text_lang != 'en'].index].shape

# df = df.drop(temp[temp.text_lang != 'en'].index)

# Extract Study Design/ Methodological Keywords

In [None]:
# filter_dict = {
#     "discussion": ["conclusions","conclusion",'| discussion', "discussion",  'concluding remarks',
#                    'discussion and conclusions','conclusion:', 'discussion and conclusion',
#                    'conclusions:', 'outcomes', 'conclusions and perspectives', 
#                    'conclusions and future perspectives', 'conclusions and future directions'],
#     "results": ['executive summary', 'result', 'summary','results','results and discussion','results:',
#                 'comment',"findings"],
#     "introduction": ['introduction', 'background', 'i. introduction','supporting information','| introduction'],
#     "methods": ['methods','method','statistical methods','materials','materials and methods',
#                 'data collection','the study','study design','experimental design','objective',
#                 'objectives','procedures','data collection and analysis', 'methodology',
#                 'material and methods','the model','experimental procedures','main text',],
#     "statistics": ['data analysis','statistical analysis', 'analysis','statistical analyses', 
#                    'statistics','data','measures'],
#     "clinical": ['diagnosis', 'diagnostic features', "differential diagnoses", 'classical signs','prognosis', 'clinical signs', 'pathogenesis',
#                  'etiology','differential diagnosis','clinical features', 'case report', 'clinical findings',
#                  'clinical presentation'],
#     'treatment': ['treatment', 'interventions'],
#     "prevention": ['epidemiology','risk factors'],
#     "subjects": ['demographics','samples','subjects', 'study population','control','patients', 
#                'participants','patient characteristics'],
#     "animals": ['animals','animal models'],
#     "abstract": ["abstract", 'a b s t r a c t','author summary'], 
#     "review": ['review','literature review','keywords']}

In [None]:
study_designs = {'RCT': ['RCT', 'randomized controlled trial', 'randomised controlled trial', 'randomized control trial', 'randomised control trial',
                         'randomized clinical trial','randomised clinical trial'], 
                'time series analysis': ['time series analysis', 'time series', 'survival analysis'],
                'retrospective cohort': ['retrospective cohort'],
                'cross-sectional case-control': ['cross-sectional case-control', 'cross sectional case control', 'cross-sectional case control'],
                'prospective case-control': ['prospective case-control', 'prospective case control'],
                'matched case-control': ['matched case-control', 'matched case control'],
                'medical records review': ['medical records review'],
                'prevalence survey': ['prevalence survey'],
                'syndromic surveillance': ['syndromic surveillance'],
                'systematic review': ['systematic review'],
                'meta-analysis': ['meta-analysis', 'meta analysis', 'meta-syntheses'],
                'interventional study': ['interventional study'],
                'association': ['association', 'associated with'],
                 'p-value': ['p-value', 'p value'],
                 'pseudo-randomized controlled trial': ['pseudo-randomized controlled trial', 'pseudo-randomised controlled trial']
                }

Keywords from  [https://docs.google.com/spreadsheets/d/1t2e3CHGxHJBiFgHeW0dfwtvCG4x0CDCzcTFX7yz9Z2E/edit#gid=1217643351](https://docs.google.com/spreadsheets/d/1t2e3CHGxHJBiFgHeW0dfwtvCG4x0CDCzcTFX7yz9Z2E/edit#gid=1217643351)

In [None]:
generic_keywords = ['estimation',
 'prevalence survey',
 'response rate',
 'incidence',
 'psychometric evaluation of instrument',
 'median time to event',
 'pooled OR',
 'd-pooled',
 'randomized controlled trial',
 'non-randomized',
 'allocation method',
 'Cochrane review',
 'Cox proportional hazards',
 'gamma',
 'Weibull',
 'pseudo-randomised',
 'chart review',
 'log odds',
 'surveillance',
 'time-to-event analysis',
 'pooled adjusted odds ratio',
 'pooled relative risk',
 'data abstraction forms',
 'frequency',
 'etiology logistic regression',
 'exclusion criteria',
 'eligibility criteria',
 'right-censored',
 'pooled odds ratio',
 'non-comparative study',
 'medical records review',
 'CONSORT',
 'number of controls per case',
 'quasi-randomised',
 'risk of bias',
 'publication bias',
 'syndromic surveillance',
 'truncated',
 'longitudinal',
 'matching criteria',
 'double-blind',
 "Cohen's d",
 'registry data',
 'Adjusted Odds Ratio',
 'questionnaire development',
 'Kaplan-Meier',
 'heterogeneity',
 'recruitment',
 'randomization method',
 'censoring',
 'meta-analysis',
 'non-randomised',
 'β',
 'electronic medical records',
 'eligibility',
 'cross-sectional survey',
 'PRISMA',
 'prevalence',
 'inclusion criteria',
 'control arm',
 'protocol',
 'pooled risk ratio',
 'non-response bias',
 'baseline',
 'retrospective chart review',
 'survival analysis',
 'logistic regression',
 'blind',
 'exposure status',
 'randomized',
 'associated with',
 'lognormal',
 'systematic review',
 'RCT',
 'randomised',
 'survey instrument',
 'interrater reliability',
 'randomisation',
 'pooled RR',
 'hazard ratio',
 'AOR',
 'potential confounders',
 'treatment effect',
 'randomized clinical trial',
 'data collection instrument',
 'pooled AOR',
 'association',
 'power',
 "cohen's kappa",
 'pseudo-randomized',
 'treatment arm',
 'search string',
 'quasi-randomized',
 'cohort',
 'risk factors',
 'difference between means',
 'registry',
 'inter-rater reliability',
 'Odds Ratio',
 'placebo',
 'databases searched',
 'risk factor analysis',
 'difference in means',
 'random sample',
 'etiology',
 'i2']

In [None]:
# Add every generic keyword that is not yet a synonym of some study design as its own
# tag. The set of known synonyms is built once and updated as tags are added, instead of
# re-flattening the whole dictionary for every keyword.
known_synonyms = {synonym for synonyms in study_designs.values() for synonym in synonyms}
for keyword in generic_keywords:
    if keyword not in known_synonyms:
        study_designs[keyword] = [keyword]
        known_synonyms.add(keyword)

In [None]:
len([x for v in study_designs.values() for x in v])

In [None]:
# def tag_study_design(study_designs):
#     df['study_design'] = [set() for _ in range(len(df))]
#     for tag in study_designs.keys():
#         for synonym in study_designs[tag]:
#             df[df.abstract.str.contains(synonym, case=False, na=False)].study_design.apply(lambda x: x.add(tag))

In [None]:
def tag_study_design(study_designs, frame=None):
    """Tag every paper with the study-design keywords found in its text.

    For each tag in *study_designs*, each of its synonyms is searched as a
    case-insensitive literal phrase in three places: abstract or title, methods
    text, and results text. Missing texts count as "no match".

    Four list-valued columns are written to the frame (tags sorted alphabetically):
    ``study_abstract``, ``study_methods``, ``study_results`` and ``study_design``,
    the latter being the union of the other three.

    Args:
        study_designs: mapping of tag name to a list of synonym phrases.
        frame: DataFrame with ``abstract``, ``title``, ``methods`` and ``results``
            columns, modified in place. Defaults to the notebook-global ``df``.

    Returns:
        None; the frame is modified in place.
    """
    target = df if frame is None else frame
    n_rows = len(target)

    def _contains(column, phrase):
        # regex=False: synonyms are plain phrases, so characters such as '+' or '('
        # in a keyword can never be misread as regular-expression syntax.
        hits = target[column].str.contains(phrase, case=False, na=False, regex=False)
        return hits.to_numpy(dtype=bool)

    # One set of tags per row and per text source, addressed by row position.
    found = {
        name: [set() for _ in range(n_rows)]
        for name in ('study_abstract', 'study_methods', 'study_results')
    }

    for tag, synonyms in study_designs.items():
        for synonym in synonyms:
            hits_by_source = {
                'study_abstract': _contains('abstract', synonym) | _contains('title', synonym),
                'study_methods': _contains('methods', synonym),
                'study_results': _contains('results', synonym),
            }
            for name, hits in hits_by_source.items():
                for position in hits.nonzero()[0]:
                    found[name][position].add(tag)

    def _as_column(values):
        # An explicit object Series keeps each list as one cell value.
        return pd.Series(values, index=target.index, dtype=object)

    # Each source column is filled from its own tag sets.
    for name, tag_sets in found.items():
        target[name] = _as_column([sorted(tags) for tags in tag_sets])

    target['study_design'] = _as_column([
        sorted(in_abstract | in_methods | in_results)
        for in_abstract, in_methods, in_results in zip(
            found['study_abstract'], found['study_methods'], found['study_results'])
    ])

In [None]:
tag_study_design(study_designs)

In [None]:
df[df.study_design.str.len() != 0].tail(20).study_design

In [None]:
len(df.study_abstract[df.study_abstract.str.len() != 0])

In [None]:
len(df.study_methods[df.study_methods.str.len() != 0])

In [None]:
len(df.study_results[df.study_results.str.len() != 0])

In [None]:
len(df.study_design[df.study_design.str.len() != 0])

In [None]:
len(df.study_design[(df.study_design.str.len() != 0) & df.is_covid19])

In [None]:
df.drop(columns=['cord_uid', 'pmcid', 'pubmed_id', 'full_text_file', 'license', 'text_language',
                 'study_abstract', 'study_methods', 'study_results'], inplace=True)

# Export as .csv

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.to_csv('cord19_df.csv', index=False)