In [1]:
import os
import pandas as pd
import pickle
import json
import glob
from tqdm import tqdm
from langdetect import detect
from spacy.lang.en.stop_words import STOP_WORDS
import string

In [2]:
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.width', 120)

In [3]:
currect_path = os.getcwd()
folder_path = '\\archive_2'
path_data = currect_path+'\\data'
path_fig = currect_path+'\\figure'
path_model = currect_path+'\\model'
df_meta = pd.read_csv(path_data+'\\metadata.csv', dtype={'pubmed_id': str, 'mag_id': str, 'doi': str, 'pmcid': str,
                                                         'who_covidence_id': str, 'arxiv_id': str,
                                                         'pmc_json_files': str}, low_memory=False)

In [4]:
df_meta.drop(['mag_id', 'arxiv_id'], axis=1, inplace=True)

In [6]:
pdf_json = glob.glob(path_data+'\\document_parses\\pdf_json\\*.json', recursive=True)
pmc_json = glob.glob(path_data+'\\document_parses\\pmc_json\\*.json', recursive=True)

In [7]:
def read_pdf_json(input):
    out = pd.DataFrame()
    with open(input) as f:
        data = json.load(f)

        # ADD FULL ABSTRACT
        abstract = []
        for temp in data['abstract']:
            abstract.append(temp['text'])
        abstract = ' '.join(abstract)

        # ADD BODY TEXT
        body_text = []
        for temp in data['body_text']:
            body_text.append(temp['text'])
        body_text = ' '.join(body_text)

        # ADD LAST NAME & INSTITUTION & COUNTRY
        last_name = []
        institution = []
        country = []
        for temp in data['metadata']['authors']:
            try:
                last_name.append(temp['last'])
                institution.append(temp['affiliation']['institution'])
                country.append(temp['affiliation']['location']['country'])
            except Exception:
                pass

        out = out.append({'paper_id': data['paper_id'],
                          'title': data['metadata']['title'],
                          'authors': last_name,
                          'institution': set(institution),
                          'country': set(country),
                          'abstract': abstract,
                          'body_text': body_text}, ignore_index=True)

    return out

In [8]:
df_pdf = pd.DataFrame(columns=['paper_id','title','authors','institution','country','abstract','body_text'])
for i in tqdm(pdf_json):
    try:
        df_pdf = df_pdf.append(read_pdf_json(i), ignore_index=True)
    except Exception:
        pass

100%|████████████████████████████████████████████████████████████████████████| 256803/256803 [2:26:49<00:00, 29.15it/s]


In [9]:
pickle.dump(df_pdf, open(path_data+'\\pdf_json.sav', 'wb'))

In [10]:
def read_pmc_json(input):
    out = pd.DataFrame()
    with open(input) as f:
        data = json.load(f)

        # ADD BODY TEXT
        body_text = []
        for temp in data['body_text']:
            body_text.append(temp['text'])
        body_text = ' '.join(body_text)

        # ADD LAST NAME & INSTITUTION & COUNTRY
        last_name = []
        for temp in data['metadata']['authors']:
            last_name.append(temp['last'])

        out = out.append({'paper_id': data['paper_id'],
                          'title': data['metadata']['title'],
                          'authors': last_name,
                          'body_text': body_text}, ignore_index=True)

    return out

In [11]:
df_pmc = pd.DataFrame(columns=['paper_id','title','authors','body_text'])
for i in tqdm(pmc_json):
    try:
        df_pmc = df_pmc.append(read_pmc_json(i), ignore_index=True)
    except Exception:
        pass

100%|████████████████████████████████████████████████████████████████████████| 197394/197394 [1:12:31<00:00, 45.36it/s]


In [14]:
pickle.dump(df_pmc, open(path_data+'\\pmc_json.sav', 'wb'))

In [4]:
# LOAD META-DATA
df_meta = pd.read_csv(path_data+'\\metadata.csv', dtype={'pubmed_id': str, 'mag_id': str, 'doi': str, 'pmcid': str,
                                                         'who_covidence_id': str, 'arxiv_id': str, 'pmc_json_files': str})

# DROP 2 COLUMNS THAT HAVE 98.7% AND 100% MISSING VALUES
df_meta.drop(['mag_id','arxiv_id','url','pdf_json_files','pmc_json_files','license'], axis=1, inplace=True)

# LOAD PDF_JSON AND PMC_JSON
df_pdf = pickle.load(open(path_data+'\\pdf_json.sav', 'rb'))
df_pmc = pickle.load(open(path_data+'\\pmc_json.sav', 'rb'))

df = pd.merge(df_meta, df_pdf.drop(['title','authors','abstract'], axis=1), left_on='sha', right_on='paper_id', how='left')
df.drop('paper_id', axis=1, inplace=True)
df = pd.merge(df, df_pmc.drop(['title','authors'], axis=1), left_on='pmcid', right_on='paper_id', how='left')
df.drop(['cord_uid','pubmed_id','s2_id','who_covidence_id','paper_id'], axis=1, inplace=True)

# ONLY SELECT DATA WITH NOT-NULL BODY TEXT
df = df[~df.body_text_x.isnull() | ~df.body_text_y.isnull()]

# REMOVE NON-ENGLISH ARTICLE
def language_detect(data):
    languages = []
    for i in tqdm(df.title):
        try:
            languages.append(detect(i))
        except:
            languages.append('unknown')
    return languages

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# ONLY SELECT ENGLISH ARTICLES
df['languages'] = language_detect(df.title)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 251103/251103 [19:49<00:00, 211.15it/s]


In [6]:
df['languages'] = language_detect(df.body_text_x)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 240314/240314 [17:59<00:00, 222.59it/s]


In [7]:
df['languages'] = language_detect(df.body_text_y)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 239730/239730 [17:55<00:00, 222.99it/s]


In [8]:
df.reset_index(drop=True, inplace=True)

In [9]:
df.drop('languages', axis=1, inplace=True)

# LOWER CASE BODY_TEXT X & Y, REMOVE STOP WORDS
for i in ['title','abstract','body_text_x','body_text_y']:
    df[i] = df[i].str.strip().str.lower()

def sentence2list(lst):
    return (lst[0].split())

punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [10]:
from sklearn.metrics import jaccard_score

def jaccard_similarity(list1, list2):
    list1 = sentence2list(list1)
    list2 = sentence2list(list2)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

# index=4
# jaccard_similarity(df.body_text_x[index], df.body_text_y[index])

for i in tqdm(range(len(df))):
    try:
        df.loc[i, 'jaccard_similarity'] = jaccard_similarity(df.body_text_x[i], df.body_text_y[i])
    except:
        df.loc[i, 'jaccard_similarity']


df['body_text'] = df.apply(lambda row: str(row['body_text_x'])+'; '+str(row['body_text_y']) if row['jaccard_similarity']<0.8 else row['body_text_y'], axis=1)
df.drop(['body_text_x','body_text_y','jaccard_similarity'], axis=1, inplace=True)

pickle.dump(df, open(path_data+'\\cleaned_data.sav', 'wb'))

100%|████████████████████████████████████████████████████████████████████████| 239393/239393 [03:49<00:00, 1040.93it/s]


In [11]:
df = pickle.load(open(path_data+'\\cleaned_data.sav', 'rb'))

# CLEANING TITLE, ABSTRACT & BODY_TEXT
c_var = ['title','abstract','body_text']
for i in tqdm(c_var):
    df[i] = df[i].str.strip().str.lower().replace(r'https?:\S+\sdoi','', regex=True).replace("[\[].*?[\)\]]", "", regex=True).replace("  ","", regex=True)

pickle.dump(df, open(path_data+'\\cleaned_data.sav', 'wb'))

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [06:47<00:00, 135.82s/it]


In [12]:
df = pickle.load(open(path_data+'\\cleaned_data.sav', 'rb'))
df = df.dropna(axis=0, subset=['abstract','body_text']).reset_index(drop=True)

pickle.dump(df, open(path_data+'\\cleaned_data_no_null.sav', 'wb'))