In [1]:
import os
import pandas as pd
import pickle
import json
import glob
from tqdm import tqdm
from langdetect import detect

In [3]:
# Read data folder
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.width', 120)

In [4]:
currect_path = os.getcwd()
folder_path = '\\archive_2'
path_data = currect_path+'\\data'
path_fig = currect_path+'\\figure'
path_model = currect_path+'\\model'

In [6]:
df_meta = pd.read_csv(path_data+'\\metadata.csv', dtype={'pubmed_id': str, 'mag_id': str, 'doi': str, 'pmcid': str,
                                                         'who_covidence_id': str, 'arxiv_id': str,
                                                         'pmc_json_files': str}, low_memory=False)

In [6]:
df_meta.drop(['mag_id', 'arxiv_id'], axis=1, inplace=True)

In [7]:
pdf_json = glob.glob(path_data+'\\document_parses\\pdf_json\\*.json', recursive=True)
pmc_json = glob.glob(path_data+'\\document_parses\\pmc_json\\*.json', recursive=True)

In [8]:
def read_pdf_json(input):
    with open(input) as f:
        data = json.load(f)

        # ADD FULL ABSTRACT
        abstract = []
        for temp in data['abstract']:
            abstract.append(temp['text'])
        abstract = ' '.join(abstract)

        # ADD BODY TEXT
        body_text = []
        for temp in data['body_text']:
            body_text.append(temp['text'])
        body_text = ' '.join(body_text)

        # ADD LAST NAME & INSTITUTION & COUNTRY
        last_name = []
        institution = []
        country = []
        for temp in data['metadata']['authors']:
            try:
                last_name.append(temp['last'])
                institution.append(temp['affiliation']['institution'])
                country.append(temp['affiliation']['location']['country'])
            except Exception:
                pass

        out = {'paper_id': data['paper_id'],
                          'title': data['metadata']['title'],
                          'authors': last_name,
                          'institution': set(institution),
                          'country': set(country),
                          'abstract': abstract,
                          'body_text': body_text}
        
        df = pd.DataFrame([out], columns=out.keys())

    return df

In [9]:
# Read pdf json
df_pdf = pd.DataFrame(columns=['paper_id','title','authors','institution','country','abstract','body_text'])

for i in tqdm(pdf_json):
    try:
        df_pdf = pd.concat([df_pdf, read_pdf_json(i)], axis =0, ignore_index=True)
    except Exception:
        pass

100%|████████████████████████████████████████████████████████████████████████| 256803/256803 [2:26:30<00:00, 29.21it/s]


In [10]:
pickle.dump(df_pdf, open(path_data+'\\pdf_json.sav', 'wb'))

In [11]:
def read_pmc_json(input):
    with open(input) as f:
        data = json.load(f)

        # ADD BODY TEXT
        body_text = []
        for temp in data['body_text']:
            body_text.append(temp['text'])
        body_text = ' '.join(body_text)

        # ADD LAST NAME & INSTITUTION & COUNTRY
        last_name = []
        for temp in data['metadata']['authors']:
            last_name.append(temp['last'])

        out = {'paper_id': data['paper_id'],
                          'title': data['metadata']['title'],
                          'authors': last_name,
                          'body_text': body_text}
        
        df = pd.DataFrame([out], columns=out.keys())

    return df

In [12]:
# Read pmc json
df_pmc = pd.DataFrame(columns=['paper_id','title','authors','body_text'])
for i in tqdm(pmc_json):
    try:
        df_pmc = pd.concat([df_pmc, read_pmc_json(i)], axis =0, ignore_index=True)
    except Exception:
        pass

100%|████████████████████████████████████████████████████████████████████████| 197394/197394 [1:03:26<00:00, 51.85it/s]


In [None]:
pickle.dump(df_pmc, open(path_data+'\\pmc_json.sav', 'wb'))

In [7]:
# LOAD META-DATA
df_meta = pd.read_csv(path_data+'\\metadata.csv', dtype={'pubmed_id': str, 'mag_id': str, 'doi': str, 'pmcid': str,
                                                         'who_covidence_id': str, 'arxiv_id': str, 'pmc_json_files': str})

# DROP 2 COLUMNS THAT HAVE 98.7% AND 100% MISSING VALUES
df_meta.drop(['mag_id','arxiv_id','url','pdf_json_files','pmc_json_files','license'], axis=1, inplace=True)

# LOAD PDF_JSON AND PMC_JSON
df_pdf = pickle.load(open(path_data+'\\pdf_json.sav', 'rb'))
df_pmc = pickle.load(open(path_data+'\\pmc_json.sav', 'rb'))

df = pd.merge(df_meta, df_pdf.drop(['title','authors','abstract'], axis=1), left_on='sha', right_on='paper_id', how='left')
df.drop('paper_id', axis=1, inplace=True)
df = pd.merge(df, df_pmc.drop(['title','authors'], axis=1), left_on='pmcid', right_on='paper_id', how='left')
df.drop(['cord_uid','pubmed_id','s2_id','who_covidence_id','paper_id'], axis=1, inplace=True)

# ONLY SELECT DATA WITH NOT-NULL BODY TEXT
df = df[~df.body_text_x.isnull() | ~df.body_text_y.isnull()]

# REMOVE NON-ENGLISH ARTICLE
def language_detect(data):
    languages = []
    for i in tqdm(df.title):
        try:
            languages.append(detect(i))
        except:
            languages.append('unknown')
    return languages

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# ONLY SELECT ENGLISH ARTICLES
df['languages'] = language_detect(df.title)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 251123/251123 [19:31<00:00, 214.31it/s]


In [9]:
df['languages'] = language_detect(df.body_text_x)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 240348/240348 [19:09<00:00, 209.05it/s]


In [10]:
df['languages'] = language_detect(df.body_text_y)
df = df[df['languages']=='en']

100%|█████████████████████████████████████████████████████████████████████████| 239777/239777 [19:21<00:00, 206.51it/s]


In [11]:
df.reset_index(drop=True, inplace=True)
df.drop('languages', axis=1, inplace=True)

In [12]:
# LOWER CASE
# for i in ['title','abstract','body_text_x','body_text_y']:
#     df[i] = df[i].str.strip().str.lower()

In [13]:
# REMOVE STOP WORDS
# from nltk.corpus import stopwords
# words = stopwords.words('english')
# filtered_words = [word for word in word_list if word not in stopwords.words('english')]

In [14]:
# MERGE
from sklearn.metrics import jaccard_score

def sentence2list(lst):
    return (lst[0].split())

def jaccard_similarity(list1, list2):
    list1 = sentence2list(list1)
    list2 = sentence2list(list2)
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [15]:
for i in tqdm(range(len(df))):
    try:
        df.loc[i, 'jaccard_similarity'] = jaccard_similarity(df.body_text_x[i], df.body_text_y[i])
    except:
        df.loc[i, 'jaccard_similarity']


df['body_text'] = df.apply(lambda row: str(row['body_text_x'])+'; '+str(row['body_text_y']) if row['jaccard_similarity']<0.8 else row['body_text_y'], axis=1)
df.drop(['body_text_x','body_text_y','jaccard_similarity'], axis=1, inplace=True)

100%|█████████████████████████████████████████████████████████████████████████| 239414/239414 [04:17<00:00, 931.40it/s]


In [16]:
pickle.dump(df, open(path_data+'\\data.sav', 'wb'))

In [5]:
df = pickle.load(open(path_data+'\\data.sav', 'rb'))

In [6]:
# CLEANING TITLE, ABSTRACT & BODY_TEXT
c_var = ['title','abstract','body_text']
for i in tqdm(c_var):
    # Remove website url, [], doble space
    df[i] = df[i].str.strip().str.replace(r'^((https?|ftp|smtp):\/\/)?(www.)?[a-z0-9]+\.[a-z]+(\/[a-zA-Z0-9#]+\/?)*$','', regex=True).replace("[\[].*?[\)\]]", "", regex=True).replace("  ","", regex=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:40<00:00, 33.53s/it]


In [7]:
# Remove NaN
df = df.dropna(axis=0, subset=['abstract','body_text']).reset_index(drop=True)
pickle.dump(df, open(path_data+'\\cleaned_data.sav', 'wb'))