In [None]:
from __future__ import unicode_literals, print_function

import numpy as np 
import pandas as pd 
import json
import glob
import itertools
import logging


import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm, tqdm_notebook

import warnings
# Ignore every FutureWarning raised for the rest of the notebook.
# NOTE(review): this also hides deprecation notices from the libraries used
# below, so API changes will not be announced when packages are upgraded.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Switch off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# The summary tables provided with the dataset are used to train and test
# the NER model. The testing data could be replaced with search-engine
# articles for each question in order to produce new summary tables.


path = r'../input/CORD-19-research-challenge/Kaggle/target_tables/1_population/'
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the row order of the concatenated frame identical between runs.
all_files = sorted(glob.glob(path + "/*.csv"))

temp_df = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    temp_df.append(df)

# Single dataframe holding every provided (expert-curated) summary table;
# it is the source of the study titles used for training/testing the model.
df_all_provided_summary_tables = pd.concat(temp_df, axis=0, ignore_index=True)

In [None]:
# Load the metadata file that describes every COVID-19 research paper.
df_metadata = pd.read_csv(
    '../input/CORD-19-research-challenge/metadata.csv', low_memory=False)

# Some rows list several ';'-separated article locations; keep only the first
# one. Missing values are passed through unchanged.
for _location_column in ('pdf_json_files', 'pmc_json_files'):
    df_metadata[_location_column] = df_metadata[_location_column].apply(
        lambda value: value if pd.isnull(value) else value.split(';')[0])

In [None]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    '''
    Convert a Dataturks annotation export (one JSON object per line) into
    spaCy-style training tuples.

    dataturks_JSON_FilePath: string
        path of the exported annotation file

    return: list of (text, {"entities": [(start, end, label), ...]}) tuples,
        one per non-blank line of the file, or None when the file cannot be
        read or parsed (the error is printed and logged)
    '''
    try:
        training_data = []
        # Read explicitly as UTF-8 so parsing does not depend on the
        # platform's default encoding.
        # NOTE(review): assumes the export is UTF-8 encoded - confirm.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (e.g. a trailing newline at the end of the
                # file) is not a record; skip it instead of failing the file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                # `annotation` may be null for a record without labels;
                # such a record is kept with an empty entity list.
                for annotation in data['annotation'] or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # dataturks indices are both inclusive [start, end]
                        # but spacy is not [start, end)
                        entities.append(
                            (point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        # Best effort by design: report the problem and signal failure with
        # None rather than raising.
        print(str(e))
        logging.exception("Unable to process " + str(dataturks_JSON_FilePath)
                          + "\n" + "error = " + str(e))
        return None

In [None]:
def _normalize_section_title(title):
    '''Lower-case a section title and drop every non-alphabetic character.'''
    return ''.join(ch.lower() for ch in title if ch.isalpha())


def get_raw_articles_by_title(provided_titles, metadata,
                articles_base_location='../input/CORD-19-research-challenge/'):
    '''
    Read the full text of every article whose title is in `provided_titles`.

    provided_titles: list-like of strings
        article titles to look up in `metadata.title`
    metadata: dataframe
        must contain the columns 'title', 'doi', 'publish_time', 'journal',
        'url', 'abstract', 'pdf_json_files' and 'pmc_json_files'
    articles_base_location: string
        prefix that is prepended to the json location stored in the metadata

    return: dataframe (`metadata` itself is not modified)
        'Study', 'doi', 'Date', 'journal', 'url', 'abstract',
        'methods', 'results', 'body_text'
        'methods' / 'results' hold the concatenated paragraphs whose section
        title contains a methods / results keyword; 'body_text' holds every
        paragraph. Paragraphs are joined without a separator.
    '''
    methods = ['methods','method','statistical methods','materials',
               'materials and methods','data collection','the study',
               'study design','experimental design','objective',
               'objectives','procedures','data collection and analysis',
               'methodology','material and methods','the model',
               'experimental procedures','main text']
    # Normalise the keywords once instead of once per paragraph.
    method_keys = [_normalize_section_title(m) for m in methods]
    results_synonyms = ['result', 'results']

    # .copy(): columns are added below, so work on an independent frame
    # instead of a slice of the caller's metadata.
    metadata_filtered = metadata.loc[
        metadata.title.isin(provided_titles)].copy()

    # replace empty pdf_json_files column with pmc_json_files column value
    metadata_filtered['pdf_json_files'] = \
        metadata_filtered.pdf_json_files.fillna(metadata_filtered.pmc_json_files)
    # drop the rows that have no article location at all
    metadata_filtered = metadata_filtered.dropna(
        subset=['pdf_json_files']).copy()
    # build the path each article is read from
    metadata_filtered['articles_location'] = articles_base_location \
        + metadata_filtered['pdf_json_files']

    metadata_filtered['body_text'] = ''
    metadata_filtered['methods'] = ''
    metadata_filtered['results'] = ''

    for index, row in metadata_filtered.iterrows():
        # NOTE(review): assumes the article json files are UTF-8 - confirm.
        with open(row['articles_location'], encoding='utf-8') as file:
            content = json.load(file)

        body_parts, method_parts, result_parts = [], [], []
        # Single pass over the paragraphs; a paragraph can belong to both
        # the methods and the results text.
        for entry in content['body_text']:
            paragraph = entry['text']
            body_parts.append(paragraph)
            # remove numbers, spaces and punctuation from the section title
            section_title = _normalize_section_title(entry['section'])
            if any(key in section_title for key in method_keys):
                method_parts.append(paragraph)
            if any(key in section_title for key in results_synonyms):
                result_parts.append(paragraph)

        metadata_filtered.at[index, 'body_text'] = ''.join(body_parts)
        metadata_filtered.at[index, 'methods'] = ''.join(method_parts)
        metadata_filtered.at[index, 'results'] = ''.join(result_parts)

    metadata_filtered = metadata_filtered.rename(
        columns={'title': 'Study', 'publish_time': 'Date'})
    return metadata_filtered[['Study', 'doi', 'Date',
                              'journal', 'url', 'abstract',
                              'methods', 'results', 'body_text']]




def preprocess_articles(raw_articles_dataframe):
    '''
    Build the cleaned text column 'shorten_full_article_text'.

    raw_articles_dataframe: dataframe
        must contain the columns 'Study', 'abstract', 'methods',
        'results' and 'body_text'.
        ideal dataframe is the return of
        get_raw_articles_by_title() function

    return: dataframe
        the SAME dataframe object, modified in place: missing abstracts are
        filled with the first 1500 characters of body_text, and the column
        'shorten_full_article_text' (title, abstract, methods and results
        joined by blank lines, then cleaned) is added.
    '''
    raw_articles_dataframe['abstract'] = \
        raw_articles_dataframe['abstract']\
        .fillna(raw_articles_dataframe.body_text.str[:1500])

    text = raw_articles_dataframe['Study'] \
        + "\n\n" + raw_articles_dataframe['abstract'] \
        + "\n\n" + raw_articles_dataframe['methods'] \
        + "\n\n" + raw_articles_dataframe['results']

    cleaning_patterns = [
        # (...) and [...] together with the text between the brackets and
        # any whitespace directly in front of them
        r"\s*([\(\[]).*?([\)\]])",
        # urls
        r"http\S+|www.\S+",
        # a stand-alone digit that is not preceded by another digit
        # NOTE(review): the class is [1-7], so a lone 0, 8 or 9 is kept -
        # confirm whether that is intended.
        r"(?<!\d)[1-7]\b",
    ]
    for pattern in cleaning_patterns:
        # regex=True is required: from pandas 2.0 on, Series.str.replace
        # treats the pattern as a literal string by default, which would
        # turn every cleaning step into a silent no-op.
        text = text.str.replace(pattern, "", regex=True).str.strip()

    raw_articles_dataframe['shorten_full_article_text'] = text
    return raw_articles_dataframe


def generate_articles_for_annotation(processed_articles_dataframe,
                                     output_dir='.'):
    '''
    Write one .txt file per article so the texts can be annotated.

    processed_articles_dataframe: dataframe
        must contain the column 'shorten_full_article_text'
        (see preprocess_articles()); rows where it is missing are skipped
    output_dir: string or Path
        directory the files are written to (created when it does not exist);
        defaults to the working directory

    Files are named 0.txt, 1.txt, ... in row order of the non-missing texts
    and are written as UTF-8.

    note: hold some data from processed_articles_dataframe before providing
    to this function and you can use it in later for testing the model

    return: string
        a fixed confirmation message
    '''
    texts = processed_articles_dataframe['shorten_full_article_text'].dropna()

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for i, text in enumerate(texts):
        # Explicit encoding: article text can contain non-ASCII characters,
        # which the platform default encoding may be unable to write.
        with open(target_dir / '{}.txt'.format(i), 'w', encoding='utf-8') as f:
            f.write(str(text))

    return "TEXT SAVE TO WORKING DIRECTORY"

def training_ner_model(training_data, model=None, output_dir='./', n_iter=500):
    """Load the model, set up the pipeline and train the entity recognizer.

    training_data: list
        (text, {"entities": [(start, end, label), ...]}) tuples, e.g. the
        return of convert_dataturks_to_spacy(); the list itself is not
        reordered (a shallow copy is shuffled)
    model: string or None
        name/path passed to spacy.load(); None starts from a blank 'en' model
    output_dir: string
        currently unused - the trained model is returned, not saved
    n_iter: int
        number of passes over the training data

    return: the trained spaCy Language object

    raises ValueError when training_data is None.

    NOTE(review): this uses the create_pipe / update(texts, annotations)
    style of the spaCy API - confirm it matches the installed spaCy version.
    """
    # convert_dataturks_to_spacy() signals failure with None; report that
    # clearly instead of failing on the copy below.
    if training_data is None:
        raise ValueError(
            "training_data is None - the annotation file could not be converted")
    TRAIN_DATA = list(training_data)
    print('Training started...')
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # register every label that occurs in the training data
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in tqdm_notebook(range(n_iter)):
            random.shuffle(TRAIN_DATA)
            # losses are accumulated per iteration but not reported
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    print('Training completed.')
    return nlp


# function for extracting data field value
def _ner_apply(text, ner_model):
    # pass our text to spacy
    # it will return us doc (spacy doc)
    doc = ner_model(text)
    # return list of tuples look like 
    # this [('four-year', 'EDUCATION_YEARS'), ('college or university', 'SCHOOL_TYPE')]
    return [(ent.text, ent.label_) for ent in doc.ents]

def ner_extraction(model, processed_articles):
    """
    Run the NER model on every article and spread the entities into columns.

    model: callable NER model, passed to _ner_apply()
    processed_articles: dataframe
        must contain the column 'shorten_full_article_text';
        it is not modified

    return: dataframe
        copy of `processed_articles` with a 'temp_entity' column (list of
        (text, label) tuples per row) and one column per entity label found.
        When a label occurs several times in one article, the last
        occurrence is kept; rows without that label hold NaN.
    """
    temp = processed_articles.copy()
    # apply the function and store the result in a new column
    temp['temp_entity'] = temp['shorten_full_article_text'].apply(
        lambda x: _ner_apply(x, ner_model=model))

    # label -> {row position -> value}; later entities overwrite earlier ones
    values_by_label = {}
    for position, entities in enumerate(temp['temp_entity']):
        for value, label in entities:
            values_by_label.setdefault(label, {})[position] = value

    # Columns are created in sorted label order.
    for label in sorted(values_by_label):
        if label not in temp.columns:
            # object dtype so string values can be stored next to NaN
            temp[label] = np.full(len(temp), np.nan, dtype=object)
        column_position = temp.columns.get_loc(label)
        for position, value in values_by_label[label].items():
            # `position` comes from enumerate(), i.e. it is a row POSITION,
            # so it must be written with .iloc. Using .loc would treat it as
            # an index label and hit the wrong row (or append a new one)
            # whenever the frame does not have a 0..n-1 index.
            temp.iloc[position, column_position] = value
    return temp



# get_raw_articles_by_title(df_all_provided_summary_tables.Study.tolist(), df_metadata)

In [None]:
annotation_for_training = convert_dataturks_to_spacy('../input/covid19-annotation/Cord19_1_population_annotation.json')
# convert_dataturks_to_spacy() returns None (after printing/logging the
# error) when the file cannot be read or parsed; stop here with a clear
# message instead of failing inside the training function.
if annotation_for_training is None:
    raise RuntimeError(
        "Annotation file could not be converted - see the logged error above")
ner_model = training_ner_model(annotation_for_training)

In [None]:
# Save the model returned by training_ner_model() to ./task_1_ner_model
ner_model.to_disk('./task_1_ner_model')
