In [5]:
import os
import sys
sys.path.insert(0, '/home/ekaterina/Projects/COVID19/covid19_venv/lib/python3.6/site-packages')

import json
import glob
import time
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from extract_features_refactored import FeatureExtractor
from ranking import Ranking
from get_result import filtered_query, remove_punct, get_result

# New version

__instead of `get_bio_bert_embedding`__

In [6]:
# Task 1: What is known about transmission, incubation, and environmental stability?
# Each element is one sub-question of the task. Every element must end with a
# comma: adjacent string literals without one are silently concatenated by
# Python, which previously merged two sub-questions into a single entry.
task1 = [
    "Range of incubation periods for the disease in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery.",
    "Prevalence of asymptomatic shedding and transmission (e.g., particularly children).",
    "Seasonality of transmission.",
    "Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding).",
    "Persistence and stability on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood).",
    "Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic).",
    "Natural history of the virus and shedding of it from an infected person",
    "Implementation of diagnostics and products to improve clinical processes",
    "Disease models, including animal models for infection, disease and transmission",
    "Tools and studies to monitor phenotypic change and potential adaptation of the virus",
    "Immune response and immunity",
    "Effectiveness of movement control strategies to prevent secondary transmission in health care and community settings",
    "Effectiveness of personal protective equipment (PPE) and its usefulness to reduce risk of transmission in health care and community settings",
    "Role of the environment in transmission",
]

In [7]:
# Instantiate the feature extractor from the local BioBERT v1.1 (PubMed) files:
# model config, checkpoint and vocabulary all live under ./models/biobert_v1.1_pubmed/.
fe = FeatureExtractor(
    bert_config_file="./models/biobert_v1.1_pubmed/bert_config.json",
    init_checkpoint="./models/biobert_v1.1_pubmed/model.ckpt-1000000",
    vocab_file="./models/biobert_v1.1_pubmed/vocab.txt",
    # Batch size for predictions.
    batch_size=32,
    # Sequences longer than this will be truncated, and sequences shorter
    # than this will be padded.
    max_seq_length=128,
    verbose=0,
)





In [8]:
# Read the preprocessed corpus; the file is a semicolon-separated CSV.
processed_filepath = "Data/processed_data_v5.csv"
df_processed = pd.read_csv(processed_filepath, sep=";")

# NOTE(review): the meaning of the positional 'texts' argument is defined in
# ranking.py, which is not visible here — confirm before changing it.
ranking = Ranking("texts")

In [9]:
# Hand-written sub-queries about the incubation period and contagiousness.
# The strings (including their leading space) are kept exactly as authored.
query_arr = [
    ' Range of incubation period general population',
    ' Range of incubation period by age group',
    ' Range of incubation period by health status',
    ' How long are individuals contagious even after recover recovery',
]

In [11]:
# Two-stage retrieval demo: BM25 picks the top papers for a query, then the
# feature extractor lists the passages of each paper closest to the query.
doc_k = 5  # number of top-ranked papers to inspect per query

# The candidate pool does not depend on the query, so it is selected once
# instead of on every loop iteration. The explicit `== True` comparisons are
# kept as in the original so the mask behaves the same whatever the column dtype.
selected_paper_id = df_processed[
    (df_processed.after_dec == True) & (df_processed.tag_disease_covid == True)
].paper_id

# Only the first query is run here; widen the slice to process more of them.
for query_text in query_arr[0:1]:
    query = filtered_query(query_text)
    print("Query text is:", query, "\n")

    scores = ranking.get_bm25_scores(query, selected_paper_id)
    print('scores computed')

    # Walk the papers from highest to lowest score and stop as soon as doc_k
    # of them pass the tag check. This yields the same list as filtering every
    # scored paper and slicing afterwards, without scanning the DataFrame for
    # papers that can never make the cut.
    ranking_nearest = []
    for candidate_id, _bm25_score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
        if len(ranking_nearest) >= doc_k:
            break
        tag_values = df_processed.loc[
            df_processed.paper_id == candidate_id, ["tag_disease_covid", "after_dec"]
        ].values
        if np.all(tag_values):
            ranking_nearest.append(candidate_id)
    print('Ranking computed')

    # NOTE(review): the embedding is built from the raw query_text, not the
    # filtered `query` used for BM25 — presumably intentional; confirm.
    query_encoding = fe.prepare_embedding_csv(query_text, None, False).values
    print('Query encoded')

    for paper_id in ranking_nearest:
        print(paper_id)
        # Text of the first row matching this paper id.
        paper_text = df_processed.loc[df_processed.paper_id == paper_id, "text"].values[0]
        similar_paragr = fe.get_closest_sentence(query_encoding, paper_id, paper_text, topk=10)

        for (txt, score) in similar_paragr:
            print(score, "\t", txt, "\n")

Query text is: Range incubation period general population 

scores computed
Ranking computed
Query encoded
djq0lvr2
[0.9264219] 	 The incubation period is essential in making intervention strategies to control infectious diseases. 

[0.91281375] 	 The incubation period is essential in the control of infectious diseases. To halt a pandemic, a suitable quarantine period should be set by investigating incubation periods. Thus, understanding the characteristics of the incubation period is crucial to design public health efforts. In this cohort, the full range of incubation periods of the Covid-19 cases ranged from 0 to 33 days among 2015 cases. Both male and female adults had their median incubation periods of 7-day, which is 1.8-day longer than those in other reports 3,4 (see discussion). However, a similar result was released in the recent news from the Chinese Medical Association that the median incubation periods were 5-7 days 5 . The median incubation period (9-day) of children were s

[0.89735939] 	 First, to estimate the probability that an infected individual would be detected or missed we considered a range of plausible values for the mean incubation time, and the fraction of subclinical cases. We focus on the incubation period and subclinical fraction of cases because screening outcomes are particularly sensitive to their values. All other parameters were fixed to the best available estimates listed in Table 1. 

[0.89316508] 	 Even within the narrow range tested, screening outcomes were sensitive to the incubation period mean. For longer incubation periods, we found that larger proportions of departing travellers would not yet be exhibiting symptoms – either at departure or arrival – which in turn reduced the probability that screening would detect these cases, especially since we assume few infected travellers will realize they have been exposed to COVID-19. 

[0.89020059] 	 First, the model estimated the probability that any single infected individual would b

In [12]:
# Run the packaged pipeline for the first query; the call is the cell's last
# expression so the returned value is rendered as the cell output.
get_result(
    df_processed,
    query=query_arr[0],
    ranking=ranking,
    inforet_tuple=("BERT", fe),
    doc_k=5,
    sent_k=3,
)

Unnamed: 0,paper_id,title,abstract,text,date,authors,url,after_dec,tag_disease_covid,tag_disease_sars,...,tag_design_cross_sectional_case_control,tag_design_matched_case_control,tag_design_prevalence_survey,tag_design_time_series_analysis,tag_design_systematic_review,tag_design_randomized_control,tag_design_pseudo_randomized_control,tag_design_case_study,tag_design_simulation,sentences
0,djq0lvr2,Is a 14-day quarantine period optimal for effe...,Background The outbreak of a new coronavirus (...,The outbreak of a new coronavirus (SARS-CoV-2)...,2020-03-18,Xue Jiang; Yawei Niu; Xiong Li; Lin Li; Wenxia...,https://doi.org/10.1101/2020.03.15.20036533,True,True,True,...,False,False,False,False,False,False,False,True,False,[The incubation period is essential in making ...
1,j0nm444m,Effective containment explains sub-exponential...,The recent outbreak of COVID-19 in Mainland Ch...,The current outbreak of the new coronavirus in...,2020-02-20,Benjamin F Maier; Dirk Brockmann,https://doi.org/10.1101/2020.02.18.20024414,True,True,False,...,False,False,False,False,False,False,False,False,True,[We introduce public containment leverage P = ...
2,x22rc60j,Estimated effectiveness of traveller screening...,Traveller screening is being used to limit fur...,Our previous analysis considered the contribut...,2020-01-30,Katelyn Gostic; Ana C. R. Gomez; Riley O. Mumm...,https://doi.org/10.1101/2020.01.28.20019224,True,True,False,...,False,False,False,False,False,False,False,True,False,[Overall screening effectiveness in a populati...
3,oee19duz,Estimated effectiveness of symptom and risk sc...,Traveller screening is being used to limit fur...,"As of February 20, 2020, the 2019 novel corona...",2020-02-24,"Gostic, Katelyn; Gomez, Ana CR; Mummah, Riley ...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,True,True,False,...,False,False,False,False,False,False,False,True,False,"[Even within the narrow range tested, screenin..."
4,8anqfkmo,The Incubation Period of Coronavirus Disease 2...,"BACKGROUND: A novel human coronavirus, severe ...",We searched for news and public health reports...,2020-03-10,"Lauer, Stephen A.; Grantz, Kyra H.; Bi, Qifang...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,True,True,False,...,False,False,False,False,False,False,False,True,False,"[incubation period. In data sets such as ours,..."


SyntaxError: invalid syntax (<ipython-input-3-f77fbf84c36b>, line 1)