# Pattern Sanity Check on CORE Abstracts

In [39]:
%load_ext autoreload
%autoreload 2

## Preprocessing

In [40]:
import json
from octoflow.core import lemmatize
import pandas as pd

with open("downloads/sample_4k.json") as f:
    data = json.loads(f.read())
    
d = dict(data)
abst = [entry for entry in d["data"] if entry["abstract"]]
len(abst), abst[0]

(2956,
 {'doi': '10.1029/2006GL027886',
  'coreId': '71102',
  'oai': 'oai:eprints.lancs.ac.uk:6706',
  'identifiers': ['oai:eprints.lancs.ac.uk:6706', '10.1029/2006GL027886'],
  'title': 'Transport of plasma sheet material to the inner magnetosphere',
  'authors': ['Denton, M. H.',
   'Thomsen, M. F.',
   'Borovsky, J. E.',
   'Lavraud, B.',
   'Henderson, M. G.',
   'Skoug, R. M.',
   'Funsten, H. O.',
   'Jahn, J.M.',
   'Pollock, C. J.',
   'Weygand, J. M.'],
  'enrichments': {'references': [], 'documentType': {'type': None}},
  'contributors': [],
  'datePublished': '2007',
  'abstract': 'The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit

In [41]:
df = pd.DataFrame(abst)[["title", "doi", "abstract", "journals"]] #fulltext

## CORE classifier Test

In [49]:
import spacy
nlp = spacy.load("en_core_web_md")

In [42]:
import pandas as pd 
import fastai

fastai.__version__, len(df)

('2.5.2', 2956)

In [44]:
learn = load_learner("oct15_40k.pkl")

In [103]:
dfx = pd.DataFrame([], columns=list(df.columns) + ["text", "pred"])

In [104]:
dfx

Unnamed: 0,title,doi,abstract,journals,text,pred


In [None]:
for index, row in df.iterrows():
    for sent in nlp(row.abstract).sents:
        out = learn.predict(sent.text)
        predict= int(out[0])
        conf0 = round(out[2][0].item(),2)
        conf1 = round(out[2][1].item(),2)
        dfx = dfx.append(
             {
            'title':row.title,
            'doi':row.doi,
            'abstract':row.abstract,
            "journals":row.journals,
                 
                 "text": sent.text,
            "predict": predict,
            "conf0": conf0,
            "conf1": conf1
             }, 
        
        ignore_index=True)
 #   print(dfx.append(row))
 

#[learn.predict(s.text) for s in nlp(df["abstract"][0]).sents], ["text"], columns=["pred"])
dfx

█

In [None]:
dfx.to_csv("core_predicts.csv")

In [45]:
learn.predict("Anaphylaxis is a life-threatening emergency of which reliable epidemiological data are lacking")

█

('0', tensor(0), tensor([0.8883, 0.1117]))

In [None]:
df_test["predicted"] = [int(p[0]) for p in predicts]
df_test["confidence_0"] =  [round(p[2][0].item(),2) for p in predicts]
df_test["confidence_1"] =  [round(p[2][1].item(),2) for p in predicts]

## Check Matching Sentence Quality

In [None]:
df["lemma_text"] = df["abstract"].map(lemmatize)

In [None]:
df.to_csv("downloads/core_3k_lemma_abst_only.csv")

In [4]:
from octoflow.core import lemmatize, find_sentence_in_abstract, split_into_sentences, sentence_has_phrase
import pandas as pd

df = pd.read_csv("downloads/core_3k_lemma_abst_only.csv")
with open("ngrams_pos_only_lemmatized.txt") as f:
    ngrams = f.read().split("\n")
    
# trigrams = [gram for gram in ngrams if len(gram.split(" "))> 2]
# bigrams = [gram for gram in ngrams if len(gram.split(" ")) == 2]

In [5]:
hits = df[df["lemma_text"].str.contains("|".join(ngrams))]

In [8]:
hits["lemma_text"]

11      this article discuss some of the merit and dem...
38      aim : a systematic literature review of the ex...
44      as multimedia computing become the order of th...
57      the number of broadband user have beengrowe ra...
61      electrolytic hydrogen production be an efficie...
85      this paper be publish as Patient - center e - ...
89      accurate assessment of animal emotion ( affect...
94      in this paper we review and analyse scenario p...
99      the debate over the nature of egalitarianism h...
105     significant investment be be make in the appli...
113     about the book : this collection of paper aim ...
118     in Spain , a grow body of literature have draw...
139     this paper investigate how specific notion of ...
151     a simple sensor method be develop for aflatoxi...
161     < p > image segmentation be an indispensable p...
169     concern over the impact of debt on participati...
173     my research project revolve around the problem...
185     this p

In [7]:
hits.to_csv("downloads/hits.csv")

In [9]:
from pydash import find_index

def find_sentence_in_abstract(paragraph, bias=0):
    """checks paragraph for key sentences. Returns first matching hit"""
    p_sents = split_into_sentences(paragraph)
    idx = find_index(p_sents, sentence_has_phrase)
    if idx < 0:
        return ""
    
    i = idx + bias #bias- for sentence before or after
    if i < 0:
        return ""
    return get(p_sents, i)

In [111]:
def get_first_with_pattern(paragraph, patterns):
    
    for s in split_into_sentences(paragraph):
        for pattern in patterns:
            if pattern in s:
                return (s, pattern)
#         if any(substring in s for substring in patterns):
#             return s
    return None, None
#df[df["lemma_text"].str.contains("|".join(ngrams))]

In [131]:
rows = []
for abst in df["lemma_text"]:
    s, pattern = get_first_with_pattern(abst, ngrams)
    if s is not None:
        rows += [[abst, s, pattern]]
        
        

In [134]:
pd.DataFrame(data=rows, columns=["abstract", "sent", "pattern"]).to_csv("downloads/244_pattern_check.csv")
#??pd.DataFrame

In [114]:
mask1 = [[abst, a] for abst in df["lemma_text"] for a,b in get_first_with_pattern(abst, ngrams) if a is not None and b is not None]

TypeError: 'NoneType' object is not iterable

In [12]:
from re import search
mask2 = [search("|".join(ngrams), abst)  is not None for abst in df["lemma_text"] ]

In [43]:
#~df[mask1]["title"].isin(df[mask2]["title"]
len([d for d in ~df[mask2]["title"].isin(df[mask1]["title"]) if d])
diff_mask = ~df[mask2]["title"].isin(df[mask1]["title"])
#diff_mask.index

Int64Index([  11,   38,   44,   57,   61,   85,   89,   94,   99,  105,
            ...
            2838, 2845, 2857, 2859, 2883, 2887, 2914, 2927, 2930, 2942],
           dtype='int64', length=309)

In [58]:
df2 = df[mask2]
df1 = df[mask1]
text1 = df1["lemma_text"]

In [65]:
non_caught= df2.query("lemma_text not in @text1")



In [71]:
firsts = df1["lemma_text"].map(lambda t: get_first_with_pattern(t, ngrams))
#diff_mask.index
firsts

with open("downloads/non_caught.txt", "w") as f:
    f.write("\n\n".join(firsts))

TypeError: bad operand type for unary ~: 'Int64Index'

In [56]:
not_caught = df.loc[diff_mask.index]
not_caught

Unnamed: 0.1,Unnamed: 0,title,doi,abstract,journals,lemma_text
11,11,The single-payer option: a reconsideration,10.1215/03616878-2009-013,This article discusses some of the merits and ...,,this article discuss some of the merit and dem...
38,38,A systematic review of the experiences and per...,10.1016/j.nedt.2009.10.017,Aim: A systematic literature review of the exp...,,aim : a systematic literature review of the ex...
44,44,Talking with pictures: Exploring the possibili...,10.1080/0968776930010104,As multimedia computing becomes the order of t...,,as multimedia computing become the order of th...
57,57,Cost Model for Evaluation of SDMB Service over...,10.1109/ISWCS.2005.1547789,The number of broadband users has beengrowing ...,,the number of broadband user have beengrowe ra...
61,61,Renewable hydrogen utilisation for the product...,10.1016/j.enconman.2006.06.011,Electrolytic hydrogen production is an efficie...,"[{'title': None, 'identifiers': ['issn:0196-89...",electrolytic hydrogen production be an efficie...
85,85,Involving Patients and the Public in E-Health ...,10.4018/978-1-60566-016-5.ch009,This paper was published as Patient-Centered E...,,this paper be publish as Patient - center e - ...
89,89,Cognitive bias as an indicator of animal emoti...,10.1016/j.applanim.2009.02.023,Accurate assessment of animal emotion (affect)...,,accurate assessment of animal emotion ( affect...
94,94,Decision making and planning under low levels ...,10.1016/j.ijforecast.2009.05.019,In this paper we review and analyse scenario p...,"[{'title': None, 'identifiers': ['0169-2070', ...",in this paper we review and analyse scenario p...
99,99,Why equality? On justifying liberal egalitaria...,10.1080/13698230903326257,The debate over the nature of egalitarianism h...,,the debate over the nature of egalitarianism h...
105,105,A model for evaluating the institutional costs...,10.1080/0968776030110205,Significant investments are being made in the ...,,significant investment be be make in the appli...


In [14]:
hits["lemma_text"][11]

'this article discuss some of the merit and demerit of the single - payer model of health care financing , with particular reference to the English National Health Service ( NHS ) . specifically , it be argue that the main merit be that the model can directly provide universal health care coverage , thus eradicate or at least alleviate market failure and equity concern , and that it can achieve this with relatively low total health care expenditure in general and — as compare to the commercial multiple insurance model — low administrative cost in particular . a perceive demerit of the single - payer model be that it can lead to excessive health care rationing , particularly in term of waiting time , although it be argue here that long wait be probably cause by insufficient funding rather than by the single - payer model per se . moreover , rationing of one form or another occur in all health care system , and single - payer model may be the good option if the aim be to incorporate stru

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]