# Pattern Sanity Check on CORE Abstracts

In [None]:
%load_ext autoreload
%autoreload 2

## Preprocessing

In [None]:
import json
from octoflow.core import lemmatize
import pandas as pd

with open("downloads/sample_4k.json") as f:
    data = json.loads(f.read())
    
d = dict(data)
abst = [entry for entry in d["data"] if entry["abstract"]]
len(abst), abst[0]

(2956,
 {'doi': '10.1029/2006GL027886',
  'coreId': '71102',
  'oai': 'oai:eprints.lancs.ac.uk:6706',
  'identifiers': ['oai:eprints.lancs.ac.uk:6706', '10.1029/2006GL027886'],
  'title': 'Transport of plasma sheet material to the inner magnetosphere',
  'authors': ['Denton, M. H.',
   'Thomsen, M. F.',
   'Borovsky, J. E.',
   'Lavraud, B.',
   'Henderson, M. G.',
   'Skoug, R. M.',
   'Funsten, H. O.',
   'Jahn, J.M.',
   'Pollock, C. J.',
   'Weygand, J. M.'],
  'enrichments': {'references': [], 'documentType': {'type': None}},
  'contributors': [],
  'datePublished': '2007',
  'abstract': 'The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit

In [None]:
with open("stem_stems.txt", "r") as f:
    stem_stems = f.read().split("\n")

In [None]:
stem_absts = [a for a in abst if any(substring in " ".join(a['topics']).lower() for substring in stem_stems)]

In [None]:
df = pd.DataFrame(stem_absts)[["title", "doi", "abstract", "journals"]] #fulltext

## CORE classifier Test

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

In [None]:
import pandas as pd 
import fastai
from fastai import *
fastai.__version__, len(df)

('2.5.3', 1081)

In [None]:
dfx = pd.DataFrame([], columns=list(df.columns) + ["text", "pred"])

In [None]:
dfx

Unnamed: 0,title,doi,abstract,journals,text,pred


In [None]:
from fastai.learner import load_learner

In [None]:
learn = load_learner("downloads/oct15_40k.pkl")
#learn.predict("Anaphylaxis is a life-threatening emergency of which reliable epidemiological data are lacking")

In [None]:
for index, row in df.iterrows():
    for sent in nlp(row.abstract).sents:
        out = learn.predict(sent.text)
        predict= int(out[0])
        conf0 = round(out[2][0].item(),2)
        conf1 = round(out[2][1].item(),2)
        dfx = dfx.append(
             {
            'title':row.title,
            'doi':row.doi,
            'abstract':row.abstract,
            "journals":row.journals,
                 
                 "text": sent.text,
            "predict": predict,
            "conf0": conf0,
            "conf1": conf1
             }, 
        
        ignore_index=True)
 #   print(dfx.append(row))
 

#[learn.predict(s.text) for s in nlp(df["abstract"][0]).sents], ["text"], columns=["pred"])

█

Unnamed: 0,title,doi,abstract,journals,text,pred,conf0,conf1,predict
0,Transport of plasma sheet material to the inner magnetosphere,10.1029/2006GL027886,"The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit. Examination of the MENA data during an enhanced convection event reveal that between 12:00 and 14:30 UT on 26 June 2001, ENA emissions from the plasma sheet material are observed to strengthen an...",,The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations.,,1.00,0.00,0.0
1,Transport of plasma sheet material to the inner magnetosphere,10.1029/2006GL027886,"The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit. Examination of the MENA data during an enhanced convection event reveal that between 12:00 and 14:30 UT on 26 June 2001, ENA emissions from the plasma sheet material are observed to strengthen an...",,Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit.,,1.00,0.00,0.0
2,Transport of plasma sheet material to the inner magnetosphere,10.1029/2006GL027886,"The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit. Examination of the MENA data during an enhanced convection event reveal that between 12:00 and 14:30 UT on 26 June 2001, ENA emissions from the plasma sheet material are observed to strengthen an...",,"Examination of the MENA data during an enhanced convection event reveal that between 12:00 and 14:30 UT on 26 June 2001, ENA emissions from the plasma sheet material are observed to strengthen and move Earthwards.",,1.00,0.00,0.0
3,Transport of plasma sheet material to the inner magnetosphere,10.1029/2006GL027886,"The reaction of the plasma sheet in response to an increase in magnetospheric convection is examined using a combination of energetic neutral atom (ENA) imaging and in situ observations. Data from the IMAGE/MENA instrument are examined in conjunction with observations from the magnetospheric plasma analyzer (MPA) instrument onboard the Los Alamos 1994-084 satellite located in geosynchronous orbit. Examination of the MENA data during an enhanced convection event reveal that between 12:00 and 14:30 UT on 26 June 2001, ENA emissions from the plasma sheet material are observed to strengthen an...",,A simple calculation of the motion of the peak in ENA emissions following an increase in the convection gives an averaged speed of this sunward surge of around 8 km s−1 between 12:00 and 14:30 UT,,0.99,0.01,0.0
4,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals.",,1.00,0.00,0.0
5,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested.,,1.00,0.00,0.0
6,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil.,,1.00,0.00,0.0
7,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,"It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples.",,1.00,0.00,0.0
8,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,"For non-magnetic, conductive porous Cu, the electrical conductivity was measured with a calibration curve of the coil relating the impedance change and the electrical conductivity of the sample.",,1.00,0.00,0.0
9,Eddy current measurements of electrical conductivity and magnetic permeability of porous metals.,10.1016/j.ndteint.2006.03.008,"This paper presents a method, which simultaneously estimates the electrical conductivity and magnetic permeability of porous metals. Porous Cu and Fe manufactured by the lost carbonate sintering process have been tested. An air-cored solenoid coil was designed for the measurements of rod-shaped samples when inserted coaxially with the coil. It was theoretically found that the phase-frequency response of the normalised eddy current signal of the coil is virtually independent of the radius, electrical conductivity and magnetic permeability of the test samples. For non-magnetic, conductive po...",,"For magnetic porous Fe, the imaginary part of the signal at the lowest frequencies can be used to estimate the permeability.",,1.00,0.00,0.0


In [None]:

dfx[dfx["predict"] == 1]

Unnamed: 0,title,doi,abstract,journals,text,pred,conf0,conf1,predict
32,The single-payer option: a reconsideration,10.1215/03616878-2009-013,"This article discusses some of the merits and demerits of the single-payer model of health care financing, with particular reference to the English National Health Service (NHS). Specifically, it is argued that the main merits are that the model can directly provide universal health care coverage, thus eradicating or at least alleviating market failure and equity concerns, and that it can achieve this with relatively low total health care expenditure in general and—as compared to the commercial multiple insurance model—low administrative costs in particular. A perceived demerit of the sing...",,"A perceived demerit of the single-payer model is that it can lead to excessive health care rationing, particularly in terms of waiting times, although it is argued here that long waits are probably caused by insufficient funding rather than by the single-payer model per se.",,0.31,0.69,1.0
34,The single-payer option: a reconsideration,10.1215/03616878-2009-013,"This article discusses some of the merits and demerits of the single-payer model of health care financing, with particular reference to the English National Health Service (NHS). Specifically, it is argued that the main merits are that the model can directly provide universal health care coverage, thus eradicating or at least alleviating market failure and equity concerns, and that it can achieve this with relatively low total health care expenditure in general and—as compared to the commercial multiple insurance model—low administrative costs in particular. A perceived demerit of the sing...",,"A further perceived disadvantage of the single-payer model is that it offers limited choice, which is necessarily true with respect to choice of insurer, but choice of provider can be, and increasingly is, a feature of centrally tax-financed health care systems.",,0.01,0.99,1.0
44,Representations of HIV/AIDS management in South African newspapers,10.2989/AJAR.2008.7.2.5.522,"In South Africa, numerous strong policy statements emphasise the importance of involving communities in HIV/AIDS management, yet in practice such involvement tends to be tokenistic and minimal. Social representations in the public sphere constitute the symbolic dimension within which responses to HIV and AIDS are conceptualised and transformed into action. Through an analysis of newspaper articles, we explore the dominant representations of HIV/AIDS management circulating in the South African public sphere and examine how community engagement is depicted. We highlight the way media represe...",,"We highlight the way media representations reflect narrow understandings of HIV and AIDS as a predominantly medical problem, while depicting HIV/AIDS management as a top-down activity dominated by prominent individuals, such as national leaders, health professionals and philanthropists, thus marginalising the role played by communities, who are often depicted as passive recipients of interventions by active outsiders.",,0.06,0.94,1.0
103,Linear tracks and restricted temperature ranges characterise penguin foraging pathways,10.3354/meps07638,"Marine predators are thought to follow sophisticated scale-dependent search strategies when seeking patchy and unpredictable prey. However, fine-scale information about these strategies has hitherto been difficult to obtain for diving predators that often remain at the sea surface for only limited periods of time. Using ARGOS telemetry and novel, low-powered, archival GPS, we followed the fine-scale at-sea behaviour of king penguins breeding on South Georgia. Results revealed that foraging pathways were generally linear, except at the finest scale, where movements probably reflected either...",,"However, fine-scale information about these strategies has hitherto been difficult to obtain for diving predators that often remain at the sea surface for only limited periods of time.",,0.22,0.78,1.0
168,A theory of requisite decision models,10.1016/0001-6918(84)90005-2,"A requisite decision model is defined as a model whose form and content are sufficient to solve a particular problem. The model is constructed through an interactive and consultative process between problem owners and specialists (decision analysts). The process of generating the model uses participants' sense of unease about current model results to further development of the model. Sensitivity analyses facilitate the emergence of new intuitions about the problem; when no new intuitions arise, the model is considered requisite. At all stages of development, the model represents the social...",,A requisite decision model is defined as a model whose form and content are sufficient to solve a particular problem.,,0.07,0.93,1.0
171,A theory of requisite decision models,10.1016/0001-6918(84)90005-2,"A requisite decision model is defined as a model whose form and content are sufficient to solve a particular problem. The model is constructed through an interactive and consultative process between problem owners and specialists (decision analysts). The process of generating the model uses participants' sense of unease about current model results to further development of the model. Sensitivity analyses facilitate the emergence of new intuitions about the problem; when no new intuitions arise, the model is considered requisite. At all stages of development, the model represents the social...",,"Sensitivity analyses facilitate the emergence of new intuitions about the problem; when no new intuitions arise, the model is considered requisite.",,0.10,0.90,1.0
172,A theory of requisite decision models,10.1016/0001-6918(84)90005-2,"A requisite decision model is defined as a model whose form and content are sufficient to solve a particular problem. The model is constructed through an interactive and consultative process between problem owners and specialists (decision analysts). The process of generating the model uses participants' sense of unease about current model results to further development of the model. Sensitivity analyses facilitate the emergence of new intuitions about the problem; when no new intuitions arise, the model is considered requisite. At all stages of development, the model represents the social...",,"At all stages of development, the model represents the social reality of the shared understanding of the problem by the problem owners.",,0.11,0.89,1.0
296,Dynamic imaging in electrical capacitance tomography and electromagnetic induction tomography using a Kalman filter.,10.1088/0957-0233,"Electrical capacitance tomography (ECT) and electromagnetic induction tomography (EMT) attempt to visualize the distributions of materials with different permittivity and conductivity/permeability, aiming to reveal electrical and magnetic characteristics of an object, by measuring electrical capacitance and electromagnetic inductance on the periphery of the object. In ECT, capacitances of pairs of electrodes placed around the periphery are measured and in EMT, mutual induction of pairs of coils is measured. In this paper, a dynamic imaging technique is developed for ECT and EMT with a line...",,The inverse problem is treated as a state estimate.,,0.00,1.00,1.0
333,Magnetic biomonitoring of roadside tree leaves: identification of spatial and temporal variations in vehicle-derived particulates.,10.1016/S1352-2310(99)00229-0,"We report here the novel use of rapid and non-destructive magnetic measurements to investigate the spatial and temporal pattern of urban dust loadings on leaves of roadside trees. More than 600 leaves were collected from birch trees and their remanent magnetization (IRM300 mT) determined and normalized for the leaf area. The results show that this normalised 2-D magnetization is dominantly controlled by the tree's distance to the road. The magnetic analyses enabled detailed mapping of the spatial and temporal variations of vehicle-derived particulates. Higher 2D-magnetizations, indicating ...",,"Additional magnetic analyses suggest that the particle size of the magnetic grains dominantly falls in the range classified for airborne particulate matter as PM2.5 (<2.5 Âµm), a particle size hazardous to health due to its capacity to be respired deeply into the lungs.",,0.40,0.60,1.0
345,Some of our concepts are missing: reflections on the absence of a sociology of organisations,10.1111/1467-9566.00346,The task of examining just how the concept of 'organisations' has fared in Sociology of Health and Illness in its first 25 years is in some ways unrewarding. The answer has to be –'not at all well'. But why is this and does it matter? Part one of this paper considers what research on health care organisations was being conducted in the early years of the Journal and why that work was not viewed with favour by sociologists. Part two examines the growing gulf between those who saw themselves principally as responding to the call for a sociology of health and illness informed by broader socio...,,But why is this and does it matter?,,0.07,0.93,1.0


dfx.to_csv("downloads/stem_core_predicts.csv")

## Check Matching Sentence Quality

In [None]:
df["lemma_text"] = df["abstract"].map(lemmatize)

In [None]:
df.to_csv("downloads/core_3k_lemma_abst_only.csv")

In [None]:
from octoflow.core import lemmatize, find_sentence_in_abstract, split_into_sentences, sentence_has_phrase
import pandas as pd

df = pd.read_csv("downloads/core_3k_lemma_abst_only.csv")
with open("ngrams_pos_only_lemmatized.txt") as f:
    ngrams = f.read().split("\n")
    
# trigrams = [gram for gram in ngrams if len(gram.split(" "))> 2]
# bigrams = [gram for gram in ngrams if len(gram.split(" ")) == 2]

In [None]:
hits = df[df["lemma_text"].str.contains("|".join(ngrams))]

In [None]:
hits["lemma_text"]

11      this article discuss some of the merit and dem...
38      aim : a systematic literature review of the ex...
44      as multimedia computing become the order of th...
57      the number of broadband user have beengrowe ra...
61      electrolytic hydrogen production be an efficie...
85      this paper be publish as Patient - center e - ...
89      accurate assessment of animal emotion ( affect...
94      in this paper we review and analyse scenario p...
99      the debate over the nature of egalitarianism h...
105     significant investment be be make in the appli...
113     about the book : this collection of paper aim ...
118     in Spain , a grow body of literature have draw...
139     this paper investigate how specific notion of ...
151     a simple sensor method be develop for aflatoxi...
161     < p > image segmentation be an indispensable p...
169     concern over the impact of debt on participati...
173     my research project revolve around the problem...
185     this p

In [None]:
hits.to_csv("downloads/hits.csv")

In [None]:
from pydash import find_index

def find_sentence_in_abstract(paragraph, bias=0):
    """checks paragraph for key sentences. Returns first matching hit"""
    p_sents = split_into_sentences(paragraph)
    idx = find_index(p_sents, sentence_has_phrase)
    if idx < 0:
        return ""
    
    i = idx + bias #bias- for sentence before or after
    if i < 0:
        return ""
    return get(p_sents, i)

In [None]:
def get_first_with_pattern(paragraph, patterns):
    
    for s in split_into_sentences(paragraph):
        for pattern in patterns:
            if pattern in s:
                return (s, pattern)
#         if any(substring in s for substring in patterns):
#             return s
    return None, None
#df[df["lemma_text"].str.contains("|".join(ngrams))]

In [None]:
rows = []
for abst in df["lemma_text"]:
    s, pattern = get_first_with_pattern(abst, ngrams)
    if s is not None:
        rows += [[abst, s, pattern]]
        
        

In [None]:
pd.DataFrame(data=rows, columns=["abstract", "sent", "pattern"]).to_csv("downloads/244_pattern_check.csv")
#??pd.DataFrame

In [None]:
mask1 = [[abst, a] for abst in df["lemma_text"] for a,b in get_first_with_pattern(abst, ngrams) if a is not None and b is not None]

TypeError: 'NoneType' object is not iterable

In [None]:
from re import search
mask2 = [search("|".join(ngrams), abst)  is not None for abst in df["lemma_text"] ]

In [None]:
#~df[mask1]["title"].isin(df[mask2]["title"]
len([d for d in ~df[mask2]["title"].isin(df[mask1]["title"]) if d])
diff_mask = ~df[mask2]["title"].isin(df[mask1]["title"])
#diff_mask.index

Int64Index([  11,   38,   44,   57,   61,   85,   89,   94,   99,  105,
            ...
            2838, 2845, 2857, 2859, 2883, 2887, 2914, 2927, 2930, 2942],
           dtype='int64', length=309)

In [None]:
df2 = df[mask2]
df1 = df[mask1]
text1 = df1["lemma_text"]

In [None]:
non_caught= df2.query("lemma_text not in @text1")



In [None]:
firsts = df1["lemma_text"].map(lambda t: get_first_with_pattern(t, ngrams))
#diff_mask.index
firsts

with open("downloads/non_caught.txt", "w") as f:
    f.write("\n\n".join(firsts))

TypeError: bad operand type for unary ~: 'Int64Index'

In [None]:
not_caught = df.loc[diff_mask.index]
not_caught

Unnamed: 0.1,Unnamed: 0,title,doi,abstract,journals,lemma_text
11,11,The single-payer option: a reconsideration,10.1215/03616878-2009-013,This article discusses some of the merits and ...,,this article discuss some of the merit and dem...
38,38,A systematic review of the experiences and per...,10.1016/j.nedt.2009.10.017,Aim: A systematic literature review of the exp...,,aim : a systematic literature review of the ex...
44,44,Talking with pictures: Exploring the possibili...,10.1080/0968776930010104,As multimedia computing becomes the order of t...,,as multimedia computing become the order of th...
57,57,Cost Model for Evaluation of SDMB Service over...,10.1109/ISWCS.2005.1547789,The number of broadband users has beengrowing ...,,the number of broadband user have beengrowe ra...
61,61,Renewable hydrogen utilisation for the product...,10.1016/j.enconman.2006.06.011,Electrolytic hydrogen production is an efficie...,"[{'title': None, 'identifiers': ['issn:0196-89...",electrolytic hydrogen production be an efficie...
85,85,Involving Patients and the Public in E-Health ...,10.4018/978-1-60566-016-5.ch009,This paper was published as Patient-Centered E...,,this paper be publish as Patient - center e - ...
89,89,Cognitive bias as an indicator of animal emoti...,10.1016/j.applanim.2009.02.023,Accurate assessment of animal emotion (affect)...,,accurate assessment of animal emotion ( affect...
94,94,Decision making and planning under low levels ...,10.1016/j.ijforecast.2009.05.019,In this paper we review and analyse scenario p...,"[{'title': None, 'identifiers': ['0169-2070', ...",in this paper we review and analyse scenario p...
99,99,Why equality? On justifying liberal egalitaria...,10.1080/13698230903326257,The debate over the nature of egalitarianism h...,,the debate over the nature of egalitarianism h...
105,105,A model for evaluating the institutional costs...,10.1080/0968776030110205,Significant investments are being made in the ...,,significant investment be be make in the appli...


In [None]:
hits["lemma_text"][11]

'this article discuss some of the merit and demerit of the single - payer model of health care financing , with particular reference to the English National Health Service ( NHS ) . specifically , it be argue that the main merit be that the model can directly provide universal health care coverage , thus eradicate or at least alleviate market failure and equity concern , and that it can achieve this with relatively low total health care expenditure in general and — as compare to the commercial multiple insurance model — low administrative cost in particular . a perceive demerit of the single - payer model be that it can lead to excessive health care rationing , particularly in term of waiting time , although it be argue here that long wait be probably cause by insufficient funding rather than by the single - payer model per se . moreover , rationing of one form or another occur in all health care system , and single - payer model may be the good option if the aim be to incorporate stru

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]

In [None]:
candidates = [entry for entry in abst if any(substring in entry["abstract"] for substring in ngrams)]