In [None]:
# Mount Google Drive at /content/drive so the project folder stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): none of the PyPI packages are pinned. The recorded run resolved scispacy to 0.5.4
# while the three model archives below are the v0.5.0 releases — confirm they are compatible and
# consider pinning the versions so the notebook stays reproducible.
!pip install -U spacy
!pip install -U scispacy
!pip install -U pandas
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz

Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m947.0 kB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m8.4 MB/s[0m eta [36

## Import libraries

In [None]:
from pathlib import Path

import pandas as pd
import regex as re
import scispacy
import spacy
from spacy.pipeline import EntityRuler

In [None]:
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token
from spacy.tokens.span import Span
from spacy.lang.en import English

In [None]:
# Root folder of the project on the mounted Drive, with its data and results sub-folders.
GENERAL_PATH = "/content/drive/MyDrive/gena-db-master/src/DATASET"
DATA_PATH = f"{GENERAL_PATH}/data"
RESULT_PATH = f"{GENERAL_PATH}/results"

### Source of the scispaCy models: https://allenai.github.io/scispacy/

In [None]:
# Load the three pretrained pipelines installed above; they are compared on the same text below.
nlp_sci = spacy.load('en_core_sci_lg')
nlp_bc5cdr = spacy.load('en_ner_bc5cdr_md')
nlp_bionlp13cg = spacy.load('en_ner_bionlp13cg_md')

  _C._set_default_tensor_type(t)


In [None]:
# Read the raw paper table and preview its first rows.
papers_csv = f'{DATA_PATH}/papers_raw/final_papers.csv'
papers = pd.read_csv(papers_csv)
papers.head()

Unnamed: 0,ID,Title,Abstract
0,0,Rapid onset of functional tic-like behaviours ...,Clinicians have reported an increase in functi...
1,1,Cannabis Improves Obsessive-Compulsive Disorde...,Although several lines of evidence support the...
2,2,Is Persistent Motor or Vocal Tic Disorder a Mi...,Persistent motor or vocal tic disorder (PMVT) ...
3,3,Rage attacks in Tourette Syndrome and Chronic ...,Tourette syndrome (TS) and chronic motor/vocal...
4,4,Pharmacotherapy for tics in adult patients wit...,Tourette syndrome (TS) and persistent motor/vo...


## Compare the entities found by each model to decide which one is the most suitable

In [None]:
# Use the abstract of the second paper (row label 1) as the sample text for the model comparison.
example_text = papers.loc[1, 'Abstract']
example_text

'Although several lines of evidence support the hypothesis of a dysregulation of serotoninergic neurotransmission in the pathophysiology of obsessive-compulsive disorder (OCD), there is also evidence for an involvement of other pathways such as the GABAergic, glutamatergic, and dopaminergic systems. Only recently, data obtained from a small number of animal studies alternatively suggested an involvement of the endocannabinoid system in the pathophysiology of OCD reporting beneficial effects in OCD-like behavior after use of substances that stimulate the endocannabinoid system. In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD. In addition, data obtained from a small open uncontrolled trial using the THC analogue nabilone suggest that the combination of nabilone plus exposure-based psychotherapy is more

### EN_SCI_LG

In [None]:
# Run the en_core_sci_lg pipeline on the sample abstract and highlight the entities it tags.
doc = nlp_sci(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

### BC5_CDR

In [None]:
# Run the en_ner_bc5cdr_md pipeline on the sample abstract and highlight the entities it tags.
doc = nlp_bc5cdr(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

### Bio_13CG

In [None]:
# Run the en_ner_bionlp13cg_md pipeline on the sample abstract and highlight the entities it tags.
doc = nlp_bionlp13cg(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

-----------

### Get New Entities

In [None]:
def _read_entity_list(filename: str) -> list:
    """Return the entity names stored one per line in ``RESULT_PATH/entities/<filename>``.

    The trailing line terminator of every line is removed, so each item is the
    bare keyword. The number of items equals the number of lines in the file.
    """
    with open(f'{RESULT_PATH}/entities/{filename}', 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


# Keyword dictionaries used to extend the NER model, one list per custom label.
nutritions = _read_entity_list('nutrition_entities.txt')
mental_healths = _read_entity_list('mental_health_entities.txt')
chebis = _read_entity_list('chebi_entities.txt')

print('There are', len(nutritions), 'NUTRITION entities.')
print('There are', len(mental_healths), 'MENTAL_HEALTH entities.')
print('There are', len(chebis), 'CHEBI entities.')

There are 2910 NUTRITION entities.
There are 676 MENTAL_HEALTH entities.
There are 198 CHEBI entities.


### Add the new entities to the model

In [None]:
def create_pattern(label:str, word:str):
    """
    Build an entity-ruler token pattern for a keyword.

    The keyword is split on whitespace and every piece becomes a
    case-insensitive ``{'LOWER': ...}`` token constraint.

    Args:
        label: Entity label to assign to matches of the keyword.
        word: Keyword or multi-word phrase to match.

    Returns:
        A one-element list ``[{'label': label, 'pattern': [...]}]`` that can be
        passed to ``add_patterns``, or an empty list when ``word`` is empty or
        whitespace-only (a pattern with zero tokens cannot be registered).
    """
    # NOTE(review): pieces come from a plain whitespace split; confirm this agrees with
    # how the pipeline's tokenizer splits the same keyword (e.g. around punctuation).
    token_specs = [{'LOWER': token.lower()} for token in word.split()]
    if not token_specs:
        return []
    return [{'label': label, 'pattern': token_specs}]

In [None]:
# Start from the pretrained en_ner_bc5cdr_md pipeline and insert a rule-based
# entity ruler in front of its 'ner' component.
nlp_final = spacy.load('en_ner_bc5cdr_md')

ruler = nlp_final.add_pipe('entity_ruler', before='ner')

# (label, keyword list) pairs, in the order their patterns are registered.
entity_sources = [
    ('NUTRITION', nutritions),
    ('BIOCHEMICAL', chebis),
    ('MENTAL_HEALTH', mental_healths),
]

patterns = []
for label, keywords in entity_sources:
    for raw_keyword in keywords:
        keyword = raw_keyword.replace('\n', '')
        if not keyword.split():
            continue  # blank line in the keyword file: there is nothing to match
        patterns.extend(create_pattern(label, keyword))
        # Also register the spelling in which hyphens are written as spaces.
        dehyphenated = keyword.replace('-', ' ')
        if dehyphenated != keyword and dehyphenated.split():
            patterns.extend(create_pattern(label, dehyphenated))

# Register everything in one call instead of one call per keyword.
ruler.add_patterns(patterns)

In [None]:
# Persist the extended pipeline. The parent folder is created first so the save
# also works when the results/model directory does not exist yet.
model_dir = Path(RESULT_PATH) / 'model' / 'en_gena_sm'
model_dir.parent.mkdir(parents=True, exist_ok=True)
nlp_final.to_disk(model_dir)

--------------------------

## Filtering sentences

In [None]:
# Load the abstracts to filter. The path is built from DATA_PATH (it resolves to the same file
# as before) so that relocating the dataset only requires changing the path constants.
papers = pd.read_csv(f'{DATA_PATH}/papers_raw/final_papers_3.csv')
papers.head(5)

Unnamed: 0,PMID,Title,Abstract
0,28470822,Pericyte-derived bone morphogenetic protein 4 ...,Subcortical small vessel disease (SVD) is char...
1,31792039,The risk of malnutrition in children with auti...,A 9-year-old boy presented with a 2-day histor...
2,31019473,"Neurological, Psychiatric, and Biochemical Asp...",Thiamine (vitamin B1) is an essential nutrient...
3,21453474,Prenatal exposure of a girl with autism spectr...,Autism is a complex neurodevelopmental disorde...
4,34990378,[Clinical characteristics and treatment of tre...,Depression represents the predominant mood pol...


### Call the model

In [None]:
# Reload the saved custom pipeline and list the entities it tags in a short probe sentence.
gena_nlp = spacy.load(f'{GENERAL_PATH}/results/model/en_gena_sm')
test_sentence = "hormone have role in sexual desire disorder"
doc = gena_nlp(test_sentence)
for entity in doc.ents:
    print(f"{entity} \t {entity.label_}")

In [None]:
# Same probe sentence through the unmodified en_ner_bc5cdr_md pipeline, for comparison.
origin_nlp = spacy.load('en_ner_bc5cdr_md')
test_sentence = "hormone have role in sexual desire disorder"
doc = origin_nlp(test_sentence)
for entity in doc.ents:
    print(f"{entity} \t {entity.label_}")

### Find the sentences that contain both types of entities: Health (MENTAL_HEALTH, DISEASE) and Nutrition (CHEMICAL, NUTRITION, BIOCHEMICAL)

In [None]:
def contain_entities(sent:Doc, entities_1:list=["MENTAL_HEALTH", "DISEASE"], entities_2:list=["CHEMICAL", "NUTRITION", "BIOCHEMICAL"])->bool:
    """
    Tell whether a parsed text mentions both entity groups.

    Returns True when at least one entity in `sent.ents` carries a label listed
    in `entities_1` and at least one carries a label listed in `entities_2`;
    returns False otherwise (including when there are no entities at all).
    """
    labels = [entity.label_ for entity in sent.ents]
    has_first_group = any(label in entities_1 for label in labels)
    has_second_group = any(label in entities_2 for label in labels)
    return has_first_group and has_second_group

In [None]:
# Preview: render the qualifying sentences of the first abstracts.
PREVIEW_SIZE = 30  # `head` copes with tables that hold fewer rows than this

for i, abstract in papers['Abstract'].head(PREVIEW_SIZE).items():
    print(i,')')
    if not isinstance(abstract, str):
        continue  # missing abstract (read as NaN by pandas): nothing to parse
    # Hyphens are replaced by spaces before the abstract is parsed.
    doc = gena_nlp(abstract.replace('-', ' '))
    for sent in doc.sents:
        # Each sentence is parsed again on its own; the entity check and the
        # rendering both use that stand-alone parse.
        sent_doc = gena_nlp(sent.text)
        if contain_entities(sent_doc):
            spacy.displacy.render(sent_doc, style='ent', jupyter=True)

0 )


1 )


2 )


3 )


4 )


5 )
6 )


7 )
8 )
9 )


10 )
11 )
12 )


13 )
14 )


15 )


16 )
17 )


18 )


19 )
20 )


21 )


22 )
23 )
24 )
25 )
26 )
27 )


28 )
29 )


### Apply the filter to all the papers

In [None]:
PROGRESS_EVERY = 100  # print one progress line per this many abstracts

# Collect (PMID, sentence text) for every sentence that passes `contain_entities`.
sentences = []
for position, (idx, row) in enumerate(papers.iterrows()):
    if position % PROGRESS_EVERY == 0:
        print("Abstract", idx, "...")
    abstract = row['Abstract']
    if not isinstance(abstract, str):
        continue  # missing abstract (read as NaN by pandas): nothing to parse
    # NOTE(review): the 30-abstract preview parses `abstract.replace('-', ' ')` while this
    # loop parses the raw text — confirm which of the two is intended.
    doc = gena_nlp(abstract)
    for sent in doc.sents:
        # Each sentence is parsed again on its own before the entity check.
        sent_doc = gena_nlp(sent.text)
        if contain_entities(sent_doc):
            sentences.append((row['PMID'], sent.text))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Abstract 3050 ...
Abstract 3051 ...
Abstract 3052 ...
Abstract 3053 ...
Abstract 3054 ...
Abstract 3055 ...
Abstract 3056 ...
Abstract 3057 ...
Abstract 3058 ...
Abstract 3059 ...
Abstract 3060 ...
Abstract 3061 ...
Abstract 3062 ...
Abstract 3063 ...
Abstract 3064 ...
Abstract 3065 ...
Abstract 3066 ...
Abstract 3067 ...
Abstract 3068 ...
Abstract 3069 ...
Abstract 3070 ...
Abstract 3071 ...
Abstract 3072 ...
Abstract 3073 ...
Abstract 3074 ...
Abstract 3075 ...
Abstract 3076 ...
Abstract 3077 ...
Abstract 3078 ...
Abstract 3079 ...
Abstract 3080 ...
Abstract 3081 ...
Abstract 3082 ...
Abstract 3083 ...
Abstract 3084 ...
Abstract 3085 ...
Abstract 3086 ...
Abstract 3087 ...
Abstract 3088 ...
Abstract 3089 ...
Abstract 3090 ...
Abstract 3091 ...
Abstract 3092 ...
Abstract 3093 ...
Abstract 3094 ...
Abstract 3095 ...
Abstract 3096 ...
Abstract 3097 ...
Abstract 3098 ...
Abstract 3099 ...
Abstract 3100 ...
Abstract 3101 ...

In [None]:
# Report how many sentences passed the entity filter.
print(f"There are {len(sentences)} sentences that related to Disease (or Mental health) and Nutrition.")

There are 9330 sentences that related to Disease (or Mental health) and Nutrition.


### Save these sentences to a file for reuse

In [None]:
# Turn the collected (PMID, sentence) pairs into a two-column table and preview it.
sentence_columns = ["PMID", "Sentence"]
sentences_df = pd.DataFrame(data=sentences, columns=sentence_columns)
sentences_df.head()

Unnamed: 0,PMID,Sentence
0,28470822,Transforming growth factor beta 1 (TGFB1) is d...
1,28470822,We examined immunostaining of TGFB1 and BMPs (...
2,31792039,"He had autism spectrum disorder, and restricte..."
3,31792039,Laboratory results demonstrated glucose 2.7 mm...
4,31792039,QUESTION 1: What nutritional/metabolic test(s)...


In [None]:
# Write the filtered sentences to disk. The output folder is created first
# because `to_csv` does not create missing directories.
sentences_csv = Path(DATA_PATH) / "sentences" / "sentences_3.csv"
sentences_csv.parent.mkdir(parents=True, exist_ok=True)
sentences_df.to_csv(sentences_csv, index=False)