# Analysis of the spacy splitter used for the pipeline

In [1]:
# Imports
from IPython.display import display, HTML, display_html
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm

spacy's **"en_core_sci_md"** is used in the pipeline to split the abstracts in sentences. Here, I will analyse how well it is able to separate sentences. I must check 2 things: how well can it **separate sentences**, and **keep sentences together**. How will I do both: 
* **Separate sentences:** <span style='color: red'>Think on how to do it, and whether it is relevant</span>
* **Keep sentences together:** Use the tool to divide the sentences present in the TRI dataset, and check how many it incorrectly further splits into multiple sentences

### Ability to keep sentences together

As shown below, nlp leaves most of TRI sentences together, although it does incorrectly separate certain ones. I have identified 2 main mistakes. It sometimes separates a sentence when: 
* There is no punctuation mark
* The punctuation mark is followed by a lowercase letter

In both cases, the model supposes a human typo and separates the sentence. Considering that the data used for training are PubMed abstracts, scientific and with grammar rigor, we will consider that those human mistakes are negligible, and therefore, after the splitting, we will **merge together all the sentences that are not splitted by a "." followed by a capital letter**. 

The code and results are shown below:

In [2]:
TRI_data = pd.read_csv('../../data/TRI_data.tsv', sep='\t')

In [11]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz
nlp = spacy.load("en_core_sci_md")
def merge_sentences(doc):
    merged_sentences = []
    temp_sentence = ""

    sentences = [i for i in doc.sents]  # Convert generator to a list for easier handling
    for i, sentence in enumerate(sentences):
        current_text = sentence.text.strip()
        temp_sentence += current_text + " "

        # Separate to a new sentence if: 
        #     it is the last sentence, or
        #     The sentence ends with ". " and next sentence starts with uppercase
        if i == len(sentences) - 1 or ( current_text.endswith(('.', '!', '?')) 
                                       and sentences[i + 1].text.strip()[0].isupper()
                                       and sentences[i + 1].start_char == sentence.end_char + 1
                                      ):
            merged_sentences.append(temp_sentence.strip())
            temp_sentence = ""
    return merged_sentences

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [12]:
TRI_data['sent_ID'] = TRI_data['#TRI ID'].str.split(':').str[:2].str.join(':')
sentences = []
splitted_sentences = {}

# Use spaCy's pipe for efficient batch processing if applicable (batched reduces time from 5min to 1min)
texts = TRI_data['texts'].tolist()  # Convert column to list for efficient processing
%time docs = list(nlp.pipe(texts))

total = len(TRI_data)
for doc, id, label in tqdm(zip(docs, TRI_data['sent_ID'], TRI_data['labels']), total=total):
    merged_sents = merge_sentences(doc)
    sentences.extend(merged_sents)
    
    if len(merged_sents) == 0:
        print("WARNING")
        print(text)
        print('_')
    if len(merged_sents) > 1:
        if id not in splitted_sentences:
            splitted_sentences[id] = [[merged_sents, label]]
        else:
            splitted_sentences[id].append([merged_sents, label])

CPU times: user 54.5 s, sys: 4.38 s, total: 58.9 s
Wall time: 59.4 s


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 22135/22135 [00:00<00:00, 63768.05it/s]


In [103]:
unique_sentence_IDs = TRI_data.drop_duplicates(subset='sent_ID').set_index('sent_ID')
doc_sents = [i for doc in docs for i in doc.sents]

display(HTML(f'''
<table>
    <tr><td>Original length</td><td>{len(TRI_data)}</td></tr>
    <tr><td>After spacy</td><td>{len(doc_sents)}</td></tr>
    <tr><td>After <code>merge_sentences()</code></td><td>{len(sentences)}</td></tr>
</table>

With {len(TRI_data)} original sentences, spacy further splits the rows into {len(doc_sents) - len(TRI_data['texts'])} new sentences.<br>
However, <code>merge_function()</code> merges {len(doc_sents) - len(sentences)} of those sentences, leaving {len(sentences) - len(TRI_data['texts'])} 
new sentences, that come from {len(splitted_sentences)} unique sentence IDs.<br>
Therefore, a <b>{len(splitted_sentences)/len(unique_sentence_IDs)*100:.2f}% of unique sentences are affected</b><br>
'''))

0,1
Original length,22135
After spacy,22235
After merge_sentences(),22208


All positively labeled rows that have been splitted are shown below. They are divided in 3 categories: 
* TF and TG in different sentences
* TF and TG in the same sentence
* Both cases (depending on what TF and TG has been chosen)

In [132]:
# Save the ids from splitted sentences with label = 1
label_1_splitted_ids = []
for id, sents in splitted_sentences.items():
    for sent_list in sents:
        if sent_list[1] == 1:
            label_1_splitted_ids.append(id)

print(len(TRI_data[TRI_data['sent_ID'].isin(label_1_splitted_ids)]))

# Function to replace occurrences of a word with the word wrapped in <strong> tags
def highlight_word(text):
    for word in ['[TF]', '[TG]']:
        text = text.replace(word, f'<strong>{word}</strong>')
    text = text.replace('.', '.<br>')    
    return text

# Sentences have been divided into 3 categories:
TF_TG_Together  = ['11970950:3',  '11991951:0', '21400515:8', '21421043:10', '8276360:3','9837963:0']
both            = ['22911891:8']
TF_TG_separated = ['7822333:0', '10373481:0', '11274184:0', '11038350:0', '11278891:0',
                   '1281483:0', '9786935:0', '9857074:0', '12399468:0']

pd.set_option('display.max_colwidth', 1000)
for list_ids, title in zip([TF_TG_separated, TF_TG_Together, both], 
                       ['TF and TG in different sentences', 'TF and TG in the same sentence', 'Both casuistics']):
    df = TRI_data[(TRI_data['sent_ID'].isin(list_ids)) & (TRI_data['labels'] == 1)][['#TRI ID', 'Sentence']]
    df['Sentence'] = df['Sentence'].apply(lambda x: highlight_word(x))
    html_df = df.to_html(escape=False)
    
    display(HTML(f'<strong>{title}</strong>'))
    display(HTML(html_df))

29


Unnamed: 0,#TRI ID,Sentence
6489,7822333:0:RELA:ICAM1,Transcriptional regulation of the [TG] gene by inflammatory cytokines in human endothelial cells.  Essential roles of a variant NF-kappa B site and [TF] homodimers.
7017,10373481:0:STAT5A:PRLR,Regulation of [TG] receptor (PRLR) gene expression in insulin-producing cells.  Prolactin and growth hormone activate one of the rat prlr gene promoters via [TF] and STAT5b.
7018,10373481:0:STAT5B:PRLR,Regulation of [TG] receptor (PRLR) gene expression in insulin-producing cells.  Prolactin and growth hormone activate one of the rat prlr gene promoters via STAT5a and [TF].
7184,11274184:0:ATF4:HMOX1,Identification of [TF] (ATF4) as an Nrf2-interacting protein.  Implication for [TG] gene regulation.
7417,11038350:0:YY1:GAA,Transcriptional regulation of the [TG] gene.  Identification of a repressor element and its transcription factors Hes-1 and [TF].
7494,11274184:0:NFE2L2:HMOX1,Identification of activating transcription factor 4 (ATF4) as an [TF]-interacting protein.  Implication for [TG] gene regulation.
7506,11278891:0:GATA2:EDN1,"Molecular regulation of the [TG] gene by hypoxia.  Contributions of hypoxia-inducible factor-1, activator protein-1, [TF], AND p300/CBP."
8012,1281483:0:AP1:ELN,Tumor necrosis factor-alpha down-regulates human [TG] gene expression.  Evidence for the role of [TF] in the suppression of promoter activity.
12778,9786935:0:SP3:BGN,[TG] gene expression in the human leiomyosarcoma cell line SK-UT-1.  Basal and protein kinase A-induced transcription involves binding of Sp1-like/[TF] proteins in the proximal promoter region.
12796,9857074:0:AP1:GZMB,Down-regulation of human [TG] expression by glucocorticoids.  Dexamethasone inhibits binding to the Ikaros and [TF] regulatory elements of the granzyme B promoter.


Unnamed: 0,#TRI ID,Sentence
6352,11970950:3:SP1:FASLG,"We recently described a novel mechanism mediating inducible [TG] gene expression in smooth muscle cells involving the zinc finger transcription factor [TF] (Kavurma, M.  M. , Santiago, F.  S. , Bofocco, E. , and Khachigian, L.  M.  (2001) J.  Biol.  Chem.  276, 4964-4971)."
7726,11991951:0:ETS1:FLI1,[TF] regulates [TG] expression in endothelial cells.  Identification of ETS binding sites in the fli-1 gene promoter.
11037,21400515:8:MYC:CDKN1A,"Moreover, overexpression of THAP11 significantly decreased the colony numbers, and also inhibited the expression of [TF] target genes such as Cyclin D1, ODC and induced the expression of p21([TG]) .  The depletion of THAP11 inhibited JAK2 or STAT5 inactivation-mediated c-Myc reduction in ALDH(hi) /CD34(+) CML cells."
11038,21400515:8:THAP11:CCND1,"Moreover, overexpression of [TF] significantly decreased the colony numbers, and also inhibited the expression of c-myc target genes such as [TG], ODC and induced the expression of p21(Cip1) .  The depletion of THAP11 inhibited JAK2 or STAT5 inactivation-mediated c-Myc reduction in ALDH(hi) /CD34(+) CML cells."
11039,21400515:8:THAP11:CDKN1A,"Moreover, overexpression of [TF] significantly decreased the colony numbers, and also inhibited the expression of c-myc target genes such as Cyclin D1, ODC and induced the expression of p21([TG]) .  The depletion of THAP11 inhibited JAK2 or STAT5 inactivation-mediated c-Myc reduction in ALDH(hi) /CD34(+) CML cells."
11051,21421043:10:ATF3:TREM1,"5.  We found using binding site prediction and ChIP assays that the TFs EGR3 and [TF] indeed bound to the [TG] promoter, PU. 1 bound to both the TREM-1 and DAP12 promoter."
12514,8276360:3:AP1:MYC,"In this study, we investigated the mechanism of target gene stimulation by preS2/St.  It was found that deletion of a fragment containing the binding site for transcription factor [TF] (Jun-Fos) substantially decreases inducibility of the human [TG] promoter by preS2/St.  A subsequent investigation of AP-1 activation by preS2/St revealed the following: (a) insertion of multimeric AP-1 binding sites confers inducibility to an otherwise unstimulatable test promoter; (b) transactivation of AP-1 sites is dramatically increased when Jun and Fos are overexpressed by cotransfected expression plasmids; and (c) inhibitors of AP-1 activation also impair transactivation by preS2/St.  Besides AP-1, preS2/St was also able to utilize the unrelated transcription factors NF-kappa B and AP-2 for transactivation, suggesting that the gene product of preS2/St acts indirectly through one or several general cellular pathways rather than as a bona fide transcription factor."
12515,8276360:3:FOS:MYC,"In this study, we investigated the mechanism of target gene stimulation by preS2/St.  It was found that deletion of a fragment containing the binding site for transcription factor [TF] (Jun-Fos) substantially decreases inducibility of the human [TG] promoter by preS2/St.  A subsequent investigation of AP-1 activation by preS2/St revealed the following: (a) insertion of multimeric AP-1 binding sites confers inducibility to an otherwise unstimulatable test promoter; (b) transactivation of AP-1 sites is dramatically increased when Jun and Fos are overexpressed by cotransfected expression plasmids; and (c) inhibitors of AP-1 activation also impair transactivation by preS2/St.  Besides AP-1, preS2/St was also able to utilize the unrelated transcription factors NF-kappa B and AP-2 for transactivation, suggesting that the gene product of preS2/St acts indirectly through one or several general cellular pathways rather than as a bona fide transcription factor."
12794,9837963:0:KLF6:TGFB1,Transcriptional activation of [TG] and its receptors by the Kruppel-like factor [TF]/core promoter-binding protein and Sp1.  Potential mechanisms for autocrine fibrogenesis in response to injury.
15102,9837963:0:SP1:TGFB1,Transcriptional activation of [TG] and its receptors by the Kruppel-like factor Zf9/core promoter-binding protein and [TF].  Potential mechanisms for autocrine fibrogenesis in response to injury.


Unnamed: 0,#TRI ID,Sentence
6108,22911891:8:NFE2L2:PTGS2,"[TF]KO decreased the protein expression of antioxidant enzyme NQO1 in Apc(min/+) .  In contrast, Nrf2KO enhanced the expression of inflammatory markers such as [TG], cPLA, LTB4 in Apc(min/+) .  Finally, Nrf2KO resulted in higher level of PCNA and c-Myc expression in intestinal tissue, indicating the deficiency of Nrf2 promotes proliferation of intestinal crypt cells in Apc(min/+) .  Taken together, our results suggest that Nrf2KO attenuates anti-oxidative stress pathway, induces inflammation, and increases proliferative potential in the intestinal crypts leading to enhanced intestinal carcinogenesis and adenomas in Apc(min/+) ."
11732,22911891:8:NFE2L2:MYC,"[TF]KO decreased the protein expression of antioxidant enzyme NQO1 in Apc(min/+) .  In contrast, Nrf2KO enhanced the expression of inflammatory markers such as COX-2, cPLA, LTB4 in Apc(min/+) .  Finally, Nrf2KO resulted in higher level of PCNA and [TG] expression in intestinal tissue, indicating the deficiency of Nrf2 promotes proliferation of intestinal crypt cells in Apc(min/+) .  Taken together, our results suggest that Nrf2KO attenuates anti-oxidative stress pathway, induces inflammation, and increases proliferative potential in the intestinal crypts leading to enhanced intestinal carcinogenesis and adenomas in Apc(min/+) ."
11733,22911891:8:NFE2L2:PCNA,"[TF]KO decreased the protein expression of antioxidant enzyme NQO1 in Apc(min/+) .  In contrast, Nrf2KO enhanced the expression of inflammatory markers such as COX-2, cPLA, LTB4 in Apc(min/+) .  Finally, Nrf2KO resulted in higher level of [TG] and c-Myc expression in intestinal tissue, indicating the deficiency of Nrf2 promotes proliferation of intestinal crypt cells in Apc(min/+) .  Taken together, our results suggest that Nrf2KO attenuates anti-oxidative stress pathway, induces inflammation, and increases proliferative potential in the intestinal crypts leading to enhanced intestinal carcinogenesis and adenomas in Apc(min/+) ."
15520,22911891:8:NFE2L2:NQO1,"[TF]KO decreased the protein expression of antioxidant enzyme [TG] in Apc(min/+) .  In contrast, Nrf2KO enhanced the expression of inflammatory markers such as COX-2, cPLA, LTB4 in Apc(min/+) .  Finally, Nrf2KO resulted in higher level of PCNA and c-Myc expression in intestinal tissue, indicating the deficiency of Nrf2 promotes proliferation of intestinal crypt cells in Apc(min/+) .  Taken together, our results suggest that Nrf2KO attenuates anti-oxidative stress pathway, induces inflammation, and increases proliferative potential in the intestinal crypts leading to enhanced intestinal carcinogenesis and adenomas in Apc(min/+) ."


### Ability to separate sentences

**Playing around with random sentences**

Trying out some sentences to see how does it behave in different scenarios

In [136]:
# Alternative sentences
incorrect = nlp("The concentration of MgSO4 was maintained at 0.5 mM. In fig. 3, we illustrate the cell's response "
                "to the treatment. Refer to Table 1. for a summary of patient demographics. Further details can be found "
                "in Johnson et al., J. Med. Chem., 2020. The chemical formula for water is H2O. Please refer to sec. 3.2 "
                "of the document for more details. The temperature dropped to -10 deg. Celsius.") 
doc = nlp("The R2 value was found to be 0.85, indicating a strong correlation. The patient was diagnosed with COPD (Chronic Obstructive Pulmonary Disease). These findings were"
          "similar to those reported by Smith et al. in their 2018 study. The standard treatment protocol includes 5-FU, followed by radiation therapy. The MRI scan was"
          "performed at 1.5 T. Dr. Smith reviewed the report before the meeting. Mrs. Johnson will be attending the conference "
          "with Prof. Green. The meeting was scheduled for Jan. 20th, 2021. She moved to Washington, D.C. last year. The U.S. is "
          "known for its diverse culture. He lives on the St. Louis street. The package was marked as fragile, i.e., handle with care. Remember to check the P.O. Box tomorrow. Results reported by Sato et al. in this issue of Neuron now show that the transactivator" 
                "Rora acts coordinately with [TG].")
correct_sent = [sent.text for sent in doc.sents]
incorrect_sent = [sent.text for sent in incorrect.sents]
print('Correctly splitted')
for sent in correct_sent:
    print(f'\t{sent}')
print('Incorrectly splitted')
for sent in incorrect_sent:
    print(f'\t{sent}')

Correctly splitted
	The R2 value was found to be 0.85, indicating a strong correlation.
	The patient was diagnosed with COPD (Chronic Obstructive Pulmonary Disease).
	These findings weresimilar to those reported by Smith et al. in their 2018 study.
	The standard treatment protocol includes 5-FU, followed by radiation therapy.
	The MRI scan wasperformed at 1.5 T. Dr. Smith reviewed the report before the meeting.
	Mrs. Johnson will be attending the conference with Prof. Green.
	The meeting was scheduled for Jan. 20th, 2021.
	She moved to Washington, D.C. last year.
	The U.S. is known for its diverse culture.
	He lives on the St. Louis street.
	The package was marked as fragile, i.e., handle with care.
	Remember to check the P.O. Box tomorrow.
	Results reported by Sato et al. in this issue of Neuron now show that the transactivatorRora acts coordinately with [TG].
Incorrectly splitted
	The concentration of MgSO4 was maintained at 0.5 mM. In fig. 3, we illustrate the cell's response to the

The spacy ner seems to work very well. If any problem is found, a possible alternative is:
https://github.com/victoriastuart/biomedical-sentence-splitter.r.lls.

### Other problems with TRI dataset

In [137]:
# Create a mask for all duplicated rows and apply it to the data
duplicates_mask = TRI_data.duplicated(subset=['TF', 'TG', 'sent_ID'], keep=False)

# Apply to the data and drop duplicates
unique_duplicates = TRI_data[duplicates_mask].drop_duplicates(subset=['TF', 'TG', 'sent_ID'])

display(HTML(f'''
<table>
    <tr><td># sentences</td><td>{len(TRI_data)}</td></tr>
    <tr><td># duplicated rows</td><td>{sum(duplicates_mask)}</td></tr>
    <tr><td># sentences with duplicated rows</td><td>{len(unique_duplicates)}</td></tr>
    <tr><td># unique sentence IDs</td><td>{len(unique_duplicates["sent_ID"].unique())}</td></tr>    
</table>
Example:
'''),
TRI_data[(TRI_data['sent_ID'] == '8276360:3') & (TRI_data['labels'] == 1)][['#TRI ID', 'TF', 'TG', 'Sentence']])

0,1
# sentences,22135
# duplicated rows,738
# sentences with duplicated rows,364
# unique sentence IDs,318


Unnamed: 0,#TRI ID,TF,TG,Sentence
12514,8276360:3:AP1:MYC,AP-1,c-myc,"In this study, we investigated the mechanism of target gene stimulation by preS2/St. It was found that deletion of a fragment containing the binding site for transcription factor [TF] (Jun-Fos) substantially decreases inducibility of the human [TG] promoter by preS2/St. A subsequent investigation of AP-1 activation by preS2/St revealed the following: (a) insertion of multimeric AP-1 binding sites confers inducibility to an otherwise unstimulatable test promoter; (b) transactivation of AP-1 sites is dramatically increased when Jun and Fos are overexpressed by cotransfected expression plasmids; and (c) inhibitors of AP-1 activation also impair transactivation by preS2/St. Besides AP-1, preS2/St was also able to utilize the unrelated transcription factors NF-kappa B and AP-2 for transactivation, suggesting that the gene product of preS2/St acts indirectly through one or several general cellular pathways rather than as a bona fide transcription factor."
12515,8276360:3:FOS:MYC,AP-1,c-myc,"In this study, we investigated the mechanism of target gene stimulation by preS2/St. It was found that deletion of a fragment containing the binding site for transcription factor [TF] (Jun-Fos) substantially decreases inducibility of the human [TG] promoter by preS2/St. A subsequent investigation of AP-1 activation by preS2/St revealed the following: (a) insertion of multimeric AP-1 binding sites confers inducibility to an otherwise unstimulatable test promoter; (b) transactivation of AP-1 sites is dramatically increased when Jun and Fos are overexpressed by cotransfected expression plasmids; and (c) inhibitors of AP-1 activation also impair transactivation by preS2/St. Besides AP-1, preS2/St was also able to utilize the unrelated transcription factors NF-kappa B and AP-2 for transactivation, suggesting that the gene product of preS2/St acts indirectly through one or several general cellular pathways rather than as a bona fide transcription factor."


# Check `pubtator_to_coll.py` file

Check whether the script `pubtator_to_coll.py` works as expected. The file creates a collection of documents with the text, sentences and annotations from a Pubtator file.

In [11]:
!ls ../pubtator/bioc_to_pubtator/truncated_pubtator/

65.pubtator


In [1]:
# Create collection
from pubtator_to_coll import pubtator_to_collection, add_sentences_to_coll

pubtator_path = '../pubtator/bioc_to_pubtator/truncated_pubtator/'
collection = pubtator_to_collection(pubtator_path)
add_sentences_to_coll(collection)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


<pubtator_to_coll.DocumentCollection at 0x7f3c441d5450>

In [2]:
# PREVIOUS CHECKS

# This collection was bringing me problems but I fixed it
# print(collection['PMC8653174'].text)

# Check whether sentence text and PubTator annotations characters coincide - if no output, all annotations coincide.
# This has already been implemented in the Document class and is no longer necessary
for doc, sent in ((doc, sent) for doc in collection for sent in doc.sents):
    for gene in doc.genes:
        if (gene.start_char >= sent.start_char) & (gene.end_char < sent.end_char):
            # Get the annotation inside the sentence
            sent_gene_text = sent.text[gene.start_char - sent.start_char:gene.end_char - sent.start_char]
            if not gene.text == sent_gene_text:
                print(f"{doc.PMID}: {gene.text} and {sent_gene_text} don't coincide")
                print(sent.text[gene.start_char - sent.start_char - 50:gene.end_char - sent.start_char])
                print()

In [3]:
import pandas as pd
# Some analysis of the data
num_ids = len(collection)
num_sentences = len([sent for doc in collection for sent in doc.sents] )
genes   = [gene.ID  for doc in collection for gene in doc.genes]
species_df = pd.DataFrame([[specie.ID, doc.PMID] for doc in collection for specie in doc.species], columns=['specieID', 'PMID'])

print(f'''
{'PMIDs':<25}{num_ids}
{'sents':<25}{num_sentences}
{'species':<25}{len(species_df)}
{'unique species':<25}{len(set(species_df["specieID"]))}
{'PMIDs with species':<25}{len(species_df["PMID"].unique()):<6}({len(species_df["PMID"].unique())/num_ids*100:.0f}%)
{'PMIDs with humans':<25}{len(species_df[species_df["specieID"] == "9606"]["PMID"].unique()):<6}({len(species_df[species_df["specieID"] == "9606"]["PMID"].unique())/num_ids*100:.0f}%)
''')


PMIDs                    1630
sents                    17247
species                  6040
unique species           272
PMIDs with species       1306  (80%)
PMIDs with humans        892   (55%)

