In [1]:
from transformers import pipeline
# Text-to-text generation pipeline used later in the notebook to turn
# [HL]-tagged sentences into questions.
pipe = pipeline(
    task="text2text-generation",
    model="p208p2002/bart-squad-qg-hl",
)

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
import re
import pandas as pd
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
# Install the spaCy model only when it is not already available, so that
# re-running the notebook does not trigger a download/pip install every time.
SPACY_MODEL = "en_core_web_md"
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# Shared pipeline object used by get_relation() and get_entities().
nlp = spacy.load(SPACY_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
def get_relation(sent):
    """Extract the relation phrase anchored on the dependency ROOT of `sent`.

    The sentence is parsed with the notebook-level `nlp` pipeline and matched
    against one token pattern: the ROOT token, optionally followed by a
    preposition (`prep`), an agent marker (`agent`) and an adjective (`ADJ`).

    Args:
        sent: Sentence text to parse.

    Returns:
        The text of the last match reported by the matcher, or None when the
        matcher reports no match at all.
    """
    parsed = nlp(sent)

    # Only the ROOT token is mandatory; every following element is optional.
    # TODO: consider also allowing a 'pcomp' token in this pattern.
    root_pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]
    relation_matcher = Matcher(nlp.vocab)
    relation_matcher.add("matching_1", [root_pattern])

    found = relation_matcher(parsed)
    if not found:
        return None

    # Each match is a (match_id, start, end) triple; keep the last one.
    _, start, end = found[-1]
    return parsed[start:end].text

In [30]:
# def get_entities(texts):

#     entities = []
#     for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts)):
#         ent1 = ent2 = ""
#         prv_tok_dep = prv_tok_text = ""
#         compound_or_modifier = ""

#         for tok in doc:
#             if tok.dep_ == "punct":
#                 continue  # Skip punctuation tokens

#             # Handle compound or modifier tokens
#             if tok.dep_ == "compound" or tok.dep_.endswith("mod"):
#                 compound_or_modifier = f"{prv_tok_text + ' ' if prv_tok_dep == 'compound' else ''}{tok.text}"

#             # Entity 1: subject
#             if "subj" in tok.dep_:
#                 ent1 = f"{compound_or_modifier} {tok.text}".strip()
#                 compound_or_modifier = ""  # Reset after use

#             # Entity 2: object
#             if "obj" in tok.dep_:
#                 ent2 = f"{compound_or_modifier} {tok.text}".strip()

#             # Update previous token variables
#             prv_tok_dep, prv_tok_text = tok.dep_, tok.text

#         entities.append([ent1, ent2])

#     return entities


def _expand_entity(tok, anchor_deps):
    """Build the entity text for one subject/object token.

    Walks `tok.ancestors` until the first ancestor whose dependency label is
    in `anchor_deps`. When such an ancestor exists, the entity is `tok`
    prefixed by its left children labelled `amod` or `compound`. For every
    `relcl` child of that ancestor whose preceding token is an `nsubj` headed
    by `tok`, the preceding token, the clause token and the clause's
    non-`nsubj` children are appended.

    Args:
        tok: The subject or object token.
        anchor_deps: Dependency labels that count as an anchoring ancestor.

    Returns:
        A `(text, complete)` pair. `complete` is False when no anchoring
        ancestor was found; `text` is then just `tok.text`.
    """
    for ancestor in tok.ancestors:
        if ancestor.dep_ not in anchor_deps:
            continue

        modifiers = [left.text for left in tok.lefts if left.dep_ in ("amod", "compound")]
        text = " ".join(modifiers + [tok.text])

        for child in ancestor.children:
            # Token.nbor(-1) raises IndexError on the first token of the doc,
            # so a clause token at position 0 has no predecessor to inspect.
            if child.dep_ != "relcl" or child.i == 0:
                continue
            previous = child.nbor(-1)
            if previous.dep_ == "nsubj" and previous.head == tok:
                clause_rest = [
                    grandchild.text
                    for grandchild in child.children
                    if grandchild.dep_ != "nsubj"
                ]
                text += " " + " ".join([previous.text, child.text] + clause_rest)

        return text, True

    return tok.text, False


def get_entities(texts):
    """Extract a `[subject, object]` entity pair from each text.

    Texts are parsed with the notebook-level `nlp` pipeline in batches of 20,
    with a tqdm progress bar. For each parsed text, the first token whose
    dependency label contains "subj" and that has a `ROOT` or `conj` ancestor
    becomes the subject entity; the first token whose label contains "obj"
    and that has a `ROOT` ancestor becomes the object entity. A matching
    token without such an ancestor is kept as a provisional value and may be
    overwritten by a later match.

    NOTE(review): the relative-clause extension rarely seems to fire; in the
    notebook's own example, "The quick brown fox that is happy ..." yields
    just "quick brown fox". Confirm whether that is the intended behaviour.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        A list with one `[ent1, ent2]` pair per input text; a missing entity
        is the empty string.
    """
    # Materialise once so len() works for any iterable, not only lists.
    texts = list(texts)
    entities = []

    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts)):
        ent1 = ent2 = ""
        ent1_complete = ent2_complete = False

        for tok in doc:
            # Skipping punctuation tokens
            if tok.dep_ == "punct":
                continue

            # Subject entity: anchored on a ROOT or conjunct ancestor.
            if "subj" in tok.dep_ and not ent1_complete:
                ent1, ent1_complete = _expand_entity(tok, ("ROOT", "conj"))

            # Object entity: anchored on a ROOT ancestor only.
            if "obj" in tok.dep_ and not ent2_complete:
                ent2, ent2_complete = _expand_entity(tok, ("ROOT",))

        entities.append([ent1, ent2])

    return entities



# Example usage: a few hand-written sentences to sanity-check both extractors.
texts = [
    "the development of highly safe vaccine is delayed",
    "The quick brown fox that is happy jumps over the lazy dog .",
    "SpaCy is an open-source software library for advanced natural language processing.",
    "the amazing film that has a good quality is presented had 200 patents",
]
entities = get_entities(texts)      # one [subject, object] pair per sentence
relations = get_relation(texts[0])  # relation phrase of the first sentence only
print(entities, relations, sep="\n")

100%|██████████| 4/4 [00:00<00:00, 500.08it/s]

[['development', 'safe vaccine'], ['quick brown fox', 'lazy dog'], ['SpaCy', 'advanced language processing'], ['amazing film', 'good quality']]
delayed





Split the summaries into sentences

In [5]:
# Load the abstractive summaries (one summary per row, with its topic id).
df = pd.read_csv('../abstractive_summaries.csv')

# Flatten to one record per (topic, sentence) pair, keeping row order.
sentences = [
    {'Topic': topic, 'Sentence': sentence}
    for topic, summary in zip(df['Topic'], df['Abstractive Summary'])
    for sentence in sent_tokenize(summary)
]

# New dataframe: topic ids alongside the individual sentences.
df_sentences = pd.DataFrame(sentences)

# Show the resulting dataframe as the cell output.
df_sentences

Unnamed: 0,Topic,Sentence
0,-1,The development of highly immunogenic and safe...
1,-1,No one type of vaccine will likely fill the gl...
2,-1,MCPyV is a DNA virus with oncogenic potential.
3,-1,About 80 of MCC cases are caused by M CPyV inf...
4,0,China imposed the coronavirus lockdown in the ...
...,...,...
483,154,ZIKV has not impacted as many lives as SARS-Co...
484,154,placental and brain infections in fetuses repr...
485,155,The first Covid-19 listed studies with pediatr...
486,155,"Half of our patients had comorbidities, which ..."


Extract entities and relations from each sentence

In [31]:
# Entities come from one batched pass over all sentences; the relation is
# computed sentence by sentence.
sentence_column = df_sentences['Sentence']
df_sentences['Entities'] = get_entities(sentence_column.tolist())
df_sentences['Relation'] = sentence_column.apply(get_relation)


100%|██████████| 488/488 [00:00<00:00, 505.03it/s]


Tag the extracted entities and relations with [HL] markers

In [32]:
def tag_entity_or_relation(sentence, answer):
    """Wrap one occurrence of `answer` inside `sentence` in `[HL]` markers.

    A whole-word occurrence is preferred, so that an answer such as "had" is
    not highlighted inside a longer word. If there is none, the first plain
    substring occurrence is used. Only a single span is tagged.

    NOTE(review): presumably the question-generation model expects exactly
    one highlighted span per input; confirm against the model card.

    Args:
        sentence: The sentence text.
        answer: The entity or relation text to highlight.

    Returns:
        `sentence` with `[HL] answer [HL]` substituted for the chosen
        occurrence, or `sentence` unchanged when `answer` is empty/None or
        does not occur in `sentence`.
    """
    # An empty answer would otherwise match between every character.
    if not answer:
        return sentence

    whole_word = re.search(rf"(?<!\w){re.escape(answer)}(?!\w)", sentence)
    start = whole_word.start() if whole_word else sentence.find(answer)
    if start < 0:
        return sentence

    end = start + len(answer)
    return f"{sentence[:start]}[HL] {answer} [HL]{sentence[end:]}"

In [33]:
# Build one record per highlighted answer: every non-empty entity of a
# sentence first, then its relation (when one was found).
question_data = []
sentence_rows = zip(
    df_sentences['Topic'],
    df_sentences['Sentence'],
    df_sentences['Entities'],  # Assuming each value is a list of entities
    df_sentences['Relation'],
)
for topic_id, sentence, entities, relation in sentence_rows:
    answers = [entity for entity in entities if entity]  # drop empty entities
    if relation:
        answers.append(relation)

    for answer in answers:
        question_data.append({
            'Topic': topic_id,
            'TaggedSentence': tag_entity_or_relation(sentence, answer),
            'Answer': answer,
        })

# Convert the prepared records into a dataframe
df_questions = pd.DataFrame(question_data)


In [34]:
# Register tqdm's pandas integration; the later `progress_apply` call relies on it.
tqdm.pandas()
def apply_pipe(text):
    """Run the notebook-level `pipe` on one input and return its generated text.

    Args:
        text: Input passed unchanged to `pipe`.

    Returns:
        The 'generated_text' field of the first result returned by `pipe`.
    """
    first_result = pipe(text)[0]
    return first_result['generated_text']

Generate zero-shot questions

In [35]:
# Generate one question per tagged sentence, with a tqdm progress bar.
tagged_sentences = df_questions['TaggedSentence']
df_questions['Question'] = tagged_sentences.progress_apply(apply_pipe)
df_questions

100%|██████████| 1430/1430 [08:12<00:00,  2.90it/s]


Unnamed: 0,Topic,TaggedSentence,Answer,Question
0,-1,The [HL] development [HL] of highly immunogeni...,development,What is critical for controlling the COVID-19 ...
1,-1,The development of highly immunogenic and safe...,immunogenic vaccines,The development of highly immunogenic and safe...
2,-1,The development of highly immunogenic and safe...,be critical,What will the development of highly immunogeni...
3,-1,No one [HL] type [HL] of vaccine will likely f...,type,What type of vaccine will likely fill the glob...
4,-1,No one type of [HL] vaccine [HL] will likely f...,vaccine,What type of vaccine will likely fill the glob...
...,...,...,...,...
1425,155,Half of our [HL] patients [HL] had comorbiditi...,patients,How many patients had comorbidities?
1426,155,Half of our patients [HL] had [HL] comorbiditi...,had,Half of our patients had what type of comorbid...
1427,155,The long-[HL] term impact [HL] of neurological...,term impact,What is uncertain about the long term impact o...
1428,155,The long-term impact of [HL] neurological dama...,neurological damage,What is uncertain about the long-term impact o...


In [36]:
# Persist the generated question/answer pairs; the dataframe index is not written.
df_questions.to_csv(path_or_buf='../question_answer_pair_2.csv', index=False)