In [15]:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text2text-generation", model="p208p2002/bart-squad-qg-hl")

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.
config.json: 100%|██████████| 1.45k/1.45k [00:00<00:00, 170kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 1.33G/1.33G [01:21<00:00, 16.4MB/s]
All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was train

In [6]:
input_text = "[HL] Python [HL] is an interpreted, [HL] high-level [HL], [HL] general-purpose [HL] programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."

# Use the pipeline to generate text based on the input
output = pipe(input_text)

# Print the generated text
for o in output:
    print(o['generated_text'])

What type of programming language is Python?


Idea: load back all ngrams, clean stopwords and punkt from it

In [46]:
import pickle
with open("../all_ngrams.pkl", "rb") as input_file:
    all_ngrams = pickle.load(input_file)

In [45]:
import pandas as pd
df = pd.read_csv('../abstractive_summaries.csv')
sentences = df['Abstractive Summary'].tolist()
sentences = sentences[1:]

In [63]:
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [114]:
import re
def extract_answers(text, nlp):
    doc = nlp(text)
    results = []

    # Function to determine if an n-gram is significant
    def is_significant(phrase):
        return any(token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'VERB'] and not token.is_stop for token in phrase)

    for sent in doc.sents:
        tokens = [token for token in sent if not token.is_punct and not token.is_space]

        # Generate up to trigrams
        for n in range(1, 4):  # For unigrams, bigrams, and trigrams
            ngrams = [tokens[i:i+n] for i in range(len(tokens) - n + 1)]
            for ngram in ngrams:
                if is_significant(ngram):
                    ngram_text = " ".join(token.text for token in ngram)
                    pattern = re.escape(ngram_text)
                    replacement = "[HL]" + ngram_text + "[HL]"
                    highlighted_ngram = re.sub(pattern, replacement, text)
                    results.append((highlighted_ngram, ngram_text))

    return results


In [120]:
all_results = []
index = []
for idx, text in enumerate(sentences):
    results = extract_answers(text, nlp)
    all_results.extend(results)
    index.extend([idx] * len(results))

In [128]:
extracted_texts = [result[0] for result in all_results]
highlighted_texts = [result[1] for result in all_results]

new_df = pd.DataFrame({
    'index': index,
    'highlighted_sent': extracted_texts,
    'answer': highlighted_texts
})

The wacky part begins, here, we will input the highlighted parts into the pipe, and obtain an answer

In [132]:
from sklearn.model_selection import train_test_split


_, sample_df = train_test_split(new_df, test_size=0.1, stratify=new_df['index'], random_state=42)

sample_df


Unnamed: 0,index,highlighted_sent,answer
20437,146,Antioxidants are compounds that can delay or i...,oxidative stress
21426,153,All kidney transplant recipients at our center...,28 recipients
8668,61,"Thrombocytosis, an uncommon side effect of all...",of all trans
7774,55,Disruption risks in supply chains are low-freq...,The existing
21687,155,The first Covid-19 listed studies with pediatr...,develop neurological symptoms
...,...,...,...
465,3,"In early 2020, the emerging respiratory virus ...",high
18483,132,"Diedel, a protein secreted from peripheral tis...",peroxide can
5155,35,Animal models are crucial for the study of sev...,The World Health
18712,134,Uganda hosts the largest number of refugees in...,access services


In [133]:
new_df.to_csv('../extracted_HL.csv')

In [134]:
from tqdm.auto import tqdm
tqdm.pandas()
def apply_pipe(text):
    output = pipe(text)
    return output[0]['generated_text']

sample_df['question'] = sample_df['highlighted_sent'].progress_apply(apply_pipe)
sample_df

100%|██████████| 2169/2169 [1:47:38<00:00,  2.98s/it]


Unnamed: 0,index,highlighted_sent,answer,question
20437,146,Antioxidants are compounds that can delay or i...,oxidative stress,What is a major role of antioxidants in cancer...
21426,153,All kidney transplant recipients at our center...,28 recipients,How many kidney transplant recipients have und...
8668,61,"Thrombocytosis, an uncommon side effect of all...",of all trans,What is one side effect of all-trans retinoic ...
7774,55,Disruption risks in supply chains are low-freq...,The existing,What type of interconnectivity could be a key ...
21687,155,The first Covid-19 listed studies with pediatr...,develop neurological symptoms,What is uncertain about the long-term impact o...
...,...,...,...,...
465,3,"In early 2020, the emerging respiratory virus ...",high,What type of efficacy does PPE have?
18483,132,"Diedel, a protein secreted from peripheral tis...",peroxide can,What is the name of the enzyme that quenches h...
5155,35,Animal models are crucial for the study of sev...,The World Health,Who defines zoonotic disease as?
18712,134,Uganda hosts the largest number of refugees in...,access services,What has the COVID-19 prevention and control s...


In [153]:
sample_df.to_csv('../question_answer_pair.csv', index=False)

In [154]:
sample_df = pd.read_csv('../question_answer_pair.csv')
sample_df

Unnamed: 0,index,highlighted_sent,answer,question
0,146,Antioxidants are compounds that can delay or i...,oxidative stress,What is a major role of antioxidants in cancer...
1,153,All kidney transplant recipients at our center...,28 recipients,How many kidney transplant recipients have und...
2,61,"Thrombocytosis, an uncommon side effect of all...",of all trans,What is one side effect of all-trans retinoic ...
3,55,Disruption risks in supply chains are low-freq...,The existing,What type of interconnectivity could be a key ...
4,155,The first Covid-19 listed studies with pediatr...,develop neurological symptoms,What is uncertain about the long-term impact o...
...,...,...,...,...
2164,3,"In early 2020, the emerging respiratory virus ...",high,What type of efficacy does PPE have?
2165,132,"Diedel, a protein secreted from peripheral tis...",peroxide can,What is the name of the enzyme that quenches h...
2166,35,Animal models are crucial for the study of sev...,The World Health,Who defines zoonotic disease as?
2167,134,Uganda hosts the largest number of refugees in...,access services,What has the COVID-19 prevention and control s...
