In [1]:
from google.colab import drive
drive.mount('/content/drive')
drive_path = 'drive/MyDrive/imitate-retrieve-paraphrase'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pip install datasets



In [113]:
from transformers import AutoModelForSeq2SeqLM, GPT2LMHeadModel, AutoModelForSequenceClassification
from transformers import AutoTokenizer, GPT2Tokenizer
from datasets import Dataset
import numpy as np
import nltk
import random

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load in medline dataset
train_data = Dataset.from_file(f'{drive_path}/data/train/medline_train_data.arrow')
test_data = Dataset.from_file(f'{drive_path}/data/test/medline_test_data.arrow')
print(test_data)

Dataset({
    features: ['title', 'facts', 'style', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6220
})


In [5]:
# Load in pretrained models
imitator = GPT2LMHeadModel.from_pretrained(f'{drive_path}/imitator/model').to('cuda')
retriever = AutoModelForSequenceClassification.from_pretrained(f'{drive_path}/retriever/model').to('cuda')
paraphraser = AutoModelForSeq2SeqLM.from_pretrained(f'{drive_path}/paraphraser/model').to('cuda')

In [6]:
#Load in tokenizers
imitator_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
retriever_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
paraphraser_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [66]:
special_tokens = {'additional_special_tokens': ['<|topic|>', '<|style|>', '<|fact|>']}
num_added_toks = paraphraser_tokenizer.add_special_tokens(special_tokens)
paraphraser.resize_token_embeddings(len(paraphraser_tokenizer))

Embedding(50268, 1024, padding_idx=1)

In [7]:
# Function for loading in prefix input r
def get_prefix(topic):
  return f'{topic} is used to treat'

In [19]:
# Function for generating next context sentence plan in output
def get_imitated_sentence(prefix):
  inputs = imitator_tokenizer(prefix, return_tensors = 'pt').to('cuda')
  generation_output = imitator.generate(**inputs, return_dict_in_generate = True, output_scores = True, max_new_tokens = 64)
  decoded_output = imitator_tokenizer.decode(generation_output.sequences[0])
  new_sentence = decoded_output.split(prefix)[1].split('.')[0] + '.'
  return new_sentence

In [76]:
# Function for retrieving top k facts based off of current context sentence plan
def get_retrieved_facts(sentence, fact_embeddings, k):
  tokenized_sentence = retriever_tokenizer(sentence, return_tensors = 'pt', truncation = True).input_ids.to('cuda')
  sentence_embedding = retriever(tokenized_sentence, output_hidden_states = True).hidden_states[-1].mean(axis = 1).to('cpu').detach()
  fact_scores = []

  for fact_embedding in fact_embeddings:
    fact_scores.append([float((fact_embedding[0] @ sentence_embedding.T)[0][0]), fact_embedding[1]])

  top_k_facts = sorted(fact_scores, key = lambda x: x[0], reverse = True)[:k]
  return [fact[1] for fact in top_k_facts]

In [104]:
# Function for combining current context sentence plan with top k facts for topic t
def get_paraphrased_sentence(sentence, facts, topic):
  facts_combined = ''
  for fact in facts:
    facts_combined = facts_combined + fact

  model_input = ['<|topic|>' + topic + ' <|fact|> ' + facts_combined + ' <|style|> ' + sentence]
  model_input = paraphraser_tokenizer(model_input, max_length = 512, truncation = True, return_tensors='pt')
  attention_mask = model_input.attention_mask.to('cuda')
  input_ids = model_input.input_ids.to('cuda')
  outputs = paraphraser.generate(input_ids, attention_mask = attention_mask, max_length = 512).to('cpu').detach()
  output_str = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens = True)
  return (''.join(output_str).split('.')[0] + '.')[len(topic) + 1:]

In [108]:
# Function for generating expository text using IRP
def IRP(topic, prefix, facts):
  output = prefix

  fact_embeddings = []
  for fact in facts:
    tokenized_fact = retriever_tokenizer(fact, return_tensors = 'pt', truncation = True).input_ids.to('cuda')
    fact_embedding = retriever(tokenized_fact, output_hidden_states = True).hidden_states[-1].mean(axis = 1).to('cpu').detach()
    fact_embeddings.append([fact_embedding, fact])

  for i in range(1000):
    # Get new imitated sentence
    imitated_sentence = get_imitated_sentence(output)

    if i == 0:
      imitated_sentence = prefix + imitated_sentence

    if imitated_sentence == '<|endoftext|>':
      break

    # Retrieve top facts
    retrieved_facts = get_retrieved_facts(imitated_sentence, fact_embeddings, 1)

    # Paraphrase imitated sentence with retrieved facts
    paraphrased_sentence = get_paraphrased_sentence(imitated_sentence, retrieved_facts, topic)

    if i == 0:
      output = paraphrased_sentence.lstrip().rstrip()
    else:
      output = (output + ' ' + paraphrased_sentence).lstrip().rstrip()

    if i == 3:
      break

  return output

In [115]:
# Inference loop over dataset
results = []
random_indices = random.sample(range(len(test_data)), 5)

for idx in random_indices:
  row = test_data[idx]
  topic = row['title']
  prefix = get_prefix(topic)
  facts = row['facts']
  expository_text = IRP(topic, prefix, facts)
  results.append({'topic': topic, 'prefix': prefix, 'facts': facts, 'expository_text': expository_text})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.   It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.   It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.    It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It does not work right away and should not be used to relieve sudden asthma attacks.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  This medication is used to prevent certain types of chest pain angina.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  This medication is used to prevent certain types of chest pain angina.  This medication is used to prevent certain types of chest pain angina.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Olanzapine can increase the amount of sugar in your blood and this can sometimes lead to diabetes.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Olanzapine can increase the amount of sugar in your blood and this can sometimes lead to diabetes.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Olanzapine can increase the amount of sugar in your blood and this can sometimes lead to diabetes.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Olanzapine can increase the amount of sugar in your blood and this can sometimes lead to diabetes.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.  Antipsychotic drugs elevate prolactin levels; the elevation persists during chronic administration.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.  At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.  At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer of the white blood cells) that has spread to other parts of the body.
At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.  At clinically relevant doses, modafil significantly increases dopamine in the human brain by blocking dopamine transporters.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer of the white blood cells) that has spread to other parts of the body.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer of the white blood cells) that has spread to other parts of the body.


In [118]:
for result in results:
  print(f'TOPIC: {result["topic"]}')
  print(f'EXPOSITORY TEXT: {result["expository_text"]}')

TOPIC: Enasidenib
EXPOSITORY TEXT: is used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.  It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.   It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.    It is also used to treat certain types of chronic myeloid leukemia (CML; a cancer that starts in the white blood cells) in adults and children 2 years of age and older.
TOPIC: Zafirlukast
EXPOSITORY TEXT: It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks.  It does not work right away and should not be used to relieve sudden asthma attacks