In [3]:
import json
from transformers import pipeline
import torch

In [4]:
def load_model():
    generator = pipeline(
        "text-generation",
        model="runs/exp_1/checkpoint-2436",
        device=torch.device("cuda:0"),
    )
    return generator


PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


def run(generator, prompt):
    out = generator(
        prompt, max_length=len(prompt.split(" ")) + 400, num_return_sequences=5, temperature=0.5
    )

    return out

In [63]:
generator = load_model()

In [7]:
new_instruction = {
    "Simplify the following piece of text in the {lang} language",
    "ELI5 in the {lang} language",
    "Simplify the following article conceptually in the {lang} language",
}


english_passages = [
    "The burgeoning field of astrophysics has given rise to numerous remarkable discoveries, elucidating the mysteries of the cosmos and the celestial bodies that populate it. Astrophysicists have ascertained that the universe is approximately 13.8 billion years old and is continually expanding, a phenomenon corroborated by the redshift of light from distant galaxies. Moreover, the existence of dark matter and dark energy, which constitute roughly 95% of the total mass-energy content in the universe, has been posited based on the gravitational effects observed on visible matter and the expansion rate of the universe.",    
    "Renowned British naturalist and broadcaster Sir David Attenborough has spent the better part of his life advocating for the preservation of Earth's biodiversity, educating millions about the intricate relationships between various species and their ecosystems. Through his riveting television documentaries, Attenborough has chronicled the breathtaking beauty of our planet, as well as the devastating consequences of human activities on the environment. His passionate narration and unyielding commitment to conservation have inspired countless individuals to take action in preserving the natural world for future generations."
]

arabic_passages = [
    "أدى النمو المتزايد في مجال علم الفلك إلى العديد من الاكتشافات الرائعة التي توضح أسرار الكون والأجرام السماوية التي تملأه. حدد علماء الفلك أن عمر الكون يبلغ تقريبا 13.8 مليار عام ويتوسع باستمرار، وهو ظاهرة تؤكدها الانزياح الأحمر للضوء الصادر عن المجرات البعيدة. علاوة على ذلك، تم افتراض وجود المادة المظلمة والطاقة المظلمة، التي تشكل حوالي 95٪ من محتوى الكتلة والطاقة الإجمالي في الكون، بناءً على التأثيرات الجاذبة الملاحظة على المادة المرئية ومعدل توسع الكون.",
    
    "أمضى العالم البريطاني المشهور ومذيع البرامج الوثائقية السير ديفيد أتنبره جزءًا كبيرًا من حياته في الدعوة إلى الحفاظ على التنوع البيولوجي للأرض، وتوعية الملايين بالعلاقات المعقدة بين الأنواع المختلفة والنظم البيئية التي تعيش فيها. من خلال برامجه الوثائقية المثيرة على التلفزيون،"
]


all_passages = english_passages + arabic_passages

all_data = []
for example in all_passages:
    for instruction in new_instruction:
        all_data.append({"instruction": instruction, "output": "", "input": example})

In [8]:
all_data

[{'instruction': 'Simplify the following article conceptually in the {lang} language',
  'output': '',
  'input': 'The burgeoning field of astrophysics has given rise to numerous remarkable discoveries, elucidating the mysteries of the cosmos and the celestial bodies that populate it. Astrophysicists have ascertained that the universe is approximately 13.8 billion years old and is continually expanding, a phenomenon corroborated by the redshift of light from distant galaxies. Moreover, the existence of dark matter and dark energy, which constitute roughly 95% of the total mass-energy content in the universe, has been posited based on the gravitational effects observed on visible matter and the expansion rate of the universe.'},
 {'instruction': 'ELI5 in the {lang} language',
  'output': '',
  'input': 'The burgeoning field of astrophysics has given rise to numerous remarkable discoveries, elucidating the mysteries of the cosmos and the celestial bodies that populate it. Astrophysicists

In [9]:
from copy import deepcopy

sources = []
for example in all_data:
    prompt_input, prompt_no_input = deepcopy(PROMPT_DICT["prompt_input"]), deepcopy(PROMPT_DICT["prompt_no_input"])
    if example.get("input", "") != "":
        print(example)
        sources.append(prompt_input.format_map(example))
    else:
        sources.append(prompt_no_input.format_map(example))

{'instruction': 'Simplify the following article conceptually in the {lang} language', 'output': '', 'input': 'The burgeoning field of astrophysics has given rise to numerous remarkable discoveries, elucidating the mysteries of the cosmos and the celestial bodies that populate it. Astrophysicists have ascertained that the universe is approximately 13.8 billion years old and is continually expanding, a phenomenon corroborated by the redshift of light from distant galaxies. Moreover, the existence of dark matter and dark energy, which constitute roughly 95% of the total mass-energy content in the universe, has been posited based on the gravitational effects observed on visible matter and the expansion rate of the universe.'}
{'instruction': 'ELI5 in the {lang} language', 'output': '', 'input': 'The burgeoning field of astrophysics has given rise to numerous remarkable discoveries, elucidating the mysteries of the cosmos and the celestial bodies that populate it. Astrophysicists have ascer

In [78]:
outputs = []
for idx, source in enumerate(sources):
    out = run(generator, source)
    exa = all_data[idx]
    to_out = {}
    to_out.update(exa)
    to_out["output"] = [i['generated_text'].replace(source, "") for i in out]
    outputs.append(to_out)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [70]:
print(outputs[2][3]['generated_text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the main ideas of this chapter in a single sentence.

### Input:
There was music from my neighbor's house through the summer nights. In his blue gardens men and girls came and went like moths among the whisperings and the champagne and the stars. At high tide in the afternoon I watched his guests diving from the tower of his raft or taking the sun on the hot sand of his beach while his two motor-boats slit the waters of the Sound, drawing aquaplanes over cataracts of foam. On week-ends his Rolls-Royce became an omnibus, bearing parties to and from the city, between nine in the morning and long past midnight, while his station wagon scampered like a brisk yellow bug to meet all trains. And on Mondays eight servants including an extra gardener toiled all day with mops and scrubbing-brushes and hammers and

In [79]:
with open("eval_outputs.json", "w") as f:
    json.dump(outputs, f, indent=4)

In [80]:
import json

with open("/home/osama/projects/nlp702/assignment_2/stanford_alpaca/alpaca_data.json", "r") as f:
    data = json.load(f)

In [83]:
len(data)

52002