In [1]:
from datasets import load_dataset, load_metric, Dataset
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.init()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msouth_memphis[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
train_dataset = load_dataset("scientific_papers", "pubmed", split="train")
val_dataset = load_dataset("scientific_papers", "pubmed", split="validation")

In [4]:
train_dataset

Dataset({
    features: ['article', 'abstract', 'section_names'],
    num_rows: 119924
})

In [5]:
train_dataset['article'][0]

"a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries .\nin iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively .\nthe prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% .\nanthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight .\nsnack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states\nthere are also some reports regarding school feeding programs in developing countries . in viet

In [6]:
train_dataset['abstract'][0]

" background : the present study was carried out to assess the effects of community nutrition intervention based on advocacy approach on malnutrition status among school - aged children in shiraz , iran.materials and methods : this case - control nutritional intervention has been done between 2008 and 2009 on 2897 primary and secondary school boys and girls ( 7 - 13 years old ) based on advocacy approach in shiraz , iran . \n the project provided nutritious snacks in public schools over a 2-year period along with advocacy oriented actions in order to implement and promote nutritional intervention . for evaluation of effectiveness of the intervention growth monitoring indices of pre- and post - intervention were statistically compared.results:the frequency of subjects with body mass index lower than 5% decreased significantly after intervention among girls ( p = 0.02 ) . \n however , there were no significant changes among boys or total population . \n the mean of all anthropometric ind

In [7]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

In [8]:
max_input_length = 8192
max_output_length = 512
batch_size = 2

In [9]:
def process_data_to_model_inputs(batch):
    inputs = tokenizer(batch["article"], padding="max_length",
        truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["abstract"], padding="max_length",
        truncation=True,max_length=max_output_length)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["global_attention_mask"] = len(batch["input_ids"]) * [
        [0 for _ in range(len(batch["input_ids"][0]))]
    ]
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [10]:
train_dataset = train_dataset.select(range(250))
val_dataset = val_dataset.select(range(25))

In [11]:
train_dataset = train_dataset.map(process_data_to_model_inputs, batched=True,
    batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])

In [12]:
val_dataset = val_dataset.map(process_data_to_model_inputs,batched=True,
    batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])

In [13]:
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)
val_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

In [14]:
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

In [15]:
led.config.num_beams = 2
led.config.max_length = 512
led.config.min_length = 100
led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 3

In [16]:
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


In [17]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid
    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [19]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    output_dir="C:/Users/yaroslav/Desktop/LED_running",
    logging_steps=5,
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
)

In [20]:
trainer = Seq2SeqTrainer(
    model=led,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
trainer.train()

In [2]:
pubmed_test = load_dataset("scientific_papers", "pubmed", ignore_verifications=True, split="test")



In [22]:
trainer.save_model("C:/Users/yaroslav/Desktop/LED-2")

In [23]:
tokenizer.save_pretrained("C:/Users/yaroslav/Desktop/Tokenizer-2")

('C:/Users/yaroslav/Desktop/Tokenizer-2\\tokenizer_config.json',
 'C:/Users/yaroslav/Desktop/Tokenizer-2\\special_tokens_map.json',
 'C:/Users/yaroslav/Desktop/Tokenizer-2\\vocab.json',
 'C:/Users/yaroslav/Desktop/Tokenizer-2\\merges.txt',
 'C:/Users/yaroslav/Desktop/Tokenizer-2\\added_tokens.json',
 'C:/Users/yaroslav/Desktop/Tokenizer-2\\tokenizer.json')

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained("C:/Users/yaroslav/Desktop/LED-2")
tokenizer = AutoTokenizer.from_pretrained("C:/Users/yaroslav/Desktop/Tokenizer-2")

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
def generate_answer(batch):
    inputs_dict = tokenizer(batch["article"], padding="max_length", max_length=8192, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids
    attention_mask = inputs_dict.attention_mask
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
    batch["predicted_abstract"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    return batch

dataset = Dataset.from_dict(pubmed_test[:12])
result = dataset.map(generate_answer, batched=True, batch_size=4)
rouge = load_metric("rouge")
print("Result:", rouge.compute(predictions=result["predicted_abstract"], references=result["abstract"], rouge_types=["rouge2"])["rouge2"].mid)

Map: 100%|██████████| 12/12 [1:10:57<00:00, 354.76s/ examples]
  rouge = load_metric("rouge")


Result: Score(precision=0.23738993339730624, recall=0.10781269748867248, fmeasure=0.138551280228829)


In [17]:
print(result["predicted_abstract"][2],'\n=======================================================\n', result["abstract"][2])

 in this study, we evaluated the efficacy and safety of pigtail catheter drainage in the management of severe ohss in patients who underwent in vitro fertilization and embryo transfer at our centre between 1999 and 2001 on both inpatient and outpatient basis. 
 we present a retrospective study of 33 patients with severe oss who underwent IVF treatment and were admitted to the hospital for treatment  ( rizk and aboulghar, 2002 ).  the majority of the patients were managed on an outpatient basis with the use of pigtails catheter.  
  objective : to evaluate the efficacy and safety of outpatient management of severe ovarian hyperstimulation syndrome  ( ohss ) requiring placement of a pigtail catheter.methods : retrospective analysis of thirty - three consecutive patients who underwent in - vitro fertilization  ( 2003 - 2009 ) and developed severe / critical ohss requiring placement of a pigtail catheter . 
 patients who were managed on outpatient basis were monitored by frequent office v

### Translation of summarized text: в этом исследовании мы оценили эффективность и безопасность дренирования с помощью катетера в виде косички при лечении тяжелого СГЯ у пациентов, перенесших экстракорпоральное оплодотворение и перенос эмбрионов в нашем центре в период с 1999 по 2001 год как стационарно, так и амбулаторно. мы представляем ретроспективное исследование 33 пациенток с тяжелым течением СОС, которые прошли курс ЭКО и были госпитализированы для лечения (rizk and aboulghar, 2002).   большинство пациентов лечились амбулаторно с использованием катетера "косички".


In [18]:
print(result["predicted_abstract"][5],'\n=======================================================\n', result["abstract"][5])

 
 the present study was conducted at a tertiary care university hospital in brazil, brazil. 
 a total of 142 patients were studied, and the results were qualitatively and quantitatively different from those obtained by bachert et al.  the mean serum levels of staphylococcal toxin - specific ige antibodies in patients with severe asthma were significantly higher than those obtained in the msa group. � the results of the study were as follows : 
 
 serum levels were higher in the two groups of patients than in the ma group, and mean levels were significantly lower in the other. consecutive patients were divided into two groups according to the clinical severity of asthma.  
  abstractobjective : to determine the presence of staphylococcal superantigen - specific ige antibodies and degree of ige - mediated sensitization , as well as whether or not those are associated with the severity of asthma in adult patients . 
 methods : this was a cross - sectional study involving outpatients wit

### Translation of summarized text: настоящее исследование было проведено в университетской больнице третичного звена в Бразилии, Бразилия. всего было обследовано 142 пациента, и результаты качественно и количественно отличались от результатов, полученных Бахертом и соавт.   Cредние сывороточные уровни антител ige, специфичных к стафилококковому токсину, у пациентов с тяжелой бронхиальной астмой были значительно выше, чем в группе msa.  Результаты исследования были следующими : уровни в сыворотке крови были выше в двух группах пациентов, чем в группе ма, а средние уровни были значительно ниже в другой. последовательные пациенты были разделены на две группы в соответствии с клинической тяжестью бронхиальной астмы.

In [19]:
print(result["predicted_abstract"][6],'\n=======================================================\n', result["abstract"][6])

 
 the study was performed in the second and the third days of hospitalization of the patients in the neurosurgery intensive care unit of al - zahra hospital. 
the findings of this study could be a basis for performing further studies about family needs of the patient in the icu especially with the different forms of culture and economy. consequently, the increase in family satisfaction of patients can reduce the stress disorders and improve their mental state and ultimately better support of the client by the family.  
  background : since the family is a social system , the impairment in each of its component members may disrupt the entire family system . 
 one of the stress sources for families is accidents leading to hospitalization particularly in the intensive care unit ( icu ) . in many cases , 
 the families needs in patient care are not met that cause dissatisfaction . since the nurses spend a lot of time with patients and their families , they are in a good position to assess

### Translation of summarized text: исследование проводилось на второй и третий дни госпитализации пациентов в отделение интенсивной терапии нейрохирургии больницы аль-Захра. Результаты этого исследования могли бы стать основой для проведения дальнейших исследований потребностей семьи пациента в отделении интенсивной терапии, особенно с учетом различных форм культуры и экономики. Следовательно, повышение удовлетворенности пациентов семьей может уменьшить стрессовые расстройства и улучшить их психическое состояние и, в конечном счете, улучшить поддержку клиента семьей.

In [20]:
print(result["predicted_abstract"][7],'\n=======================================================\n', result["abstract"][7])

 communication skills for health team can improve the quality of life of patients with coronary artery bypass surgery. 
 this study investigates the role of communication skills in reducing hospital anxiety and depression in patients with cardiovascular disease.  in this study, we examined the interaction between the patient and the medical team. in the intervention group, we used a self - control researcher - made tool approved by experts. in addition, we analyzed the mean anxiety scores of patients in the control group and the mean depression scores of the intervention groups.  
  background and objective :   anxiety and depression are among the psychological disorders in heart 
 surgeries . establishing a simple communication is essential to reduce anxiety and depression . 
 hence , the objective of the present studywas to examine the impact of peplau therapeutic communication model on anxiety and depression in patients , who were candidate for coronary artery bypass in al - zahra 

### Translation of summarized text: навыки общения для медицинской бригады могут улучшить качество жизни пациентов после операции аортокоронарного шунтирования. В этом исследовании исследуется роль навыков общения в снижении больничной тревожности и депрессии у пациентов с сердечно-сосудистыми заболеваниями.  В этом исследовании мы изучили взаимодействие между пациентом и медицинской бригадой. в группе вмешательства мы использовали разработанный исследователями инструмент самоконтроля, одобренный экспертами. кроме того, мы проанализировали средние показатели тревожности пациентов в контрольной группе и средние показатели депрессии в группах вмешательства.