# Mistral
## Imports


In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [3]:
!pip install rouge_score
!pip install datasets
!pip install numpy
!pip install torch
!pip install nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=0a0258c64f2d7f1720ccd896fd1b220dd3116e1d4d91127223ef47a9d7b7f7b1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_c

In [139]:
import json
import pandas as pd
import os
import unicodedata
import math
import argparse
import random
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
from unsloth import FastLanguageModel
import torch

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading base Mistral model


In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inferenc

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [136]:
COLAB_ENABLED=True

if COLAB_ENABLED:
    DATA_PATH = "/content/drive/MyDrive/BIO_NLP"
    from google.colab import drive
    drive.mount('/content/drive')
    #%% md
else:
    DATA_PATH = "./data"

# Load the dataset
with open(f'/content/drive/MyDrive/BIO_NLP/data.json', 'r') as f:
    data = json.load(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print(json.dumps(data, indent=4))

In [137]:

dfs = []
question_data = {"question_id": [], "question": [], "question_type": [], "title": "", "adaptations": [], "abstracts": []}
for key, value in data.items():
    abstracts = []
    adaptations = []
    for sub_key, sub_value in value.items():
        if isinstance(sub_value, dict):
            for adaptation in sub_value['adaptations']:
                for adaptation_key, adaptation_value in sub_value['adaptations'][adaptation].items():
                    adaptations.append(adaptation_value)
                    abstracts.append(sub_value['abstract'][adaptation_key])
        question_data["abstracts"] = abstracts
        question_data["adaptations"] = adaptations
        question_data["question_id"] = [key] * len(adaptations)
        question_data["question_type"] = [value["question_type"]] * len(adaptations)
        question_data["question"] = [value["question"]] * len(adaptations)
        question_data["title"] = [value["question"]] * len(adaptations)
        assert len(adaptations) == len(abstracts), f"len {len(adaptations)} not equal {len(abstracts)}"
    dfs.append(pd.DataFrame.from_dict(question_data))

df = pd.concat(dfs)
# print(df.head())
print(df['question'].head())


0    What causes muscle spasm?
1    What causes muscle spasm?
2    What causes muscle spasm?
3    What causes muscle spasm?
4    What causes muscle spasm?
Name: question, dtype: object


In [12]:

df.to_csv('/content/drive/MyDrive/BIO_NLP/test_dataset.csv', index=False)


In [140]:

# Split up dataset into train/val/test -> 70/15/15
if not os.path.exists(DATA_PATH + 'train.csv'):
  # Clean the question column
  df['question_id'] = df['question_id'].astype(str)

  # Define test size (15%) and validation size (15%)
  test_size = 0.15
  val_size = 0.15
  train_size = 1 - test_size - val_size  # Calculate train size based on others

  # Split data using train_test_split (random state for reproducibility)
  train_val, test = train_test_split(df, test_size=test_size, random_state=42)
  train, val = train_test_split(train_val, test_size=val_size/(val_size+train_size), random_state=42)

  dfs = {'train': train, 'val': val, 'test': test}

  # Save each to CSV file
  for key, df in dfs.items():
      df.to_csv(DATA_PATH + key + ".csv", index=False, encoding='utf-8-sig')
else:
    train = pd.read_csv(DATA_PATH + 'train.csv', header=0)
    val = pd.read_csv(DATA_PATH + 'val.csv', header=0)
    test = pd.read_csv(DATA_PATH + 'test.csv', header=0)
    datasets = {'train':train, 'val':val, 'test':test}

print("Train question IDs:", train['question_id'].tolist())
print("Number of entries in test set:", len(test))
print("Number of entries in val set:", len(val))
print("Number of entries in train set:", len(train))


Train question IDs: ['33', '50', '70', '20', '32', '30', '37', '1', '36', '3', '73', '73', '1', '18', '14', '25', '13', '16', '10', '27', '39', '47', '16', '5', '75', '31', '1', '15', '24', '25', '52', '59', '41', '21', '45', '4', '16', '26', '42', '45', '53', '43', '52', '65', '32', '25', '38', '10', '21', '6', '36', '50', '33', '62', '45', '11', '52', '44', '13', '17', '12', '26', '20', '8', '55', '44', '63', '20', '22', '16', '13', '59', '13', '71', '5', '46', '31', '72', '38', '71', '37', '38', '63', '40', '43', '19', '16', '23', '44', '10', '17', '42', '38', '35', '7', '16', '30', '42', '46', '55', '21', '50', '29', '45', '30', '26', '49', '61', '24', '65', '44', '5', '35', '25', '2', '19', '46', '12', '15', '9', '31', '41', '16', '75', '4', '73', '13', '33', '56', '51', '64', '35', '58', '25', '70', '13', '25', '35', '48', '10', '5', '7', '9', '45', '5', '2', '54', '50', '58', '42', '67', '33', '6', '27', '13', '19', '34', '30', '51', '10', '70', '50', '22', '7', '43', '20', '30'

In [None]:
!pip install sacrebleu sacremoses

In [15]:
## Inference
prompt = """
### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(data):
    abstracts       = data["abstracts"]
    adaptations      = data["adaptations"]
    # print(abstracts)
    print()
    texts = []
    for input, output in zip(abstracts, adaptations):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(input, output) + EOS_TOKEN
        print(text)
        texts.append(text)
    return { "text" : texts, }


from datasets import Dataset, concatenate_datasets

dataset = concatenate_datasets([Dataset.from_pandas(datasets['train']), Dataset.from_pandas(datasets['val'])])
dataset = dataset.map(formatting_prompts_func, batched = True,)

# train_dataset = Dataset.from_pandas(datasets['train'])
# train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/7946 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
In total, 37 healthy older adults (age = 74.9 ± 0.8 years) were recruited.

### Response:
In total, 37 healthy older adults (age = 74.9 ± 0.8 years) are recruited.</s>

### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
The participants were randomly assigned to either a morning group (MG, n = 18) or an evening group (EG, n = 17).

### Response:
The participants are randomly assigned to either a morning group (18 participants) or an evening group (17 participants).</s>

### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
The MG and EG groups were instructed to take Helianthus tuberosus powder (5 g/day) just before breakfast or dinner, respectively, for 1 week after the 1-week control period.

### Re

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Finetuning using unsloth (RUN ONLY IF YOU WANT TO FINETUNE)


In [16]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        # max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/7946 [00:00<?, ? examples/s]

In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,946 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 993
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.472
2,2.6829
3,2.4095
4,1.9762
5,1.8148
6,1.474
7,1.1542
8,1.1532
9,0.9859
10,0.9539


### Saving the model locally

In [19]:
DATA_PATH = '/content/drive/MyDrive/BIO_NLP/'

In [20]:
model.save_pretrained(DATA_PATH + "lora/lora_model_3") # Local saving
tokenizer.save_pretrained(DATA_PATH + "lora/lora_model_3")

('/content/drive/MyDrive/BIO_NLP/lora/lora_model_3/tokenizer_config.json',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_3/special_tokens_map.json',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_3/tokenizer.model',
 '/content/drive/MyDrive/BIO_NLP/lora/lora_model_3/added_tokens.json')

# Inference

In [112]:
# Loading the model, change to True if you want to
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def mistral_finetune_inference(inputData):
    ### Instruction:
    inference_prompt = """
    ### Instruction:
    You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

    ### Input:
    {}

    ### Response:
    {}"""

    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer(
    [
        # inference_prompt.format(
        #     "We used spectral-domain optical coherence tomography to image macular regions and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.</s>", # input
        #     "", # output - leave this blank for generation!
        # )
        inference_prompt.format(inputData, "")
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, early_stopping=True, pad_token_id=tokenizer.eos_token_id )

    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Extract the response part
    response_start = decoded_output.find("### Response:") + len("### Response:")
    response = decoded_output[response_start:].strip()
    print(response)

    return response

In [113]:
response = mistral_finetune_inference("We used spectral-domain optical coherence tomography to image macular regions and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.")
print(response)





In [135]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load BioBART model and tokenizer
bio_bart_tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-v2-large")
bio_bart_model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-v2-large").to(torch.device("cuda"))

# def bio_bart_inference(input_text):
#     inference_prompt = """
#     ### Instruction:
#     You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

#     ### Input:
#     {}

#     ### Response:
#     """.strip()
#     # inference_prompt = """
#     # Translate the following medical text into layman's terms.

#     # Medical text:
#     # {}

#     # Simplified explanation:
#     # {}"""#.strip()
#     bio_bart_model.eval()

#     # input_text = inference_prompt.format(
#     #     "We used spectral-domain optical coherence tomography to image macular regions and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.",  # input
#     #     ""  # output - leave this blank for generation!
#     # )
#     encoded_input = inference_prompt.format(input_text)#.encode('utf-8')
#     print("Formatted Input:\n", encoded_input)
#     inputs = bio_bart_tokenizer(
#         encoded_input, #.decode('utf-8'),
#         return_tensors="pt",
#         max_length=256,
#         padding='max_length'
#     ).to(torch.device("cuda"))

#     print("Tokenized Input IDs:\n", inputs["input_ids"])
#     # Generate outputs
#     outputs = bio_bart_model.generate(
#         inputs["input_ids"],
#         # max_new_tokens=64,
#         max_length=128,
#         num_beams=5,
#         use_cache = True,
#         early_stopping=True,
#         # temperature=0.7,
#         # top_p=0.9,
#         # repetition_penalty=1.2
#     )

#     decoded_output = bio_bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
#     print("Decoded Output:\n", decoded_output)
#     # Extract the response part
#     response_start = decoded_output.find("### Response:") + len("### Response:")
#     # response_start = decoded_output.find("Simplified explanation:") + len("Simplified explanation:")
#     response = decoded_output[response_start:].strip()

#     print(response)
#     return response
inference_prompt = """
### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
{}

### Response:
""".strip()

def generate_outputs_Bart(inputs):
    model.eval()
    results = []
    for input_text in inputs:
        formatted_input = inference_prompt.format(input_text)
        inputs = bio_bart_tokenizer([formatted_input], return_tensors="pt", max_length=512, truncation=True, padding=True).to("cuda")
        outputs = bio_bart_model.generate(inputs["input_ids"], max_new_tokens=256, num_beams=5, early_stopping=True, pad_token_id=bio_bart_tokenizer.eos_token_id, temperature=0.7, top_p=0.9, repetition_penalty=1.2)
        decoded_output = bio_bart_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # Extract the response part
        response_start = decoded_output.find("### Response:") + len("### Response:")
        response = decoded_output[response_start:].strip()
        results.append(response)
    return results





KeyboardInterrupt: 

In [None]:
df = pd.read_csv('/content/drive/MyDrive/BIO_NLP/test_dataset.csv')
abstracts = df['abstracts'].tolist()

# Generate outputs
outputs1 = generate_outputs_Bart(abstracts)
print(outputs1)

In [None]:
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


# Evaluation

## Sari functions
From https://github.com/cocoxu/simplification/blob/master/SARI.py

In [34]:
from collections import Counter
import sys

In [35]:
def ReadInFile (filename):

    with open(filename) as f:
        lines = f.readlines()
        lines = [x.strip() for x in lines]
    return lines

In [36]:
def SARIngram(sgrams, cgrams, rgramslist, numref):
    rgramsall = [rgram for rgrams in rgramslist for rgram in rgrams]
    rgramcounter = Counter(rgramsall)

    sgramcounter = Counter(sgrams)
    sgramcounter_rep = Counter()
    for sgram, scount in sgramcounter.items():
        sgramcounter_rep[sgram] = scount * numref

    cgramcounter = Counter(cgrams)
    cgramcounter_rep = Counter()
    for cgram, ccount in cgramcounter.items():
        cgramcounter_rep[cgram] = ccount * numref


    # KEEP
    keepgramcounter_rep = sgramcounter_rep & cgramcounter_rep
    keepgramcountergood_rep = keepgramcounter_rep & rgramcounter
    keepgramcounterall_rep = sgramcounter_rep & rgramcounter

    keeptmpscore1 = 0
    keeptmpscore2 = 0
    for keepgram in keepgramcountergood_rep:
        keeptmpscore1 += keepgramcountergood_rep[keepgram] / keepgramcounter_rep[keepgram]
        keeptmpscore2 += keepgramcountergood_rep[keepgram] / keepgramcounterall_rep[keepgram]
        #print "KEEP", keepgram, keepscore, cgramcounter[keepgram], sgramcounter[keepgram], rgramcounter[keepgram]
    keepscore_precision = 0
    if len(keepgramcounter_rep) > 0:
    	keepscore_precision = keeptmpscore1 / len(keepgramcounter_rep)
    keepscore_recall = 0
    if len(keepgramcounterall_rep) > 0:
    	keepscore_recall = keeptmpscore2 / len(keepgramcounterall_rep)
    keepscore = 0
    if keepscore_precision > 0 or keepscore_recall > 0:
        keepscore = 2 * keepscore_precision * keepscore_recall / (keepscore_precision + keepscore_recall)


    # DELETION
    delgramcounter_rep = sgramcounter_rep - cgramcounter_rep
    delgramcountergood_rep = delgramcounter_rep - rgramcounter
    delgramcounterall_rep = sgramcounter_rep - rgramcounter
    deltmpscore1 = 0
    deltmpscore2 = 0
    for delgram in delgramcountergood_rep:
        deltmpscore1 += delgramcountergood_rep[delgram] / delgramcounter_rep[delgram]
        deltmpscore2 += delgramcountergood_rep[delgram] / delgramcounterall_rep[delgram]
    delscore_precision = 0
    if len(delgramcounter_rep) > 0:
    	delscore_precision = deltmpscore1 / len(delgramcounter_rep)
    delscore_recall = 0
    if len(delgramcounterall_rep) > 0:
    	delscore_recall = deltmpscore1 / len(delgramcounterall_rep)
    delscore = 0
    if delscore_precision > 0 or delscore_recall > 0:
        delscore = 2 * delscore_precision * delscore_recall / (delscore_precision + delscore_recall)


    # ADDITION
    addgramcounter = set(cgramcounter) - set(sgramcounter)
    addgramcountergood = set(addgramcounter) & set(rgramcounter)
    addgramcounterall = set(rgramcounter) - set(sgramcounter)

    addtmpscore = 0
    for addgram in addgramcountergood:
        addtmpscore += 1

    addscore_precision = 0
    addscore_recall = 0
    if len(addgramcounter) > 0:
    	addscore_precision = addtmpscore / len(addgramcounter)
    if len(addgramcounterall) > 0:
    	addscore_recall = addtmpscore / len(addgramcounterall)
    addscore = 0
    if addscore_precision > 0 or addscore_recall > 0:
        addscore = 2 * addscore_precision * addscore_recall / (addscore_precision + addscore_recall)

    return (keepscore, delscore_precision, addscore)

In [37]:
def SARIsent (ssent, csent, rsents) :
    numref = len(rsents)

    s1grams = ssent.lower().split(" ")
    c1grams = csent.lower().split(" ")
    s2grams = []
    c2grams = []
    s3grams = []
    c3grams = []
    s4grams = []
    c4grams = []

    r1gramslist = []
    r2gramslist = []
    r3gramslist = []
    r4gramslist = []
    for rsent in rsents:
        r1grams = rsent.lower().split(" ")
        r2grams = []
        r3grams = []
        r4grams = []
        r1gramslist.append(r1grams)
        for i in range(0, len(r1grams)-1) :
            if i < len(r1grams) - 1:
                r2gram = r1grams[i] + " " + r1grams[i+1]
                r2grams.append(r2gram)
            if i < len(r1grams)-2:
                r3gram = r1grams[i] + " " + r1grams[i+1] + " " + r1grams[i+2]
                r3grams.append(r3gram)
            if i < len(r1grams)-3:
                r4gram = r1grams[i] + " " + r1grams[i+1] + " " + r1grams[i+2] + " " + r1grams[i+3]
                r4grams.append(r4gram)
        r2gramslist.append(r2grams)
        r3gramslist.append(r3grams)
        r4gramslist.append(r4grams)

    for i in range(0, len(s1grams)-1) :
        if i < len(s1grams) - 1:
            s2gram = s1grams[i] + " " + s1grams[i+1]
            s2grams.append(s2gram)
        if i < len(s1grams)-2:
            s3gram = s1grams[i] + " " + s1grams[i+1] + " " + s1grams[i+2]
            s3grams.append(s3gram)
        if i < len(s1grams)-3:
            s4gram = s1grams[i] + " " + s1grams[i+1] + " " + s1grams[i+2] + " " + s1grams[i+3]
            s4grams.append(s4gram)

    for i in range(0, len(c1grams)-1) :
        if i < len(c1grams) - 1:
            c2gram = c1grams[i] + " " + c1grams[i+1]
            c2grams.append(c2gram)
        if i < len(c1grams)-2:
            c3gram = c1grams[i] + " " + c1grams[i+1] + " " + c1grams[i+2]
            c3grams.append(c3gram)
        if i < len(c1grams)-3:
            c4gram = c1grams[i] + " " + c1grams[i+1] + " " + c1grams[i+2] + " " + c1grams[i+3]
            c4grams.append(c4gram)


    (keep1score, del1score, add1score) = SARIngram(s1grams, c1grams, r1gramslist, numref)
    (keep2score, del2score, add2score) = SARIngram(s2grams, c2grams, r2gramslist, numref)
    (keep3score, del3score, add3score) = SARIngram(s3grams, c3grams, r3gramslist, numref)
    (keep4score, del4score, add4score) = SARIngram(s4grams, c4grams, r4gramslist, numref)
    avgkeepscore = sum([keep1score,keep2score,keep3score,keep4score])/4
    avgdelscore = sum([del1score,del2score,del3score,del4score])/4
    avgaddscore = sum([add1score,add2score,add3score,add4score])/4
    finalscore = (avgkeepscore + avgdelscore + avgaddscore ) / 3

    return finalscore

# Data loading
Ensure `data.json` is in the same directory, or modify the path below.

In [38]:
import json
with open(DATA_PATH +'data.json') as f:
    j = json.load(f)

valq = [2, 7, 13, 17, 26, 34, 40, 46, 52, 58, 66]
tstq = [5, 12, 16, 22, 30, 36, 42, 48, 54, 61, 68]

# System output
Edit `process()` to use your system. It takes an array of sentences from a single abstract and returns an array of equal length with the adapted version of each (some potentially blank or with multiple sentences).

In [39]:
def process(source):

    # REPLACE THIS CODE
    target = []
    for sent in source:
        target.append(sent) # copy source as placeholder

    return target


# Compute scores

In [41]:
def scoreQuestions(qs, name):
    sarisum = 0
    sarin = 0
    for q in qs:
        for pmid, node in j['%d'%q].items():
            if pmid != 'question' and pmid != 'question_type':
                source = []
                refs = []
                for line in node['abstract']:
                    source.append(node['abstract'][line])
                    linerefs = []
                    for _, adpt in node['adaptations'].items():
                        linerefs.append(adpt.get(line, ''))
                    refs.append(linerefs)
                target = process(source)
                for i in range(len(source)):
                    sari = SARIsent(source[i], target[i], refs[i])
                    sarisum += sari
                    sarin += 1
    print("SARI for %s set: %f"% (name, sarisum/sarin))


In [40]:
df.columns

Index(['question_id', 'question', 'question_type', 'title', 'adaptations',
       'abstracts'],
      dtype='object')

In [42]:
scoreQuestions(valq, "validation")
scoreQuestions(tstq, "test")

SARI for validation set: 0.143607
SARI for test set: 0.164033


In [None]:
def processBart(source):
    target = []
    for sent in source:
        response = generate_outputs_Bart(sent)
        target.append(response)
    return target

def scoreQuestionsBart(qs, name):
    sarisum = 0
    sarin = 0
    for q in qs:
        for pmid, node in j['%d'%q].items():
            if pmid != 'question' and pmid != 'question_type':
                source = []
                refs = []
                for line in node['abstract']:
                    source.append(node['abstract'][line])
                    linerefs = []
                    for _, adpt in node['adaptations'].items():
                        linerefs.append(adpt.get(line, ''))
                    refs.append(linerefs)
                target = processBart(source)
                for i in range(len(source)):
                    sari = SARIsent(source[i], target[i], refs[i])
                    sarisum += sari
                    sarin += 1
    print("SARI for %s set: %f"% (name, sarisum/sarin))

scoreQuestionsBart(valq, "validation")
scoreQuestionsBart(tstq, "test")