<a href="https://colab.research.google.com/github/Talida-M/BIO_NLP_PLABA_2023/blob/main/eval_sari_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mistral
## Imports


In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
!pip install rouge_score
!pip install datasets
!pip install numpy
!pip install torch
!pip install nltk
!pip install sacrebleu sacremoses

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=f962d6eae33dc55a0bd12cff261c5784a984ce4b62757341794cfcfb306ba6f9
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_c

In [3]:
import pandas as pd
import json
import os
import unicodedata
import math
import argparse
import random


from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Loading base Mistral model


In [4]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inferenc

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [5]:
COLAB_ENABLED=True

if COLAB_ENABLED:
    # DATA_PATH = "/content/drive/MyDrive/biomedical_nlp/data"
    DATA_PATH = "/content/drive/MyDrive/BIO_NLP/"
    from google.colab import drive
    drive.mount('/content/drive')
    #%% md
else:
    DATA_PATH = "./data"

# Load the dataset
with open(DATA_PATH + '/data.json', 'r') as f:
    data = json.load(f)

print(json.dumps(data, indent=4))

Mounted at /content/drive


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:

dfs = []
question_data = {"question_id": [], "question": [], "question_type": [], "title": "", "adaptations": [], "abstracts": []}
for key, value in data.items():
    abstracts = []
    adaptations = []
    for sub_key, sub_value in value.items():
        if isinstance(sub_value, dict):
            for adaptation in sub_value['adaptations']:
                for adaptation_key, adaptation_value in sub_value['adaptations'][adaptation].items():
                    adaptations.append(adaptation_value)
                    abstracts.append(sub_value['abstract'][adaptation_key])
        question_data["abstracts"] = abstracts
        question_data["adaptations"] = adaptations
        question_data["question_id"] = [key] * len(adaptations)
        question_data["question_type"] = [value["question_type"]] * len(adaptations)
        question_data["question"] = [value["question"]] * len(adaptations)
        question_data["title"] = [value["question"]] * len(adaptations)
        assert len(adaptations) == len(abstracts), f"len {len(adaptations)} not equal {len(abstracts)}"
    dfs.append(pd.DataFrame.from_dict(question_data))

df = pd.concat(dfs)
df.to_csv(DATA_PATH + '/test_dataset.csv', index=False)
# print(df.head())
print(df['question'].head())


0    What causes muscle spasm?
1    What causes muscle spasm?
2    What causes muscle spasm?
3    What causes muscle spasm?
4    What causes muscle spasm?
Name: question, dtype: object


In [7]:
## Split up dataset into train/val/test -> 70/15/15
path = DATA_PATH
# print("Unique questions in dataset:", df['question'].unique())

# Clean the question column
df['question_id'] = df['question_id'].astype(str)

test_question_numbers = ['5','12','16','22','30','36','42','48','54','61','68']
val_question_numbers = ['2','7','13','17','26','34','40','46','52','58','66']
train_question_numbers = [str(x) for x in range(1, 76) if str(x) not in test_question_numbers and str(x) not in val_question_numbers]



test = df.loc[df['question_id'].isin(test_question_numbers)]
val = df.loc[df['question_id'].isin(val_question_numbers)]
train = df.loc[df['question_id'].isin(train_question_numbers)]

print("Train question numbers:", train_question_numbers)
print("Number of entries in test set:", len(test))
print("Number of entries in val set:", len(val))
print("Number of entries in train set:", len(train))


dfs = {'train':train, 'val':val, 'test':test}

    # Save each to CSV file
for key, df in dfs.items():
    df.to_csv(path + key + ".csv", index=False, encoding='utf-8-sig')
else:
    train = pd.read_csv(path + 'train.csv', header=0)
    val = pd.read_csv(path + 'val.csv', header=0)
    test = pd.read_csv(path + 'test.csv', header=0)
    datasets = {'train':train, 'val':val, 'test':test}


Train question numbers: ['1', '3', '4', '6', '8', '9', '10', '11', '14', '15', '18', '19', '20', '21', '23', '24', '25', '27', '28', '29', '31', '32', '33', '35', '37', '38', '39', '41', '43', '44', '45', '47', '49', '50', '51', '53', '55', '56', '57', '59', '60', '62', '63', '64', '65', '67', '69', '70', '71', '72', '73', '74', '75']
Number of entries in test set: 1373
Number of entries in val set: 1458
Number of entries in train set: 6488


In [8]:
df.head()
# datasets['train'].head()

Unnamed: 0,question_id,question,question_type,title,adaptations,abstracts
0,5,How to treat a bakers cyst?,C,How to treat a bakers cyst?,"Popliteal cysts, or Baker's cysts, are abnorm...",Objective: To review the results of arthroscop...
1,5,How to treat a bakers cyst?,C,How to treat a bakers cyst?,"Patients: From July 2007 to July 2009, we trea...","Patients: From July 2007 to July 2009, 11 pati..."
2,5,How to treat a bakers cyst?,C,How to treat a bakers cyst?,All of these patients had preoperative magneti...,All of them had preoperative magnetic resonanc...
3,5,How to treat a bakers cyst?,C,How to treat a bakers cyst?,We used the Rauschning and Lindgren criteria t...,We used the Rauschning and Lindgren criteria f...
4,5,How to treat a bakers cyst?,C,How to treat a bakers cyst?,We found that intra-articular pathology like c...,Results: Intra-articular pathology like cartil...


In [9]:
## Inference
prompt = """
### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(data):
    abstracts       = data["abstracts"]
    adaptations      = data["adaptations"]
    # print(abstracts)
    print()
    texts = []
    for input, output in zip(abstracts, adaptations):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(input, output) + EOS_TOKEN
        print(text)
        texts.append(text)
    return { "text" : texts, }


from datasets import Dataset, concatenate_datasets

dataset = concatenate_datasets([Dataset.from_pandas(datasets['train']), Dataset.from_pandas(datasets['val'])])
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/7946 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
### Input:
Because of high incidence of false positive RIAT results, cross antigenicity between human common cold coronaviruses and SARS-CoV-2 can be considered.

### Response:
The high rate of the falsely positive antibody test results might be due to similarities between the human common cold coronaviruses and the SARS-CoV-2 virus that causes COVID-19.</s>

### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
Results of RIAT should be interpreted in light of epidemics of human common cold coronaviruses infection.

### Response:
The results of the antibody tests may depend on epidemics of human common cold coronavirus infection.</s>

### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
Prevalence of p

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Finetuning using unsloth (RUN ONLY IF YOU WANT TO FINETUNE)


In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        # max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/7946 [00:00<?, ? examples/s]

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,946 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 993
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.472
2,2.6829
3,2.4083
4,1.9759
5,1.8161
6,1.4746
7,1.1538
8,1.1532
9,0.9862
10,0.9541


### Saving the model locally

In [None]:
model.save_pretrained(DATA_PATH + "lora/lora_model_3") # Local saving
tokenizer.save_pretrained(DATA_PATH + "lora/lora_model_3")

## Inference

In [None]:
# Loading the model, change to True when loading. Be sure to have the lora_model folder in DATA_PATH
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = DATA_PATH + "/lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [23]:
def mistral_finetune_inference():
    ### Instruction:
    inference_prompt = """
    ### Instruction:
    You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

    ### Input:
    {}

    ### Response:
    {}"""

    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer(
    [
        inference_prompt.format(
            "We used spectral-domain optical coherence tomography to image macular regions and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.</s>", # input
            "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Extract the response part
    response_start = decoded_output.find("### Response:") + len("### Response:")
    response = decoded_output[response_start:].strip()
    print(response)

    return response





Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



### Instruction:
You are a medical advisor that takes in a very abstract sentence and translates it in layman's terms, for average people to understand.

### Input:
We used spectral-domain optical coherence tomography to image macular regions and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy. 

### Response:
We used a special camera to image the macular region and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.
We used a special camera to image the macular region and measure retinal thickness and Snellen chart visual acuity (VA) to evaluate best-corrected VA (BCVA) at 1, 2, 3, 6, 9, and 12 months after vitrectomy.


# Evaluation

## Sari functions
From https://github.com/cocoxu/simplification/blob/master/SARI.py

In [None]:
from collections import Counter
import sys

In [None]:
def ReadInFile (filename):

    with open(filename) as f:
        lines = f.readlines()
        lines = [x.strip() for x in lines]
    return lines

In [None]:
def SARIngram(sgrams, cgrams, rgramslist, numref):
    rgramsall = [rgram for rgrams in rgramslist for rgram in rgrams]
    rgramcounter = Counter(rgramsall)

    sgramcounter = Counter(sgrams)
    sgramcounter_rep = Counter()
    for sgram, scount in sgramcounter.items():
        sgramcounter_rep[sgram] = scount * numref

    cgramcounter = Counter(cgrams)
    cgramcounter_rep = Counter()
    for cgram, ccount in cgramcounter.items():
        cgramcounter_rep[cgram] = ccount * numref


    # KEEP
    keepgramcounter_rep = sgramcounter_rep & cgramcounter_rep
    keepgramcountergood_rep = keepgramcounter_rep & rgramcounter
    keepgramcounterall_rep = sgramcounter_rep & rgramcounter

    keeptmpscore1 = 0
    keeptmpscore2 = 0
    for keepgram in keepgramcountergood_rep:
        keeptmpscore1 += keepgramcountergood_rep[keepgram] / keepgramcounter_rep[keepgram]
        keeptmpscore2 += keepgramcountergood_rep[keepgram] / keepgramcounterall_rep[keepgram]
        #print "KEEP", keepgram, keepscore, cgramcounter[keepgram], sgramcounter[keepgram], rgramcounter[keepgram]
    keepscore_precision = 0
    if len(keepgramcounter_rep) > 0:
    	keepscore_precision = keeptmpscore1 / len(keepgramcounter_rep)
    keepscore_recall = 0
    if len(keepgramcounterall_rep) > 0:
    	keepscore_recall = keeptmpscore2 / len(keepgramcounterall_rep)
    keepscore = 0
    if keepscore_precision > 0 or keepscore_recall > 0:
        keepscore = 2 * keepscore_precision * keepscore_recall / (keepscore_precision + keepscore_recall)


    # DELETION
    delgramcounter_rep = sgramcounter_rep - cgramcounter_rep
    delgramcountergood_rep = delgramcounter_rep - rgramcounter
    delgramcounterall_rep = sgramcounter_rep - rgramcounter
    deltmpscore1 = 0
    deltmpscore2 = 0
    for delgram in delgramcountergood_rep:
        deltmpscore1 += delgramcountergood_rep[delgram] / delgramcounter_rep[delgram]
        deltmpscore2 += delgramcountergood_rep[delgram] / delgramcounterall_rep[delgram]
    delscore_precision = 0
    if len(delgramcounter_rep) > 0:
    	delscore_precision = deltmpscore1 / len(delgramcounter_rep)
    delscore_recall = 0
    if len(delgramcounterall_rep) > 0:
    	delscore_recall = deltmpscore1 / len(delgramcounterall_rep)
    delscore = 0
    if delscore_precision > 0 or delscore_recall > 0:
        delscore = 2 * delscore_precision * delscore_recall / (delscore_precision + delscore_recall)


    # ADDITION
    addgramcounter = set(cgramcounter) - set(sgramcounter)
    addgramcountergood = set(addgramcounter) & set(rgramcounter)
    addgramcounterall = set(rgramcounter) - set(sgramcounter)

    addtmpscore = 0
    for addgram in addgramcountergood:
        addtmpscore += 1

    addscore_precision = 0
    addscore_recall = 0
    if len(addgramcounter) > 0:
    	addscore_precision = addtmpscore / len(addgramcounter)
    if len(addgramcounterall) > 0:
    	addscore_recall = addtmpscore / len(addgramcounterall)
    addscore = 0
    if addscore_precision > 0 or addscore_recall > 0:
        addscore = 2 * addscore_precision * addscore_recall / (addscore_precision + addscore_recall)

    return (keepscore, delscore_precision, addscore)

In [None]:
def SARIsent (ssent, csent, rsents) :
    numref = len(rsents)

    s1grams = ssent.lower().split(" ")
    c1grams = csent.lower().split(" ")
    s2grams = []
    c2grams = []
    s3grams = []
    c3grams = []
    s4grams = []
    c4grams = []

    r1gramslist = []
    r2gramslist = []
    r3gramslist = []
    r4gramslist = []
    for rsent in rsents:
        r1grams = rsent.lower().split(" ")
        r2grams = []
        r3grams = []
        r4grams = []
        r1gramslist.append(r1grams)
        for i in range(0, len(r1grams)-1) :
            if i < len(r1grams) - 1:
                r2gram = r1grams[i] + " " + r1grams[i+1]
                r2grams.append(r2gram)
            if i < len(r1grams)-2:
                r3gram = r1grams[i] + " " + r1grams[i+1] + " " + r1grams[i+2]
                r3grams.append(r3gram)
            if i < len(r1grams)-3:
                r4gram = r1grams[i] + " " + r1grams[i+1] + " " + r1grams[i+2] + " " + r1grams[i+3]
                r4grams.append(r4gram)
        r2gramslist.append(r2grams)
        r3gramslist.append(r3grams)
        r4gramslist.append(r4grams)

    for i in range(0, len(s1grams)-1) :
        if i < len(s1grams) - 1:
            s2gram = s1grams[i] + " " + s1grams[i+1]
            s2grams.append(s2gram)
        if i < len(s1grams)-2:
            s3gram = s1grams[i] + " " + s1grams[i+1] + " " + s1grams[i+2]
            s3grams.append(s3gram)
        if i < len(s1grams)-3:
            s4gram = s1grams[i] + " " + s1grams[i+1] + " " + s1grams[i+2] + " " + s1grams[i+3]
            s4grams.append(s4gram)

    for i in range(0, len(c1grams)-1) :
        if i < len(c1grams) - 1:
            c2gram = c1grams[i] + " " + c1grams[i+1]
            c2grams.append(c2gram)
        if i < len(c1grams)-2:
            c3gram = c1grams[i] + " " + c1grams[i+1] + " " + c1grams[i+2]
            c3grams.append(c3gram)
        if i < len(c1grams)-3:
            c4gram = c1grams[i] + " " + c1grams[i+1] + " " + c1grams[i+2] + " " + c1grams[i+3]
            c4grams.append(c4gram)


    (keep1score, del1score, add1score) = SARIngram(s1grams, c1grams, r1gramslist, numref)
    (keep2score, del2score, add2score) = SARIngram(s2grams, c2grams, r2gramslist, numref)
    (keep3score, del3score, add3score) = SARIngram(s3grams, c3grams, r3gramslist, numref)
    (keep4score, del4score, add4score) = SARIngram(s4grams, c4grams, r4gramslist, numref)
    avgkeepscore = sum([keep1score,keep2score,keep3score,keep4score])/4
    avgdelscore = sum([del1score,del2score,del3score,del4score])/4
    avgaddscore = sum([add1score,add2score,add3score,add4score])/4
    finalscore = (avgkeepscore + avgdelscore + avgaddscore ) / 3

    return finalscore

# Data loading
Ensure `data.json` is in the same directory, or modify the path below.

In [None]:
import json
with open('data.json') as f:
    j = json.load(f)

valq = [2, 7, 13, 17, 26, 34, 40, 46, 52, 58, 66]
tstq = [5, 12, 16, 22, 30, 36, 42, 48, 54, 61, 68]

# System output
Edit `process()` to use your system. It takes an array of sentences from a single abstract and returns an array of equal length with the adapted version of each (some potentially blank or with multiple sentences).

In [None]:
def process(source):

    # REPLACE THIS CODE
    target = []
    for sent in source:
        target.append(sent) # copy source as placeholder

    return target

# Compute scores

In [None]:
def scoreQuestions(qs, name):
    sarisum = 0
    sarin = 0
    for q in qs:
        for pmid, node in j['%d'%q].items():
            if pmid != 'question' and pmid != 'question_type':
                source = []
                refs = []
                for line in node['abstract']:
                    source.append(node['abstract'][line])
                    linerefs = []
                    for _, adpt in node['adaptations'].items():
                        linerefs.append(adpt.get(line, ''))
                    refs.append(linerefs)
                target = process(source)
                for i in range(len(source)):
                    sari = SARIsent(source[i], target[i], refs[i])
                    sarisum += sari
                    sarin += 1
    print("SARI for %s set: %f"% (name, sarisum/sarin))

In [None]:
scoreQuestions(valq, "validation")
scoreQuestions(tstq, "test")

SARI for validation set: 0.143607
SARI for test set: 0.164033
