## Setup

In [1]:
# !pip install pytest
!pip install transformers
!pip install sentencepiece
!pip install tokenizers
!pip install nltk
!pip install loguru
!pip install rouge-score
!pip install peft

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m931.4 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.2
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=5bc6b4ab5356f435a20ea88d27adc59b08368e1aafddba30db99

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [2]:
# Mount Google Drive at /content/drive/ so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
%cd drive/MyDrive/projects/compositional-reasoning-finetuning

/content/drive/MyDrive/projects/compositional-reasoning-finetuning


# Run Evaluation

In [None]:
# t5-small, 'self_ask' strategy; examplars, answer_first, random_facts and
# load_checkpoint are all passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --no-examplars --no-answer_first --no-random_facts --no-load_checkpoint  --size=-1

In [None]:
# t5-small, 'self_ask' strategy with --examplars enabled; answer_first,
# random_facts and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --examplars --no-answer_first --no-random_facts --no-load_checkpoint  --size=-1

In [None]:
# t5-small, 'self_ask' strategy with --examplars and --answer_first enabled;
# random_facts and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --examplars --answer_first --no-random_facts --no-load_checkpoint  --size=-1

In [None]:
# t5-small, 'self_ask' strategy with --answer_first enabled; examplars,
# random_facts and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --no-examplars --answer_first --no-random_facts --no-load_checkpoint  --size=-1

In [None]:
# t5-small, 'self_ask' strategy with --examplars and --random_facts enabled;
# answer_first and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --examplars --no-answer_first --random_facts --no-load_checkpoint  --size=-1

In [None]:
# t5-small, 'self_ask' strategy with --random_facts enabled; examplars,
# answer_first and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='self_ask' --no-examplars --no-answer_first --random_facts --no-load_checkpoint  --size=-1

In [4]:
# t5-small, 'direct' strategy with --random_facts enabled; examplars,
# answer_first and load_checkpoint are passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='direct' --no-examplars --no-answer_first --random_facts --no-load_checkpoint  --size=-1

2024-01-28 19:39:23.698816: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 19:39:23.698873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 19:39:23.700341: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 19:39:23.708366: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[32m2024-01-28 19:39:38.570[0m | [34m[1mDEBUG   

In [None]:
# t5-small, 'direct' strategy; examplars, answer_first, random_facts and
# load_checkpoint are all passed in their --no-* form.
# NOTE(review): --size=-1 presumably selects the whole test set — confirm in evaluation.py.
!python3 evaluation.py --model='t5-small' --strategy='direct' --no-examplars --no-answer_first --no-random_facts --no-load_checkpoint  --size=-1

## Evaluation

In [None]:
import json

from evaluation import load_responses, EvaluationConfig
from data_loaders import load_TestData

# Configuration identifying the run to inspect: the "flan-t5-small-direct"
# model without examplars, reading from data/MultihopEvaluation/ and results/.
config = EvaluationConfig(
    "flan-t5-small-direct",
    examplars=False,
    data_path="data/MultihopEvaluation/",
    results_path="results/",
    create_tokenizer=False,
)

# Model responses for that configuration, plus the first 5 test examples.
responses = load_responses(config)
test_set = load_TestData(config.generate_test_set_file(), n_examples=5)

# Parsed contents of the results file produced for this configuration.
with open(config.generate_results_file(), "r") as f:
    results = json.loads(f.read())

In [None]:
# Walk through the first five test examples, showing the prompt, reference text
# and true answer next to the model's response and extracted answer.
for idx in range(5):
    print("Example:", idx)

    example = test_set[idx]
    print(example["prompt"])
    display("Reference text:", example["target"])
    print("True answer:", example["answer"])

    # Blank line followed by a separator before the model's side.
    print("\n--------")
    print("T5 response")

    reply = responses[idx]
    display("Response:", reply["response"])
    print("Answer:", reply["answer"])
    print("--------")
    print("\n\n")

# Finish with the aggregate ("macro_results") entry of the loaded results.
display("T5 Results")
display(results["macro_results"])

In [None]:
import pandas as pd

# Render the "micro_results" entry of the loaded results as a table.
# NOTE(review): presumably one entry per test example — confirm against evaluation.py.
pd.DataFrame(results["micro_results"])

# Macro Evaluation

In [None]:
import os, json
import pandas as pd

path_to_json = './results'

# Every .json file in the results directory, in sorted (stable) filename order.
json_files = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))

# Displayed column name -> key inside each file's "macro_results" object.
METRIC_KEYS = {
    'Accuracy': 'accuracy',
    'F1-1': 'F1-1',
    'F1-2': 'F1-2',
    'BLEU-1': 'bleu-1',
    'BLEU-2': 'bleu-2',
    'ROUGE-1': 'rouge-1',
    'ROUGE-2': 'rouge-2',
    'ROUGE-L': 'rouge-L',
}
COLUMNS = ['Model', 'Finetune', 'With Examplars', *METRIC_KEYS]


def _split_run_name(filename):
    """Derive (model, fine_tune, examplars) from a results filename.

    The "-results.json" suffix is dropped, then the examplar marker
    ("-with-examplars" -> 'Y', otherwise "-without-examplars" is stripped and
    'N' is used), then the fine-tuning marker ("-direct" -> "Direct",
    "-self-ask" -> "Self Ask", neither -> "N/A"). Whatever remains is the
    model name.
    """
    run = filename.replace("-results.json", "")

    if "-with-examplars" in run:
        model, examplars = run.replace("-with-examplars", ""), 'Y'
    else:
        model, examplars = run.replace("-without-examplars", ""), 'N'

    if "-direct" in model:
        return model.replace("-direct", ""), "Direct", examplars
    if "-self-ask" in model:
        return model.replace("-self-ask", ""), "Self Ask", examplars
    return model, "N/A", examplars


# Collect one row per results file and build the frame once at the end.
# Assigning rows one at a time with .loc on an empty frame leaves every column
# as object dtype; constructing from the full row list lets pandas infer
# numeric dtypes for the metric columns.
rows = []
for js in json_files:
    with open(os.path.join(path_to_json, js)) as json_file:
        macro = json.load(json_file)['macro_results']

    model, fine_tune, examplars = _split_run_name(js)
    rows.append([model, fine_tune, examplars, *(macro[key] for key in METRIC_KEYS.values())])

jsons_data = pd.DataFrame(rows, columns=COLUMNS)

# print result
jsons_data




