In [1]:
import evaluate
from sklearn.metrics import classification_report
import json
import pandas as pd
import numpy as np
import re

In [2]:
# Load each text-generation metric once so later cells can reuse the loaded objects.
bleu, rouge, meteor, bertscore = (
    evaluate.load(metric_name)
    for metric_name in ('bleu', 'rouge', 'meteor', 'bertscore')
)

[nltk_data] Downloading package wordnet to /home/cse/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/cse/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/cse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Display names handed to classification_report by get_frugal_icl_results.
# NOTE(review): presumably the first name maps to the lower label value and
# the second to the higher one -- confirm against the label encoding in the
# response files.
target_names = ['Negative', 'Positive']

In [5]:
def get_frugal_icl_results(attribution_method, model_name, dataset_name, threshold):
    """Print a classification report for one saved set of responses.

    Reads ``responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json``,
    loads it into a DataFrame, and compares the ``label`` column (treated as
    ground truth) with the ``pred`` column (treated as predictions).

    The report uses the notebook-level ``target_names`` as class names and is
    printed with four decimal digits. Nothing is returned.
    """
    responses_path = f'responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json'
    with open(responses_path) as responses_file:
        records = json.load(responses_file)

    frame = pd.DataFrame.from_dict(records)
    gold_labels = frame['label'].tolist()
    predicted_labels = frame['pred'].tolist()

    report = classification_report(
        gold_labels,
        predicted_labels,
        target_names=target_names,
        digits=4,
    )
    print(report)

In [6]:
# Classification report for responses/globenc/llama3_8b_imdb_test_80.json.
get_frugal_icl_results('globenc', 'llama3_8b', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9506    0.9315    0.9409       496
    Positive     0.9339    0.9524    0.9430       504

    accuracy                         0.9420      1000
   macro avg     0.9422    0.9419    0.9420      1000
weighted avg     0.9422    0.9420    0.9420      1000



In [7]:
# Classification report for responses/globenc/llama3_8b_imdb_test_60.json.
get_frugal_icl_results('globenc', 'llama3_8b', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.9601    0.9214    0.9403       496
    Positive     0.9256    0.9623    0.9436       504

    accuracy                         0.9420      1000
   macro avg     0.9428    0.9418    0.9420      1000
weighted avg     0.9427    0.9420    0.9420      1000



In [8]:
# Classification report for responses/globenc/llama3_8b_imdb_test_50.json.
get_frugal_icl_results('globenc', 'llama3_8b', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.9542    0.8831    0.9173       496
    Positive     0.8928    0.9583    0.9244       504

    accuracy                         0.9210      1000
   macro avg     0.9235    0.9207    0.9208      1000
weighted avg     0.9233    0.9210    0.9209      1000



In [9]:
# Classification report for responses/decompx/llama3_8b_imdb_test_80.json.
get_frugal_icl_results('decompx', 'llama3_8b', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9576    0.9113    0.9339       496
    Positive     0.9167    0.9603    0.9380       504

    accuracy                         0.9360      1000
   macro avg     0.9371    0.9358    0.9359      1000
weighted avg     0.9370    0.9360    0.9360      1000



In [10]:
# Classification report for responses/decompx/llama3_8b_imdb_test_60.json.
get_frugal_icl_results('decompx', 'llama3_8b', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.8822    0.9214    0.9014       496
    Positive     0.9191    0.8790    0.8986       504

    accuracy                         0.9000      1000
   macro avg     0.9007    0.9002    0.9000      1000
weighted avg     0.9008    0.9000    0.9000      1000



In [11]:
# Classification report for responses/decompx/llama3_8b_imdb_test_50.json.
get_frugal_icl_results('decompx', 'llama3_8b', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.8333    0.9173    0.8733       496
    Positive     0.9097    0.8194    0.8622       504

    accuracy                         0.8680      1000
   macro avg     0.8715    0.8684    0.8678      1000
weighted avg     0.8718    0.8680    0.8677      1000



In [12]:
# Classification report for responses/globenc/llama3_70b_imdb_test_80.json.
get_frugal_icl_results('globenc', 'llama3_70b', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9606    0.9335    0.9468       496
    Positive     0.9363    0.9623    0.9491       504

    accuracy                         0.9480      1000
   macro avg     0.9484    0.9479    0.9480      1000
weighted avg     0.9483    0.9480    0.9480      1000



In [13]:
# Classification report for responses/globenc/llama3_70b_imdb_test_60.json.
get_frugal_icl_results('globenc', 'llama3_70b', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.9602    0.9234    0.9414       496
    Positive     0.9273    0.9623    0.9445       504

    accuracy                         0.9430      1000
   macro avg     0.9438    0.9428    0.9430      1000
weighted avg     0.9436    0.9430    0.9430      1000



In [14]:
# Classification report for responses/globenc/llama3_70b_imdb_test_50.json.
get_frugal_icl_results('globenc', 'llama3_70b', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.9559    0.9173    0.9362       496
    Positive     0.9218    0.9583    0.9397       504

    accuracy                         0.9380      1000
   macro avg     0.9388    0.9378    0.9380      1000
weighted avg     0.9387    0.9380    0.9380      1000



In [15]:
# Classification report for responses/decompx/llama3_70b_imdb_test_80.json.
get_frugal_icl_results('decompx', 'llama3_70b', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9645    0.9315    0.9477       496
    Positive     0.9347    0.9663    0.9502       504

    accuracy                         0.9490      1000
   macro avg     0.9496    0.9489    0.9490      1000
weighted avg     0.9495    0.9490    0.9490      1000



In [16]:
# Classification report for responses/decompx/llama3_70b_imdb_test_60.json.
get_frugal_icl_results('decompx', 'llama3_70b', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.8389    0.9657    0.8978       496
    Positive     0.9604    0.8175    0.8832       504

    accuracy                         0.8910      1000
   macro avg     0.8996    0.8916    0.8905      1000
weighted avg     0.9001    0.8910    0.8904      1000



In [17]:
# Classification report for responses/decompx/llama3_70b_imdb_test_50.json.
get_frugal_icl_results('decompx', 'llama3_70b', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.7680    0.9677    0.8564       496
    Positive     0.9573    0.7123    0.8168       504

    accuracy                         0.8390      1000
   macro avg     0.8627    0.8400    0.8366      1000
weighted avg     0.8634    0.8390    0.8364      1000



In [18]:
# Classification report for responses/globenc/gpt3.5_imdb_test_80.json.
get_frugal_icl_results('globenc', 'gpt3.5', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9249    0.9677    0.9458       496
    Positive     0.9667    0.9226    0.9442       504

    accuracy                         0.9450      1000
   macro avg     0.9458    0.9452    0.9450      1000
weighted avg     0.9460    0.9450    0.9450      1000



In [19]:
# Classification report for responses/globenc/gpt3.5_imdb_test_60.json.
get_frugal_icl_results('globenc', 'gpt3.5', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.8905    0.9677    0.9275       496
    Positive     0.9653    0.8829    0.9223       504

    accuracy                         0.9250      1000
   macro avg     0.9279    0.9253    0.9249      1000
weighted avg     0.9282    0.9250    0.9249      1000



In [20]:
# Classification report for responses/globenc/gpt3.5_imdb_test_50.json.
get_frugal_icl_results('globenc', 'gpt3.5', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.8805    0.9657    0.9212       496
    Positive     0.9627    0.8710    0.9146       504

    accuracy                         0.9180      1000
   macro avg     0.9216    0.9184    0.9179      1000
weighted avg     0.9219    0.9180    0.9178      1000



In [21]:
# Classification report for responses/decompx/gpt3.5_imdb_test_80.json.
get_frugal_icl_results('decompx', 'gpt3.5', 'imdb', 80)

              precision    recall  f1-score   support

    Negative     0.9179    0.9698    0.9431       496
    Positive     0.9685    0.9147    0.9408       504

    accuracy                         0.9420      1000
   macro avg     0.9432    0.9422    0.9420      1000
weighted avg     0.9434    0.9420    0.9420      1000



In [22]:
# Classification report for responses/decompx/gpt3.5_imdb_test_60.json.
get_frugal_icl_results('decompx', 'gpt3.5', 'imdb', 60)

              precision    recall  f1-score   support

    Negative     0.6444    0.9899    0.7806       496
    Positive     0.9790    0.4623    0.6280       504

    accuracy                         0.7240      1000
   macro avg     0.8117    0.7261    0.7043      1000
weighted avg     0.8130    0.7240    0.7037      1000



In [23]:
# Classification report for responses/decompx/gpt3.5_imdb_test_50.json.
get_frugal_icl_results('decompx', 'gpt3.5', 'imdb', 50)

              precision    recall  f1-score   support

    Negative     0.5816    0.9919    0.7332       496
    Positive     0.9740    0.2976    0.4559       504

    accuracy                         0.6420      1000
   macro avg     0.7778    0.6448    0.5946      1000
weighted avg     0.7794    0.6420    0.5935      1000



In [24]:
def get_frugal_summarization_results(attribution_method, model_name, dataset_name, threshold):
    """Print BLEU, ROUGE, METEOR and BERTScore for one saved set of responses.

    Reads ``responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json``
    and scores each record's ``response`` against the text found at
    ``record['prediction'][0]['text']``, using the notebook-level ``bleu``,
    ``rouge``, ``meteor`` and ``bertscore`` metric objects.

    A header line with the four arguments is printed first, then one line per
    metric. BERTScore is reported as the mean of the per-example F1 values.
    Nothing is returned.

    NOTE(review): the reference text is read from a field named 'prediction' --
    confirm that this field really holds the gold summary.
    """
    print(f'{attribution_method} {model_name} {dataset_name} {threshold}')

    responses_path = f'responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json'
    with open(responses_path) as responses_file:
        records = json.load(responses_file)

    generated = [record['response'] for record in records]
    gold = [record['prediction'][0]['text'] for record in records]
    scoring_inputs = dict(predictions=generated, references=gold)

    # Each metric is computed and printed before the next one starts, so
    # partial results are visible even if a later metric fails.
    bleu_scores = bleu.compute(**scoring_inputs)
    print(f'BLEU: {bleu_scores["bleu"]}')

    rouge_scores = rouge.compute(**scoring_inputs)
    print(f'ROUGE: {rouge_scores}')

    meteor_scores = meteor.compute(**scoring_inputs)
    print(f'METEOR: {meteor_scores}')

    bert_scores = bertscore.compute(lang="en", **scoring_inputs)
    print(f'BERTScore: {np.mean(bert_scores["f1"])}\n')

In [25]:
# Score every (attribution method, model, threshold) combination on argilla_news.
# The loop nesting reproduces the order of the individual calls it replaces.
for attribution_method in ('globenc', 'decompx'):
    for model_name in ('llama3_70b', 'gpt3.5', 'llama3_8b'):
        for threshold in (80, 60, 50):
            get_frugal_summarization_results(attribution_method, model_name, 'argilla_news', threshold)

globenc llama3_70b argilla_news 80
BLEU: 0.017948128546239235
ROUGE: {'rouge1': 0.23455718761726824, 'rouge2': 0.07101901318824658, 'rougeL': 0.19493332030536611, 'rougeLsum': 0.19487934130012355}
METEOR: {'meteor': 0.33059439984150774}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.8745496668219567

globenc llama3_70b argilla_news 60
BLEU: 0.0171869111462639
ROUGE: {'rouge1': 0.23124904182118375, 'rouge2': 0.06825439443447955, 'rougeL': 0.19188161812476232, 'rougeLsum': 0.19168804361778413}
METEOR: {'meteor': 0.3285932134564875}
BERTScore: 0.8740010181665421

globenc llama3_70b argilla_news 50
BLEU: 0.016155157665230038
ROUGE: {'rouge1': 0.2276209376192755, 'rouge2': 0.06522830267478333, 'rougeL': 0.18866631946753093, 'rougeLsum': 0.18851731435092103}
METEOR: {'meteor': 0.3162967357635006}




BERTScore: 0.872528975367546

globenc gpt3.5 argilla_news 80
BLEU: 0.01699929231021046
ROUGE: {'rouge1': 0.17600392889659086, 'rouge2': 0.05647358563108383, 'rougeL': 0.14580242725061743, 'rougeLsum': 0.14567893392870018}
METEOR: {'meteor': 0.28478751198560026}
BERTScore: 0.8742550149559974

globenc gpt3.5 argilla_news 60
BLEU: 0.014755862361676258
ROUGE: {'rouge1': 0.17169515590478923, 'rouge2': 0.05182581739473373, 'rougeL': 0.141278150365646, 'rougeLsum': 0.14121414382562048}
METEOR: {'meteor': 0.27779224766838884}
BERTScore: 0.8735281760692597

globenc gpt3.5 argilla_news 50
BLEU: 0.013739119923559216
ROUGE: {'rouge1': 0.17011234203003361, 'rouge2': 0.048571691586557005, 'rougeL': 0.1400878415276276, 'rougeLsum': 0.14006509851062038}
METEOR: {'meteor': 0.2679574566874844}
BERTScore: 0.8727953988313675

globenc llama3_8b argilla_news 80
BLEU: 0.017142710198672913
ROUGE: {'rouge1': 0.22624924739237762, 'rouge2': 0.06854011788160275, 'rougeL': 0.18867764511705137, 'rougeLsum': 0.18852



BERTScore: 0.8702439113855361

decompx llama3_70b argilla_news 60
BLEU: 0.01455585402283058
ROUGE: {'rouge1': 0.2148065379049417, 'rouge2': 0.056859767299841234, 'rougeL': 0.17618901929304512, 'rougeLsum': 0.17587286324499643}
METEOR: {'meteor': 0.2924066542484425}
BERTScore: 0.8684349508285523

decompx llama3_70b argilla_news 50
BLEU: 0.013881187981165215
ROUGE: {'rouge1': 0.20879373128317472, 'rouge2': 0.05402196874458078, 'rougeL': 0.1708754665706862, 'rougeLsum': 0.17063575084441365}
METEOR: {'meteor': 0.2813652650731808}
BERTScore: 0.8664922308921814

decompx gpt3.5 argilla_news 80
BLEU: 0.03586259396527457
ROUGE: {'rouge1': 0.2678615804860499, 'rouge2': 0.08429438862165703, 'rougeL': 0.2245637765061497, 'rougeLsum': 0.22445999951705561}
METEOR: {'meteor': 0.3373148464613475}
BERTScore: 0.8880208370685577

decompx gpt3.5 argilla_news 60
BLEU: 0.030630541329386952
ROUGE: {'rouge1': 0.2532748872299238, 'rouge2': 0.072817210558634, 'rougeL': 0.2095540991786817, 'rougeLsum': 0.2094140

In [4]:
def find_numbers(x: str) -> list[str]:
  """Returns every number-like substring of `x`, in order of appearance."""
  # A match is: an optional leading hyphen (negative sign), any run of digits
  # and commas (thousand separators), an optional period, then at least one
  # digit.
  number_pattern = re.compile(
      r'-?[\d,]*\.?\d+',
      flags=re.MULTILINE | re.DOTALL | re.IGNORECASE,
  )
  return [match.group(0) for match in number_pattern.finditer(x)]

def find_number(x: str,
                answer_delimiter: str = 'The answer is') -> str:
  """Picks the single number that best represents the answer in `x`.

  Returns the chosen number as a string, or '' when `x` contains no number.
  """
  # When the delimiter is present, prefer the first number in the text that
  # follows its last occurrence.
  if answer_delimiter in x:
    text_after_delimiter = x.split(answer_delimiter)[-1]
    numbers_after_delimiter = find_numbers(text_after_delimiter)
    if numbers_after_delimiter:
      return numbers_after_delimiter[0]

  # Otherwise (or when nothing numeric follows the delimiter) fall back to
  # the last number anywhere in the string.
  all_numbers = find_numbers(x)
  return all_numbers[-1] if all_numbers else ''

def maybe_remove_comma(x: str) -> str:
  """Strips every comma from `x`, e.g. '5,600' -> '5600'."""
  return ''.join(x.split(','))

In [5]:
def get_frugal_math_reasoning_results(attribution_method, model_name, dataset_name, threshold):
    """Print exact-match accuracy for one saved set of math-reasoning responses.

    Reads ``responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json``.
    For every record, a number is extracted from ``response`` (preferring text
    after the ``<OUTPUT>`` tag) and from ``answer`` (preferring text after
    ``####``); commas are removed and the two values are compared as floats.

    A record whose prediction or answer cannot be parsed as a float is reported
    on stdout and counted as incorrect, because the accuracy denominator is the
    total number of records. Nothing is returned.
    """
    with open(f'responses/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json') as data_file:
        d = json.load(data_file)
    predictions = [maybe_remove_comma(find_number(item['response'], "<OUTPUT>")) for item in d]
    answers = [maybe_remove_comma(find_number(item['answer'], "####")) for item in d]

    correct = 0
    for i, (prediction, answer) in enumerate(zip(predictions, answers)):
        try:
            is_match = float(prediction) == float(answer)
        except ValueError:
            # find_number returns '' when no number is found, and float('')
            # raises ValueError. Catch only that, so KeyboardInterrupt and
            # genuine bugs are no longer silently swallowed.
            print(f'Error at {i}: {predictions[i]} {answers[i]} {d[i]}')
            continue
        if is_match:
            correct += 1

    # An empty response file has no defined accuracy; report NaN instead of
    # raising ZeroDivisionError.
    accuracy = correct / len(predictions) if predictions else float('nan')
    print(f'Accuracy: {accuracy}')

In [39]:
# GSM8K accuracy for gpt3.5: both attribution methods at every threshold,
# in the same order as the individual calls this loop replaces.
for attribution_method in ('globenc', 'decompx'):
    for threshold in (80, 60, 50):
        get_frugal_math_reasoning_results(attribution_method, 'gpt3.5', 'gsm8k', threshold)

Accuracy: 0.498
Accuracy: 0.264
Error at 590:  60 {'question': 'Martha has been collecting shells since she turned 5 years old, every month she collects one shell. By her 10th birthday, how many shells will Martha have collected?', 'answer': 'In 12 months of the year, Martha collects 12 * 1 = <<12*1=12>>12 shells.\nFrom her 5th to her 10th birthday, 10 - 5 = <<10-5=5>>5 years have passed.\nDuring the 5 years that have passed, Martha has collected 12 * 5 = <<12*5=60>>60 shells.\n#### 60', 'frugal_text': 'been collec shells she 5 years old every collec shell birthd how shells will have collec', 'prompt': 'As an expert problem solver solve step by step the following mathematical question.\nAfter performing the correct set of steps for solving the problem, please fill in the <OUTPUT> tag with your numerical answer.\n\nQuestion:\n\nbeen collec shells she 5 years old every collec shell birthd how shells will have collec\nAnswer: <OUTPUT>', 'response': 'It appears that the question provided i

In [7]:
# GSM8K accuracy for llama3_70b: both attribution methods at every threshold,
# in the same order as the individual calls this loop replaces.
for attribution_method in ('globenc', 'decompx'):
    for threshold in (80, 60, 50):
        get_frugal_math_reasoning_results(attribution_method, 'llama3_70b', 'gsm8k', threshold)

Error at 8:  8 {'question': 'Joe has $50 to buy an outfit for his new field trip. There is a 30% off sale at the clothing store. The shirt he picks out has a price of $25. He also picks out a pair of shorts for $35. Assuming that sales tax is included, how much money will Joe have left after the purchase?', 'answer': 'The price of the two clothing items totals 35 + 25 = <<35+25=60>>60 dollars before discount.\nThe total sale discount comes to 60 * 0.3 = <<60*0.3=18>>18 dollars.\nThe final sale price comes to 60 - 18 = <<60-18=42>>42 dollars.\nJoe ends up with 50 - 42 = <<50-42=8>>8 dollars change.\n#### 8', 'frugal_text': 'Joe has $ 50 to buy an outfi for his new field trip There is 30 off at the clothin store The shirt he picks out has price of $ 25 also picks out pair of short for $ 35 Assumin that tax is include how much money will Joe have after the purchas', 'prompt': 'As an expert problem solver solve step by step the following mathematical question.\nAfter performing the correct

In [None]:
# GSM8K accuracy for llama3_8b: both attribution methods at every threshold,
# in the same order as the individual calls this loop replaces.
for attribution_method in ('globenc', 'decompx'):
    for threshold in (80, 60, 50):
        get_frugal_math_reasoning_results(attribution_method, 'llama3_8b', 'gsm8k', threshold)