In [84]:
import evaluate
from sklearn.metrics import classification_report
import json
import pandas as pd
import numpy as np

In [44]:
# Load the four text-generation metrics once, in a fixed order; they are used
# later by get_frugal_summarization_results via these module-level names.
bleu, rouge, meteor, bertscore = [
    evaluate.load(metric_id)
    for metric_id in ("bleu", "rouge", "meteor", "bertscore")
]

[nltk_data] Downloading package wordnet to /home/cse/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/cse/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/cse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Display names handed to classification_report(target_names=...) by
# get_frugal_icl_results.
# NOTE(review): presumably the sorted labels map to Negative then Positive
# (e.g. 0 -> Negative, 1 -> Positive) — confirm against the response files.
target_names = [
    'Negative',
    'Positive',
]

In [20]:
def get_frugal_icl_results(attribution_method, model_name, dataset_name, threshold, responses_dir='responses'):
    """Print a per-class classification report for one saved ICL response file.

    Reads
    ``{responses_dir}/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json``,
    loads it into a DataFrame and prints sklearn's ``classification_report``
    (4 decimal places) comparing the ``label`` column with the ``pred`` column.

    Args:
        attribution_method: Sub-directory of ``responses_dir`` (e.g. ``'globenc'``).
        model_name: Leading component of the file name (e.g. ``'llama3_8b'``).
        dataset_name: Second component of the file name (e.g. ``'imdb'``).
        threshold: Trailing component of the file name (e.g. ``80``).
        responses_dir: Root directory of the response files. Defaults to
            ``'responses'``, the value that used to be hard-coded.

    Returns:
        None. The report is printed.

    Raises:
        FileNotFoundError: If the response file does not exist.
        ValueError: If the response file contains no records.
    """
    path = f'{responses_dir}/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json'
    # Read as UTF-8 explicitly instead of relying on the locale-dependent
    # default encoding of open().
    with open(path, encoding='utf-8') as data_file:
        records = json.load(data_file)
    test_data = pd.DataFrame.from_dict(records)
    if test_data.empty:
        # Without this guard an empty file surfaces as an opaque KeyError: 'label'.
        raise ValueError(f'No records found in {path}')
    # NOTE(review): target_names is the notebook-level list; if the data holds
    # a different number of distinct classes than it has entries (e.g. an
    # unparsed prediction value), classification_report is expected to raise —
    # confirm how unparsed responses are encoded in `pred`.
    report = classification_report(
        test_data['label'].tolist(),
        test_data['pred'].tolist(),
        target_names=target_names,
        digits=4,
    )
    print(report)

In [21]:
# IMDB · llama3_8b · globenc · threshold 80
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9506    0.9315    0.9409       496
    Positive     0.9339    0.9524    0.9430       504

    accuracy                         0.9420      1000
   macro avg     0.9422    0.9419    0.9420      1000
weighted avg     0.9422    0.9420    0.9420      1000



In [22]:
# IMDB · llama3_8b · globenc · threshold 60
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.9601    0.9214    0.9403       496
    Positive     0.9256    0.9623    0.9436       504

    accuracy                         0.9420      1000
   macro avg     0.9428    0.9418    0.9420      1000
weighted avg     0.9427    0.9420    0.9420      1000



In [23]:
# IMDB · llama3_8b · globenc · threshold 50
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.9542    0.8831    0.9173       496
    Positive     0.8928    0.9583    0.9244       504

    accuracy                         0.9210      1000
   macro avg     0.9235    0.9207    0.9208      1000
weighted avg     0.9233    0.9210    0.9209      1000



In [24]:
# IMDB · llama3_8b · decompx · threshold 80
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9576    0.9113    0.9339       496
    Positive     0.9167    0.9603    0.9380       504

    accuracy                         0.9360      1000
   macro avg     0.9371    0.9358    0.9359      1000
weighted avg     0.9370    0.9360    0.9360      1000



In [25]:
# IMDB · llama3_8b · decompx · threshold 60
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.8822    0.9214    0.9014       496
    Positive     0.9191    0.8790    0.8986       504

    accuracy                         0.9000      1000
   macro avg     0.9007    0.9002    0.9000      1000
weighted avg     0.9008    0.9000    0.9000      1000



In [26]:
# IMDB · llama3_8b · decompx · threshold 50
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_8b',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.8333    0.9173    0.8733       496
    Positive     0.9097    0.8194    0.8622       504

    accuracy                         0.8680      1000
   macro avg     0.8715    0.8684    0.8678      1000
weighted avg     0.8718    0.8680    0.8677      1000



In [27]:
# IMDB · llama3_70b · globenc · threshold 80
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9606    0.9335    0.9468       496
    Positive     0.9363    0.9623    0.9491       504

    accuracy                         0.9480      1000
   macro avg     0.9484    0.9479    0.9480      1000
weighted avg     0.9483    0.9480    0.9480      1000



In [28]:
# IMDB · llama3_70b · globenc · threshold 60
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.9602    0.9234    0.9414       496
    Positive     0.9273    0.9623    0.9445       504

    accuracy                         0.9430      1000
   macro avg     0.9438    0.9428    0.9430      1000
weighted avg     0.9436    0.9430    0.9430      1000



In [29]:
# IMDB · llama3_70b · globenc · threshold 50
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.9559    0.9173    0.9362       496
    Positive     0.9218    0.9583    0.9397       504

    accuracy                         0.9380      1000
   macro avg     0.9388    0.9378    0.9380      1000
weighted avg     0.9387    0.9380    0.9380      1000



In [30]:
# IMDB · llama3_70b · decompx · threshold 80
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9645    0.9315    0.9477       496
    Positive     0.9347    0.9663    0.9502       504

    accuracy                         0.9490      1000
   macro avg     0.9496    0.9489    0.9490      1000
weighted avg     0.9495    0.9490    0.9490      1000



In [31]:
# IMDB · llama3_70b · decompx · threshold 60
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.8389    0.9657    0.8978       496
    Positive     0.9604    0.8175    0.8832       504

    accuracy                         0.8910      1000
   macro avg     0.8996    0.8916    0.8905      1000
weighted avg     0.9001    0.8910    0.8904      1000



In [32]:
# IMDB · llama3_70b · decompx · threshold 50
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='llama3_70b',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.7680    0.9677    0.8564       496
    Positive     0.9573    0.7123    0.8168       504

    accuracy                         0.8390      1000
   macro avg     0.8627    0.8400    0.8366      1000
weighted avg     0.8634    0.8390    0.8364      1000



In [34]:
# IMDB · gpt3.5 · globenc · threshold 80
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9249    0.9677    0.9458       496
    Positive     0.9667    0.9226    0.9442       504

    accuracy                         0.9450      1000
   macro avg     0.9458    0.9452    0.9450      1000
weighted avg     0.9460    0.9450    0.9450      1000



In [35]:
# IMDB · gpt3.5 · globenc · threshold 60
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.8905    0.9677    0.9275       496
    Positive     0.9653    0.8829    0.9223       504

    accuracy                         0.9250      1000
   macro avg     0.9279    0.9253    0.9249      1000
weighted avg     0.9282    0.9250    0.9249      1000



In [36]:
# IMDB · gpt3.5 · globenc · threshold 50
get_frugal_icl_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.8805    0.9657    0.9212       496
    Positive     0.9627    0.8710    0.9146       504

    accuracy                         0.9180      1000
   macro avg     0.9216    0.9184    0.9179      1000
weighted avg     0.9219    0.9180    0.9178      1000



In [37]:
# IMDB · gpt3.5 · decompx · threshold 80
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=80,
)

              precision    recall  f1-score   support

    Negative     0.9179    0.9698    0.9431       496
    Positive     0.9685    0.9147    0.9408       504

    accuracy                         0.9420      1000
   macro avg     0.9432    0.9422    0.9420      1000
weighted avg     0.9434    0.9420    0.9420      1000



In [38]:
# IMDB · gpt3.5 · decompx · threshold 60
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=60,
)

              precision    recall  f1-score   support

    Negative     0.6444    0.9899    0.7806       496
    Positive     0.9790    0.4623    0.6280       504

    accuracy                         0.7240      1000
   macro avg     0.8117    0.7261    0.7043      1000
weighted avg     0.8130    0.7240    0.7037      1000



In [39]:
# IMDB · gpt3.5 · decompx · threshold 50
get_frugal_icl_results(
    attribution_method='decompx',
    model_name='gpt3.5',
    dataset_name='imdb',
    threshold=50,
)

              precision    recall  f1-score   support

    Negative     0.5816    0.9919    0.7332       496
    Positive     0.9740    0.2976    0.4559       504

    accuracy                         0.6420      1000
   macro avg     0.7778    0.6448    0.5946      1000
weighted avg     0.7794    0.6420    0.5935      1000



In [85]:
def get_frugal_summarization_results(attribution_method, model_name, dataset_name, threshold, responses_dir='responses'):
    """Print BLEU, ROUGE, METEOR and mean BERTScore-F1 for one saved response file.

    Reads
    ``{responses_dir}/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json``
    and scores each record's ``response`` against ``prediction[0]['text']``
    using the notebook-level ``bleu``, ``rouge``, ``meteor`` and ``bertscore``
    metric objects.

    Args:
        attribution_method: Sub-directory of ``responses_dir`` (e.g. ``'globenc'``).
        model_name: Leading component of the file name (e.g. ``'llama3_70b'``).
        dataset_name: Second component of the file name (e.g. ``'argilla_news'``).
        threshold: Trailing component of the file name (e.g. ``80``).
        responses_dir: Root directory of the response files. Defaults to
            ``'responses'``, the value that used to be hard-coded.

    Returns:
        None. The four scores are printed, one per line.

    Raises:
        FileNotFoundError: If the response file does not exist.
        ValueError: If the response file contains no records.
    """
    path = f'{responses_dir}/{attribution_method}/{model_name}_{dataset_name}_test_{threshold}.json'
    # The file holds free text; read it as UTF-8 explicitly rather than with
    # the locale-dependent default encoding of open().
    with open(path, encoding='utf-8') as data_file:
        records = json.load(data_file)
    if not records:
        # Fail early with a clear message instead of passing empty lists to
        # the metric backends and np.mean.
        raise ValueError(f'No records found in {path}')

    # Model output to be scored.
    predictions = [item['response'] for item in records]
    # Despite the key name, the first entry of 'prediction' is used as the
    # reference text. NOTE(review): presumably this is the dataset's own
    # summary field — confirm against the dataset schema.
    references = [item['prediction'][0]['text'] for item in records]

    bleu_scores = bleu.compute(predictions=predictions, references=references)
    print(f'BLEU: {bleu_scores["bleu"]}')

    rouge_scores = rouge.compute(predictions=predictions, references=references)
    print(f'ROUGE: {rouge_scores}')

    meteor_scores = meteor.compute(predictions=predictions, references=references)
    print(f'METEOR: {meteor_scores}')

    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")
    # bertscore yields one F1 per example; report their mean.
    print(f'BERTScore: {np.mean(bert_scores["f1"])}')

In [86]:
# argilla_news · llama3_70b · globenc · threshold 80
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='argilla_news',
    threshold=80,
)

BLEU: 0.017948128546239235
ROUGE: {'rouge1': 0.23460841670210178, 'rouge2': 0.07089085579414067, 'rougeL': 0.19483147538634615, 'rougeLsum': 0.19498269339060065}
METEOR: {'meteor': 0.33059439984150774}
BERTScore: 0.8745496688485146


In [90]:
# argilla_news · llama3_70b · globenc · threshold 60
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='argilla_news',
    threshold=60,
)

BLEU: 0.0171869111462639
ROUGE: {'rouge1': 0.23122188902553448, 'rouge2': 0.06798170379120627, 'rougeL': 0.1917590229874923, 'rougeLsum': 0.19184000273533047}
METEOR: {'meteor': 0.3285932134564875}
BERTScore: 0.8740010212063789


In [93]:
# argilla_news · llama3_70b · globenc · threshold 50
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_70b',
    dataset_name='argilla_news',
    threshold=50,
)

BLEU: 0.016155157665230038
ROUGE: {'rouge1': 0.22746207620655548, 'rouge2': 0.06508681481109056, 'rougeL': 0.18844514939557572, 'rougeLsum': 0.18849463270903832}
METEOR: {'meteor': 0.3162967357635006}
BERTScore: 0.872528976559639




In [87]:
# argilla_news · gpt3.5 · globenc · threshold 80
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='argilla_news',
    threshold=80,
)

BLEU: 0.01699929231021046
ROUGE: {'rouge1': 0.17615889526871475, 'rouge2': 0.05637679577244342, 'rougeL': 0.14570444619069328, 'rougeLsum': 0.14582419333981983}
METEOR: {'meteor': 0.28478751198560026}
BERTScore: 0.874255013525486


In [88]:
# argilla_news · gpt3.5 · globenc · threshold 60
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='argilla_news',
    threshold=60,
)

BLEU: 0.014755862361676258
ROUGE: {'rouge1': 0.17148573101135378, 'rouge2': 0.0517679593492343, 'rougeL': 0.14126719061990425, 'rougeLsum': 0.14128668591279725}
METEOR: {'meteor': 0.27779224766838884}
BERTScore: 0.8735281746387482


In [89]:
# argilla_news · gpt3.5 · globenc · threshold 50
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='gpt3.5',
    dataset_name='argilla_news',
    threshold=50,
)

BLEU: 0.013739119923559216
ROUGE: {'rouge1': 0.16994241736125992, 'rouge2': 0.04849352952612479, 'rougeL': 0.1400449179745582, 'rougeLsum': 0.1400992906823087}
METEOR: {'meteor': 0.2679574566874844}
BERTScore: 0.8727953990697861


In [95]:
# argilla_news · llama3_8b · globenc · threshold 80
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='argilla_news',
    threshold=80,
)

BLEU: 0.017142710198672913
ROUGE: {'rouge1': 0.2261160310425459, 'rouge2': 0.06829044832393733, 'rougeL': 0.18852998228676832, 'rougeLsum': 0.18861921918599842}
METEOR: {'meteor': 0.32967423971079013}
BERTScore: 0.8711479386091232


In [96]:
# argilla_news · llama3_8b · globenc · threshold 60
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='argilla_news',
    threshold=60,
)

BLEU: 0.015491487040759819
ROUGE: {'rouge1': 0.21968992201519322, 'rouge2': 0.06365783971337301, 'rougeL': 0.1811383545251049, 'rougeLsum': 0.1810604074314962}
METEOR: {'meteor': 0.3189375763283737}
BERTScore: 0.8698343048095704


In [97]:
# argilla_news · llama3_8b · globenc · threshold 50
get_frugal_summarization_results(
    attribution_method='globenc',
    model_name='llama3_8b',
    dataset_name='argilla_news',
    threshold=50,
)

BLEU: 0.013161140620249413
ROUGE: {'rouge1': 0.2128560727769499, 'rouge2': 0.05693697827381397, 'rougeL': 0.1751515675146502, 'rougeLsum': 0.17522484899493818}
METEOR: {'meteor': 0.3032773810524129}
BERTScore: 0.8681045891046524
