In [1]:
!pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets

In [2]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

pd.set_option("display.max_colwidth", None)

In [5]:
from huggingface_hub import InferenceClient


repo_id = "human-centered-summarization/financial-summarization-pegasus"

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)


def call_llm(inference_client: InferenceClient, prompt: str):
    response = inference_client.post(
        json={
            "inputs": prompt,
            "parameters": {"max_new_tokens": 1000},
            "task": "text-generation",
        },
    )
    return json.loads(response.decode())[0]["summary_text"]


call_llm(llm_client, "This is a test context")

'Watch Watch this video to find out what happens when a test fails.'

In [7]:
from datasets import load_dataset
import evaluate

In [8]:
dataset = load_dataset("ccdv/arxiv-summarization")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
train_dataset = load_dataset("ccdv/arxiv-summarization", split="train")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [15]:
train_dataset.features["article"]

Value(dtype='string', id=None)

In [19]:
import evaluate

In [22]:
rouge = evaluate.load('rouge')
bert = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [34]:
summarization_metrics = []
for i, data in enumerate(train_dataset):
    summarization_score = {}
    article = data["article"]
    if "revenue" not in article:
        continue
    abstract = data["abstract"]
    llm_response = call_llm(llm_client, article)
    summarization_score["context"] = article
    summarization_score["groundtruth_summary"] = abstract
    summarization_score["llm_summary"] = llm_response
    bert_results = bert.compute(predictions=[llm_response], references=[abstract], lang="en")
    rouge_results = rouge.compute(predictions=[llm_response], references=[abstract])
    summarization_score["rouge_score_1"] = rouge_results["rouge1"] 
    summarization_score["rouge_score_2"] = rouge_results["rouge2"]
    summarization_score["rouge_score_L"] = rouge_results["rougeL"]
    summarization_score["rouge_score_Lsum"] = rouge_results["rougeLsum"]
    summarization_score["precision"] = bert_results["precision"][0]
    summarization_score["recall"] = bert_results["recall"][0]
    summarization_score["f1"] = bert_results["f1"][0] 
    summarization_metrics.append(summarization_score)
    if i == 2:
        break


In [35]:
summarization_df = pd.DataFrame(summarization_metrics)

In [36]:
summarization_df.head()

In [32]:
summarization_df.to_csv("summarization_results.csv", index=False)

In [None]:
groundtruth_critique_prompt = """You will be given an answer and a gound truth.
Your task is to provide a 'total rating' scoring how well does the llm prediction match the ground truth.
Give your answer on a scale of 0 to 1, where 0 means that the llm prediction has no resemblance to ground truth, and 1 means that the llm prediction exactly matches the ground truth.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 10)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the llm prediction and ground truth.

llm prediction: {llm_prediction}\n
ground truth: {ground_truth}\n
Answer::: """

In [None]:
df.columns

Index(['Unnamed: 0', 'input', 'extracted_answer', 'ground_truth', 'relevance',
       'Ground Truth'],
      dtype='object')

In [None]:
df['groundtruth_score']=""
df['groundtruth_eval']=""

In [None]:
for i in range(len(df)):
  evaluation=call_llm(
            llm_client,
            groundtruth_critique_prompt.format(llm_prediction=df['extracted_answer'][i],ground_truth=df['ground_truth'][i]),
        )
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
                float(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [None]:
for i in range(len(df)):
  evaluation=call_llm(
            llm_client,
            groundtruth_critique_prompt.format(llm_prediction=df['extracted_answer'][i],ground_truth=df['ground_truth'][i]),
        )
  try:
    df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
  except Exception as e:
        continue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['groundtruth_score'][i], df['groundtruth_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [None]:
df

Unnamed: 0.1,Unnamed: 0,input,extracted_answer,ground_truth,relevance,Ground Truth,groundtruth_score,groundtruth_eval
0,0,"""Why did total deposits decrease?\n""","Total deposits decreased due to consumer deposit outflows on consumer spending and customer migration to higher yielding alternatives.""","'Total deposits decreased due to customer migration to higher yielding alternatives, partially offset by additions of deposits from new and existing customers.'",0.8,0.9,7.0,"The llm prediction correctly identifies the decrease in total deposits and the reason for this decrease due to customer migration to higher yielding alternatives. However, it fails to mention the offsetting factor of additions of deposits from new and existing customers, which is mentioned in the ground truth.\n"
1,1,"""What is the net income for the quarter ended June 30, 2022?\n""","The net income for the quarter ended June 30, 2022, is not provided in the given context. The provided context only includes financial information for the quarters ended September 30, 2022, and June 30, 2023, as well as the six months ended June 30, 2022, and 2023.""","'1,393'",0.2,1.0,3.0,"The llm prediction correctly identifies that the net income for the quarter ended June 30, 2022, is not provided in the given context. However, it does not provide any information about the ground truth value of '1,393'.\n"
2,2,"""What was the amount of TDRs at December 31, 2022?\n""","At December 31, 2022, the amount of TDRs was $9.2 billion.""","'TDRs totaled $9.2 billion at December 31, 2022.'",1.0,1.0,9.0,"The llm prediction and ground truth are identical in meaning, and the wording is very similar. The only difference is that the llm prediction uses the phrase 'the amount of TDRs' while the ground truth uses the phrase 'TDRs totaled'.\n"
3,3,"""What is the location of the Company's consolidated financial statements?\n""","The location of the Company's consolidated financial statements can be found in Part I, Financial Information, of the Forms 10-Q and 10-K filed with the SEC. Specifically, the statements can be found under Item 1, Financial Statements, on pages 56-60 of Form 10-Q and pages 58-63 of Form 10-K for the quarter ended March 31, 2023. Alternatively, the statements can also be found in the Company's Annual Report to Shareholders under \""Financial Review\"" and \""Financial Statements.\""""","""The Company's consolidated financial statements are set forth in the 2023 Annual Report to Shareholders.""",0.8,1.0,2.0,"The llm prediction is not accurate as it points to the wrong location of the consolidated financial statements. The ground truth clearly states that the consolidated financial statements are in the 2023 Annual Report to Shareholders, while the llm prediction points to the Forms 10-Q and 10-K filed with the SEC.\n"
4,4,"""How much did investment banking fees increase in the third quarter of 2023 compared to the third quarter of 2022?\n""","Investment banking fees increased by an amount that is not specified in the given context. However, it is mentioned that the increase was due to increased activity across all products in the third quarter of 2023 compared to the third quarter of 2022.""",'Investment banking fees increased by 117 million dollars in the third quarter of 2023 compared to the third quarter of 2022.',0.8,0.8,5.0,"The llm prediction is partially correct in that it identifies the increase in investment banking fees and the time period of the increase. However, it does not provide a specific amount for the increase, which is a key piece of information in the ground truth.\n"
5,5,"""Which court is the interchange litigation consolidated in?\n""","The interchange litigation is consolidated in the United States District Court for the Eastern District of New York.""",'The United States District Court for the Eastern District of New York',0.9,1.0,9.0,"The prediction is very close to the ground truth, but not exactly the same.\n\n"
6,6,"""What is the net interest income for the quarter ended March 31, 2022?\n""","The net interest income for the quarter ended March 31, 2022, is $ 12,098.""","'The net interest income for the quarter ended March 31, 2022 is $5,996.'",1.0,0.8,2.0,"The llm prediction is off by $6,102, which is a significant difference.\n"
7,7,"""What is the outstanding balance of auto loans with a FICO score of 740 or higher as of September 30, 2023?\n""","The outstanding balance of auto loans with a FICO score of 740 or higher as of September 30, 2023, is $25,194 million, according to Table 5.11 provided in the context.""","'$8,003 million'",0.9,1.0,2.0,"The llm prediction is off by a large margin from the ground truth, with the llm prediction being $17,191 million higher than the ground truth.\n"
8,8,"""What is the stress capital buffer for the period October 1, 2022, through September 30, 2023?\n""","The stress capital buffer for the period October 1, 2022, through September 30, 2023, is 3.20%. The stress capital buffer for the period October 1, 2023, through September 30, 2024, is 2.90%.""",'3.20%',1.0,1.0,10.0,"The llm prediction matches the ground truth for the stress capital buffer for the period October 1, 2022, through September 30, 2023.\n"
9,9,"""What is the average age of the rail cars?\n""","The average age of the rail cars is 22 years.""",'The average age of the rail cars is 22 years.',1.0,1.0,10.0,The llm prediction matches the ground truth exactly.\n


In [None]:
relevance_critique_prompt = """You will be given a question and a llm reply.
Your task is to provide a 'total rating' scoring how well does the llm reply addresses the question being asked.
Give your answer on a scale of 1 to 10, where 1 means that the llm reply  does not answer the question at all, and 10 means that the llm reply exactly answers the question being asked.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 10)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the llm prediction and ground truth.

llm reply: {llm_reply}\n
question: {question}\n
Answer::: """

In [None]:
df['relevance_score']=""
df['relevance_eval']=""

In [None]:
df['input'][0]

'"Why did total deposits decrease?\\n"'

In [None]:
for i in range(len(df)):
  evaluation=call_llm(
            llm_client,
            relevance_critique_prompt.format(llm_reply=df['extracted_answer'][i],question=df['input'][i]),
        )
  try:
    df['relevance_score'][i], df['relevance_eval'][i] = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
  except Exception as e:
        continue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['relevance_score'][i], df['relevance_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['relevance_score'][i], df['relevance_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['relevance_score'][i], df['relevance_eval'][i] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['re

In [None]:
df

Unnamed: 0.1,Unnamed: 0,input,extracted_answer,ground_truth,relevance,Ground Truth,groundtruth_score,groundtruth_eval,relevance_score,relevance_eval
0,0,"""Why did total deposits decrease?\n""","Total deposits decreased due to consumer deposit outflows on consumer spending and customer migration to higher yielding alternatives.""","'Total deposits decreased due to customer migration to higher yielding alternatives, partially offset by additions of deposits from new and existing customers.'",0.8,0.9,7.0,"The llm prediction correctly identifies the decrease in total deposits and the reason for this decrease due to customer migration to higher yielding alternatives. However, it fails to mention the offsetting factor of additions of deposits from new and existing customers, which is mentioned in the ground truth.\n",9.0,"The llm reply provides a clear and concise answer to the question, citing consumer deposit outflows on consumer spending and customer migration to higher yielding alternatives as the reasons for the decrease in total deposits.\n"
1,1,"""What is the net income for the quarter ended June 30, 2022?\n""","The net income for the quarter ended June 30, 2022, is not provided in the given context. The provided context only includes financial information for the quarters ended September 30, 2022, and June 30, 2023, as well as the six months ended June 30, 2022, and 2023.""","'1,393'",0.2,1.0,3.0,"The llm prediction correctly identifies that the net income for the quarter ended June 30, 2022, is not provided in the given context. However, it does not provide any information about the ground truth value of '1,393'.\n",7.0,"The llm reply correctly identifies that the requested information is not provided in the given context. It does not provide the requested information, but it does explain why it cannot provide the information.\n"
2,2,"""What was the amount of TDRs at December 31, 2022?\n""","At December 31, 2022, the amount of TDRs was $9.2 billion.""","'TDRs totaled $9.2 billion at December 31, 2022.'",1.0,1.0,9.0,"The llm prediction and ground truth are identical in meaning, and the wording is very similar. The only difference is that the llm prediction uses the phrase 'the amount of TDRs' while the ground truth uses the phrase 'TDRs totaled'.\n",10.0,"The llm reply directly answers the question being asked, providing the exact amount of TDRs at December 31, 2022.\n"
3,3,"""What is the location of the Company's consolidated financial statements?\n""","The location of the Company's consolidated financial statements can be found in Part I, Financial Information, of the Forms 10-Q and 10-K filed with the SEC. Specifically, the statements can be found under Item 1, Financial Statements, on pages 56-60 of Form 10-Q and pages 58-63 of Form 10-K for the quarter ended March 31, 2023. Alternatively, the statements can also be found in the Company's Annual Report to Shareholders under \""Financial Review\"" and \""Financial Statements.\""""","""The Company's consolidated financial statements are set forth in the 2023 Annual Report to Shareholders.""",0.8,1.0,2.0,"The llm prediction is not accurate as it points to the wrong location of the consolidated financial statements. The ground truth clearly states that the consolidated financial statements are in the 2023 Annual Report to Shareholders, while the llm prediction points to the Forms 10-Q and 10-K filed with the SEC.\n",10.0,"The llm reply provides a clear and specific answer to the question, giving the exact location of the Company's consolidated financial statements in the Forms 10-Q and 10-K filed with the SEC. The reply also provides alternative locations for the statements in the Company's Annual Report to Shareholders.\n"
4,4,"""How much did investment banking fees increase in the third quarter of 2023 compared to the third quarter of 2022?\n""","Investment banking fees increased by an amount that is not specified in the given context. However, it is mentioned that the increase was due to increased activity across all products in the third quarter of 2023 compared to the third quarter of 2022.""",'Investment banking fees increased by 117 million dollars in the third quarter of 2023 compared to the third quarter of 2022.',0.8,0.8,5.0,"The llm prediction is partially correct in that it identifies the increase in investment banking fees and the time period of the increase. However, it does not provide a specific amount for the increase, which is a key piece of information in the ground truth.\n",1.0,"The llm reply does not provide a specific amount for the increase in investment banking fees, so it does not answer the question being asked.\n"
5,5,"""Which court is the interchange litigation consolidated in?\n""","The interchange litigation is consolidated in the United States District Court for the Eastern District of New York.""",'The United States District Court for the Eastern District of New York',0.9,1.0,9.0,"The prediction is very close to the ground truth, but not exactly the same.\n\n",10.0,"The llm reply provides the exact name of the court where the interchange litigation is consolidated, which is the United States District Court for the Eastern District of New York.\n"
6,6,"""What is the net interest income for the quarter ended March 31, 2022?\n""","The net interest income for the quarter ended March 31, 2022, is $ 12,098.""","'The net interest income for the quarter ended March 31, 2022 is $5,996.'",1.0,0.8,2.0,"The llm prediction is off by $6,102, which is a significant difference.\n",10.0,"The llm reply provides the exact value of the net interest income for the quarter ended March 31, 2022, which is $12,098. Therefore, the llm reply answers the question being asked.\n"
7,7,"""What is the outstanding balance of auto loans with a FICO score of 740 or higher as of September 30, 2023?\n""","The outstanding balance of auto loans with a FICO score of 740 or higher as of September 30, 2023, is $25,194 million, according to Table 5.11 provided in the context.""","'$8,003 million'",0.9,1.0,2.0,"The llm prediction is off by a large margin from the ground truth, with the llm prediction being $17,191 million higher than the ground truth.\n",10.0,"The llm reply provides the exact information that was asked for in the question, with a specific date and FICO score range. The answer is clear and easy to understand.\n"
8,8,"""What is the stress capital buffer for the period October 1, 2022, through September 30, 2023?\n""","The stress capital buffer for the period October 1, 2022, through September 30, 2023, is 3.20%. The stress capital buffer for the period October 1, 2023, through September 30, 2024, is 2.90%.""",'3.20%',1.0,1.0,10.0,"The llm prediction matches the ground truth for the stress capital buffer for the period October 1, 2022, through September 30, 2023.\n",10.0,"The llm reply provides the stress capital buffer for the period October 1, 2022, through September 30, 2023, which is 3.20%. The answer is clear and specific, and it directly addresses the question being asked.\n"
9,9,"""What is the average age of the rail cars?\n""","The average age of the rail cars is 22 years.""",'The average age of the rail cars is 22 years.',1.0,1.0,10.0,The llm prediction matches the ground truth exactly.\n,10.0,"The llm reply provides a clear and direct answer to the question, which is the average age of the rail cars. The answer is accurate and relevant to the question.\n"


In [None]:
df.to_excel("score_check_3.xlsx")