#### Notebook to compute the correlation between the `factual_consistency` metric outputs and human-annotated consistency scores on the QAGS-XSUM and QAGS-CNN benchmark datasets

In [1]:
# Load the benchmark datasets
import json

# These files were copied from the UniEval repo
# (https://github.com/maszhongming/UniEval/tree/main/reproduce/data/fact), which
# is a modified version of the dataset from https://github.com/W4ngatang/qags.
qags_xsum_path = 'data/qags_xsum.json'
qags_cnndm_path = 'data/qags_cnndm.json'

# Open the files with an explicit UTF-8 encoding: without it `open` falls back
# to the platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# characters in the JSON text on systems whose default is not UTF-8.
# `json.load` parses straight from the file object.
with open(qags_xsum_path, encoding='utf-8') as f:
    qags_xsum_data = json.load(f)
with open(qags_cnndm_path, encoding='utf-8') as f:
    qags_cnndm_data = json.load(f)

# Quick sanity check on how many examples each benchmark contains
print(f'QAGS-XSUM has {len(qags_xsum_data)} data points')
print(f'QAGS-CNN has {len(qags_cnndm_data)} data points')

QAGS-XSUM has 239 data points
QAGS-CNN has 235 data points


In [2]:
# Extract the generated outputs, sources, and human annotated scores
def _split_fields(dataset):
    '''Return three parallel lists built from `dataset`: the system outputs,
    the source documents, and the human consistency scores.'''
    generated_outputs = [entry['system_output'] for entry in dataset]
    sources = [entry['source'] for entry in dataset]
    scores = [entry['scores']['consistency'] for entry in dataset]
    return generated_outputs, sources, scores


(qags_xsum_generated_outputs,
 qags_xsum_sources,
 qags_xsum_scores) = _split_fields(qags_xsum_data)

(qags_cnndm_generated_outputs,
 qags_cnndm_sources,
 qags_cnndm_scores) = _split_fields(qags_cnndm_data)


In [3]:
from scipy.stats import spearmanr, pearsonr, kendalltau

def compute_correlation_values(result, annotated_scores):
    '''Print the Pearson, Spearman and Kendall-Tau correlations between the
    metric scores held by `result` and the human `annotated_scores`.

    `result` must provide `to_df()` (a DataFrame with a `metric_value` column)
    and a `metric_values` sequence; `annotated_scores` is indexed by the same
    positions. Nothing is returned; the three values are written to stdout.
    '''
    # The evaluator may return `None` for some data points (for example when
    # the prompt triggers Azure OpenAI's content filter). Those positions are
    # dropped from both series before correlating.
    scores_df = result.to_df()
    has_score = scores_df['metric_value'].notna()
    kept_positions = scores_df.index[has_score].tolist()

    metric_series = []
    human_series = []
    for position in kept_positions:
        metric_series.append(result.metric_values[position])
        human_series.append(annotated_scores[position])

    # Each scipy call returns (statistic, p-value); only the statistic is used.
    pearson_corr, _ = pearsonr(metric_series, human_series)
    spearman_corr, _ = spearmanr(metric_series, human_series)
    kendalltau_corr, _ = kendalltau(metric_series, human_series)

    print(f'Pearson correlation = {pearson_corr}')
    print(f'Spearman correlation = {spearman_corr}')
    print(f'Kendall-Tau correlation = {kendalltau_corr}')

In [19]:
# Compute the factual consistency scores on QAGS-XSUM using the local (UniEval)
# model option and measure various correlations with the human-annotated scores.
#
# Depends on earlier cells: `qags_xsum_generated_outputs`, `qags_xsum_sources`
# and `qags_xsum_scores` come from the extraction cell, and
# `compute_correlation_values` from the cell that defines it.
from langcheck.metrics import factual_consistency

# No `eval_model` argument is passed, so the metric's default evaluator is used.
# NOTE: `result` is reassigned by every evaluation cell in this notebook.
result = factual_consistency(qags_xsum_generated_outputs, qags_xsum_sources)
compute_correlation_values(result, qags_xsum_scores)

# Log of a previous run, kept for comparison with fresh output.
# RUN-DATE: 2023-10-20
# Resulting correlation values:
#   Pearson correlation = 0.46449467052608684
#   Spearman correlation = 0.48161063910384716
#   Kendall-Tau correlation = 0.39405524553574556

Pearson correlation = 0.46449467052608684
Spearman correlation = 0.48161063910384716
Kendall-Tau correlation = 0.39405524553574556


In [None]:
# Compute the factual consistency scores on QAGS-XSUM using the Azure OpenAI
# (gpt-35-turbo, i.e. gpt-3.5-turbo) model option and measure various
# correlations with the human-annotated scores.
#
# Depends on earlier cells: `qags_xsum_generated_outputs`, `qags_xsum_sources`
# and `qags_xsum_scores` come from the extraction cell, and
# `compute_correlation_values` from the cell that defines it.
from langcheck.metrics import factual_consistency
import os
from langcheck.metrics.eval_clients import AzureOpenAIEvalClient

# Replace the placeholder strings below with real values before running. These
# assignments overwrite any values already present in the process environment.
os.environ["AZURE_OPENAI_KEY"] = 'YOUR_AZURE_OPENAI_KEY'
os.environ["OPENAI_API_VERSION"] = 'YOUR_OPENAI_API_VERSION'
os.environ["AZURE_OPENAI_ENDPOINT"] = 'YOUR_AZURE_OPENAI_ENDPOINT'
# 'YOUR_DEPLOYMENT_NAME' is likewise a placeholder for the deployment to use.
client = AzureOpenAIEvalClient(text_model_name='YOUR_DEPLOYMENT_NAME')
# NOTE: `result` is reassigned by every evaluation cell in this notebook.
result = factual_consistency(qags_xsum_generated_outputs,
                             qags_xsum_sources,
                             eval_model=client)
compute_correlation_values(result, qags_xsum_scores)

# Logs of previous runs, kept for comparison with fresh output.
# `compute_correlation_values` skips data points without a metric value, which
# is why each run below covers fewer than the 239 QAGS-XSUM examples.
# NOTE: the values differ noticeably between runs even though the recorded
# deployment details are identical.

# RUN-DATE: 2023-10-20
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 234 examples, since Azure's content filter rejected 5 prompts)
#   Pearson correlation = 0.31336126510584367
#   Spearman correlation = 0.3170456340335508
#   Kendall-Tau correlation = 0.3060538476722336

# RUN-DATE: 2023-10-27
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 232 examples, since Azure's content filter rejected 7 prompts)
#   Pearson correlation = 0.3989261501556993
#   Spearman correlation = 0.3968648469619776
#   Kendall-Tau correlation = 0.3794472710898086

# RUN-DATE: 2023-12-05
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 232 examples, since Azure's content filter rejected 7 prompts)
#   Pearson correlation = 0.4854067489664019
#   Spearman correlation = 0.4981601825844081
#   Kendall-Tau correlation = 0.47732669579571085

In [4]:
# Compute the factual consistency scores on QAGS-CNN using the local (UniEval)
# model option and measure various correlations with the human-annotated scores.
#
# Depends on earlier cells: `qags_cnndm_generated_outputs`,
# `qags_cnndm_sources` and `qags_cnndm_scores` come from the extraction cell,
# and `compute_correlation_values` from the cell that defines it.
from langcheck.metrics import factual_consistency

# No `eval_model` argument is passed, so the metric's default evaluator is used.
# NOTE: `result` is reassigned by every evaluation cell in this notebook.
result = factual_consistency(qags_cnndm_generated_outputs, qags_cnndm_sources)
compute_correlation_values(result, qags_cnndm_scores)

# Log of a previous run, kept for comparison with fresh output.
# RUN-DATE: 2023-10-20
# Resulting correlation values:
#   Pearson correlation = 0.6582265674108541
#   Spearman correlation = 0.6329252669621304
#   Kendall-Tau correlation = 0.5064287387727447

  from .autonotebook import tqdm as notebook_tqdm


Pearson correlation = 0.6582265674108541
Spearman correlation = 0.6329252669621304
Kendall-Tau correlation = 0.5064287387727447


In [None]:
# Compute the factual consistency scores on QAGS-CNN using the Azure OpenAI
# (gpt-35-turbo, i.e. gpt-3.5-turbo) model option and measure various
# correlations with the human-annotated scores.
#
# Depends on earlier cells: `qags_cnndm_generated_outputs`,
# `qags_cnndm_sources` and `qags_cnndm_scores` come from the extraction cell,
# and `compute_correlation_values` from the cell that defines it.
from langcheck.metrics import factual_consistency
import os
from langcheck.metrics.eval_clients import AzureOpenAIEvalClient

# Replace the placeholder strings below with real values before running. These
# assignments overwrite any values already present in the process environment.
os.environ["AZURE_OPENAI_KEY"] = 'YOUR_AZURE_OPENAI_KEY'
os.environ["OPENAI_API_VERSION"] = 'YOUR_OPENAI_API_VERSION'
os.environ["AZURE_OPENAI_ENDPOINT"] = 'YOUR_AZURE_OPENAI_ENDPOINT'
# 'YOUR_DEPLOYMENT_NAME' is likewise a placeholder for the deployment to use.
client = AzureOpenAIEvalClient(text_model_name='YOUR_DEPLOYMENT_NAME')

# NOTE: `result` is reassigned by every evaluation cell in this notebook.
result = factual_consistency(qags_cnndm_generated_outputs,
                             qags_cnndm_sources,
                             eval_model=client)
compute_correlation_values(result, qags_cnndm_scores)

# Logs of previous runs, kept for comparison with fresh output.
# `compute_correlation_values` skips data points without a metric value, which
# is why each run below covers fewer than the 235 QAGS-CNN examples.
# NOTE: the values differ noticeably between runs even though the recorded
# deployment details are identical.

# RUN-DATE: 2023-10-20
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 217 examples, since Azure's content filter rejected 18 prompts)
#   Pearson correlation = 0.41706624916880464
#   Spearman correlation = 0.37161022292902374
#   Kendall-Tau correlation = 0.31784727756463294

# RUN-DATE: 2023-10-27
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 210 examples, since Azure's content filter rejected 25 prompts)
#   Pearson correlation = 0.6694263995180044
#   Spearman correlation = 0.592642518527631
#   Kendall-Tau correlation = 0.5244031673150222

# RUN-DATE: 2023-12-05
# Azure OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# - API version: 2023-07-01-preview
# Resulting correlation values:
#   (Computed on 218 examples, since Azure's content filter rejected 17 prompts)
#   Pearson correlation = 0.4967616460898673
#   Spearman correlation = 0.45082039363375137
#   Kendall-Tau correlation = 0.41909197655671315