# Popular title prediction comparison metrics

In [1]:
from pandas import read_csv
import evaluate

rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
meteor = evaluate.load('meteor')

2024-04-24 14:29:28.003226: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-04-24 14:29:29,909] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[nltk_data] Downloading package wordnet to /home/bashlab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/bashlab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/bashlab/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
def remove_start_end(data):
    """Strip the "start"/"end" marker words from a collection of titles.

    The first title decides for the whole collection: when its first word is
    "start" and its last word is "end", the first and last word of every
    title are dropped. Otherwise ``data`` is returned untouched.

    Args:
        data: Iterable of title strings (e.g. a list or a pandas Series).

    Returns:
        A new list of stripped titles when the markers were detected,
        otherwise the original ``data`` object.
    """
    # Read the first title by position. ``data[0]`` is a label lookup on a
    # pandas Series and fails when the index has no label 0 (e.g. after
    # sampling or filtering); it also fails on empty input.
    first_title = next(iter(data), None)
    if first_title is None:
        return data

    words = first_title.split()
    # An empty/blank first title has no words to compare against the markers.
    if not words or words[0] != "start" or words[-1] != "end":
        return data

    print("Removing start and end")
    return [" ".join(sample.split()[1:-1]) for sample in data]

In [23]:
def compute_metrics(filepath, encoding='cp1252'):
    """Print ROUGE, BLEU and METEOR scores for a CSV of title predictions.

    Args:
        filepath: CSV file with "True Title" and "Predicted Title" columns.
        encoding: Text encoding used to read the CSV.

    The three score dictionaries are printed in the order ROUGE, BLEU,
    METEOR; nothing is returned.
    """
    table = read_csv(filepath, encoding=encoding)
    true_titles = remove_start_end(table["True Title"])
    predicted_titles = remove_start_end(table["Predicted Title"])

    rouge_scores = rouge.compute(predictions=predicted_titles, references=true_titles)
    print(rouge_scores)

    # BLEU is given a list of references per prediction; here each list
    # holds the single true title.
    bleu_references = [[title] for title in true_titles]
    bleu_scores = bleu.compute(predictions=predicted_titles, references=bleu_references)
    print(bleu_scores)

    meteor_scores = meteor.compute(predictions=predicted_titles, references=true_titles)
    print(meteor_scores)

In [21]:
compute_metrics("lstm/LSTM_attention.csv")

10182
10182
{'rouge1': 0.1366115222470731, 'rouge2': 0.04254364167446087, 'rougeL': 0.13179486331567025, 'rougeLsum': 0.1318208908662652}
{'bleu': 0.036445868433343596, 'precisions': [0.1688180085410283, 0.06265792419638573, 0.034361192065431985, 0.03463917525773196], 'brevity_penalty': 0.6118439469006668, 'length_ratio': 0.6705657752107298, 'translation_length': 41447, 'reference_length': 61809}
{'meteor': 0.09324901248878967}


In [22]:
compute_metrics("lstm/LSTM_attention_stopwords.csv")

9838
9838
{'rouge1': 0.15158954967586952, 'rouge2': 0.04889780780488909, 'rougeL': 0.14419852127153018, 'rougeLsum': 0.14434043878556196}
{'bleu': 0.036447028298634054, 'precisions': [0.15408003706492152, 0.04847205807867635, 0.023869779118498573, 0.01753720971369368], 'brevity_penalty': 0.8667631546259996, 'length_ratio': 0.8748986623429267, 'translation_length': 69068, 'reference_length': 78944}
{'meteor': 0.11078987532467596}


In [26]:
compute_metrics("lstm/LSTM_pointer_generator.csv")

{'rouge1': 0.1429085850744704, 'rouge2': 0.04106810454799074, 'rougeL': 0.13470933453489325, 'rougeLsum': 0.1348892319191301}
{'bleu': 0.03166583165349626, 'precisions': [0.14564057307538505, 0.04218493030385285, 0.021559815689174585, 0.016730836541827093], 'brevity_penalty': 0.8207110764935666, 'length_ratio': 0.835014392630973, 'translation_length': 72521, 'reference_length': 86850}
{'meteor': 0.09750387387678204}


In [27]:
compute_metrics("lstm/LSTM_vanilla_data.csv")

{'rouge1': 0.1535061909525322, 'rouge2': 0.04740998192119289, 'rougeL': 0.14479962254003736, 'rougeLsum': 0.14479650082217893}
{'bleu': 0.03548716771466826, 'precisions': [0.15497928703804645, 0.0461686792691408, 0.02268793739182783, 0.016938156034942187], 'brevity_penalty': 0.8714666574092829, 'length_ratio': 0.8790608528988979, 'translation_length': 73384, 'reference_length': 83480}
{'meteor': 0.10596625515984237}


In [46]:
gpt_title_predictor_result_path = "data/predicted_titles.csv"
compute_metrics(gpt_title_predictor_result_path, encoding='utf8')

{'rouge1': 0.1609329257067545, 'rouge2': 0.03294434731374428, 'rougeL': 0.13704272906129478, 'rougeLsum': 0.13777026688182964}
{'bleu': 0.01992006815185011, 'precisions': [0.13701067615658363, 0.02734375, 0.008658008658008658, 0.0048543689320388345], 'brevity_penalty': 1.0, 'length_ratio': 1.174503657262278, 'translation_length': 1124, 'reference_length': 957}
{'meteor': 0.13661658349670927}


# Limitation of existing metrics

As we can see, typical summary comparison metrics that rely on n-gram or other lexical similarities fail when the output is creative but valid. For example, our fine-tuned GPT often shows compelling and cohesive titles that are quite different from reference titles. Therefore, apart from Rouge-1 and METEOR, the metrics don't beat other models, although the results are significantly better upon human observation. To tackle this limitation, we propose a new metric: "LLM Title Discriminator." It replaces human observation and tries to identify which title was originally used by the news agency. Lower Discriminator Accuracy indicates better titles.

# LLM Title Discriminator

## Fine-tuning Title Discriminator
We'll use 20 randomly picked samples to fine-tune GPT-3.5 and build our LLM Title Discriminator so that it can correctly identify samples that clearly appear to be machine-generated.

In [3]:
import json
import math

import openai
import random


# Token ids used as logit_bias keys in the discriminator's chat completion
# request, so that generation is biased towards these two tokens.
# NOTE(review): presumably the tokenizer ids of "first" and "second" for the
# GPT-3.5 tokenizer — TODO confirm against the tokenizer in use.
first_as_token_id = 3983
second_as_token_id = 5686

instruction = """
    You are given an abstract of a published real world news and a pair of possible titles. 
    One of the titles is the official title used by the news agency, which is generated 
    by a professional human. The other one is generated by a machine learning algorithm 
    and can be incorrect, incomplete, vague, or sound unprofessional or unnatural. 
    Your job is to identify whether the first or the second title 
    is the original human-generated title. 
    
    Answer with a single word: "first" or "second", nothing else.
"""


result_files = [
    "LSTM_attention_stopwords.csv",
    "LSTM_attention.csv",
    "LSTM_pointer_generator.csv",
    "LSTM_vanilla_data.csv"
]

def generate_discriminator_training_data():
    """Write chat-format fine-tuning examples to "discriminator_examples.jsonl".

    For every file in ``result_files`` five random rows are drawn. Each row
    becomes one JSON line holding a system/user/assistant conversation: the
    user message shows the abstract and the two titles in random order, and
    the assistant message says which position ("first" or "second") holds
    the true title.
    """
    with open("discriminator_examples.jsonl", "w") as f:
        for filename in result_files:
            sampled_rows = read_csv(filename, encoding='cp1252').sample(n=5)

            for _, row in sampled_rows.iterrows():
                abstract = row["Abstract"]
                human_title = row["True Title"]
                machine_title = row["Predicted Title"]

                # Coin flip decides in which slot the true title is shown.
                correct_answer = random.choice(["first","second"])
                if correct_answer == "first":
                    first, second = human_title, machine_title
                else:
                    first, second = machine_title, human_title

                prompt = f"""
                    Abstract: {abstract}
                    First title: {first}
                    Second title: {second}
                """

                conversation = [
                    {"role": "system", "content": instruction},
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": correct_answer},
                ]
                f.write(json.dumps({"messages": conversation})+"\n")


generate_discriminator_training_data()

In [4]:
import time


client = openai.OpenAI()

class OpenAIFineTuner:
    """
    Class to fine tune OpenAI models

    Uploads a training file, waits until it has been processed, starts a
    fine-tuning job, polls the job until it finishes and exposes the id of
    the resulting model.

    Args:
        training_file_path: Path of the training file to upload.
        model_name: Name of the base model to fine-tune.
        suffix: Suffix passed to the fine-tuning job.
        api_client: Optional client object used for all API calls. When
            omitted, the module-level ``client`` is used.
    """

    # Job states that are final but do not produce a model; polling must stop.
    _FAILED_JOB_STATES = ("failed", "cancelled")

    def __init__(self, training_file_path, model_name, suffix, api_client=None):
        self.training_file_path = training_file_path
        self.model_name = model_name
        self.suffix = suffix
        self.file_object = None
        self.fine_tuning_job = None
        self.model_id = None
        self._api_client = api_client

    @property
    def _client(self):
        """Client used by this instance (injected one, else the module-level ``client``)."""
        if self._api_client is not None:
            return self._api_client
        return client

    def create_openai_file(self):
        """Upload the training file and store the returned file object."""
        # Context manager so the local file handle is closed after the upload.
        with open(self.training_file_path, "rb") as training_file:
            self.file_object = self._client.files.create(
                file=training_file,
                purpose="fine-tune",
            )

    def wait_for_file_processing(self, sleep_time=20):
        """Block until the uploaded file reports status 'processed'.

        Args:
            sleep_time: Seconds to wait between status checks.

        Raises:
            RuntimeError: If the file reports status 'error'.
        """
        print(self.file_object.status)
        while self.file_object.status != 'processed':
            if self.file_object.status == 'error':
                raise RuntimeError(
                    f"Processing of file {self.file_object.id} failed"
                )
            time.sleep(sleep_time)
            # Re-fetch the file: the stored object is only a snapshot, so
            # without this the loop could never observe a status change.
            self.file_object = self._client.files.retrieve(self.file_object.id)
            print("File Status: ", self.file_object.status)

    def create_fine_tuning_job(self):
        """Start a fine-tuning job on the uploaded file and store the job object."""
        self.fine_tuning_job = self._client.fine_tuning.jobs.create(
            training_file=self.file_object.id,
            model=self.model_name,
            suffix=self.suffix,
        )

    def wait_for_fine_tuning(self, sleep_time=45):
        """Block until the fine-tuning job reports status 'succeeded'.

        Args:
            sleep_time: Seconds to wait between status checks.

        Raises:
            RuntimeError: If the job ends as 'failed' or 'cancelled'.
        """
        job = self._client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id)
        while job.status != 'succeeded':
            # A failed or cancelled job never succeeds; stop instead of polling forever.
            if job.status in self._FAILED_JOB_STATES:
                raise RuntimeError(
                    f"Fine-tuning job {self.fine_tuning_job.id} ended with status '{job.status}'"
                )
            time.sleep(sleep_time)
            # One retrieve per iteration serves both the printout and the loop test.
            job = self._client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id)
            print("Job Status: ", job.status)

    def retrieve_fine_tuned_model(self):
        """Fetch the job again, store its ``fine_tuned_model`` id and return it."""
        self.model_id = self._client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id).fine_tuned_model
        return self.model_id

    def fine_tune_model(self):
        """Run upload, processing wait, job creation and job wait; return the model id."""
        self.create_openai_file()
        self.wait_for_file_processing()
        self.create_fine_tuning_job()
        self.wait_for_fine_tuning()
        return self.retrieve_fine_tuned_model()

fine_tuner = OpenAIFineTuner(
    training_file_path="discriminator_examples.jsonl",
    model_name="gpt-3.5-turbo-1106",
    suffix="discriminator"
)

In [27]:
fine_tuner.fine_tune_model()
model_id = fine_tuner.model_id
print(model_id)

processed
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  succeeded
ft:gpt-3.5-turbo-1106:worcester-polytechnic-institute:discriminator:9CX6Fawj


In [5]:
finetuned_model_id = "ft:gpt-3.5-turbo-1106:worcester-polytechnic-institute:discriminator:9CX6Fawj"

## Title Discriminator Metric
Unless the reference and prediction are an exact match, the Title Discriminator model should attempt to identify which title is machine-generated. Higher Discriminator Accuracy indicates the weakness of titles. 

In [6]:
def discriminate_title_pair_with_llm(abstract,reference,prediction,model_id):
    """Ask a chat model which of two titles is the reference title.

    The reference and the predicted title are shown in random order and the
    model is restricted to a single output token.

    Args:
        abstract: Abstract text shown to the model.
        reference: The true title.
        prediction: The predicted title.
        model_id: Model name passed to the chat completion request.

    Returns:
        ``(correct_flag, probability)``: ``correct_flag`` is 1 when the model
        picked the slot holding the reference and 0 otherwise; ``probability``
        is ``exp(logprob)`` of the returned token. Titles that are equal after
        stripping whitespace skip the model and yield ``(0, 1.0)``.

    Raises:
        RuntimeError: If the returned token is neither "first" nor "second".
    """
    # For exact match, skip LLM discriminator
    if reference.strip()==prediction.strip():
        return 0, 1.0

    # Randomise which slot holds the reference title.
    correct_answer = random.choice(["first","second"])
    if correct_answer == "first":
        first, second, incorrect_answer = reference, prediction, "second"
    else:
        first, second, incorrect_answer = prediction, reference, "first"

    prompt = f"""
        Abstract: {abstract}
        First title: {first}
        Second title: {second}
    """

    chat_messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model_id,
        messages=chat_messages,
        logprobs=True,
        logit_bias={first_as_token_id: 100, second_as_token_id: 100},
        max_tokens=1,
        seed=42,
    )

    # Round-trip the logprobs through JSON and read the first returned token.
    json_response = json.loads(completion.choices[0].logprobs.json())
    first_token_info = json_response["content"][0]
    logprob = first_token_info["logprob"]
    token = first_token_info["token"].lower()
    probability = math.exp(logprob)

    if token == correct_answer:
        return 1, probability
    if token == incorrect_answer:
        return 0, probability
    raise RuntimeError(f"{json_response} is unexpected")

In [24]:
import numpy as np 
from tqdm import tqdm


def _strip_start_end_markers(title):
    """Return ``title`` without its leading "start" and trailing "end" words.

    The title is returned unchanged unless both marker words are present.
    """
    words = title.split()
    if words and words[0] == "start" and words[-1] == "end":
        return " ".join(words[1:-1])
    return title


def llm_as_discriminator(
    filepath,
    model_id,
    max_infer=1000,
    encoding='cp1252',
    break_if_incorrect=False
):
    """Measure how often the discriminator model spots the reference title.

    Randomly samples rows from the CSV, asks the model to pick the reference
    title for each row via ``discriminate_title_pair_with_llm`` and prints the
    accuracy plus the mean token probability for correct and incorrect picks.

    Args:
        filepath: CSV with "Abstract", "True Title" and "Predicted Title" columns.
        model_id: Model name handed to ``discriminate_title_pair_with_llm``.
        max_infer: Maximum number of rows to evaluate; capped at the number
            of rows in the file.
        encoding: Text encoding used to read the CSV.
        break_if_incorrect: When True, print the first pair that was not
            discriminated correctly and return immediately.

    Raises:
        ValueError: If there are no rows to evaluate.
        RuntimeError: If the per-pair result flag is neither 0 nor 1.
    """
    df = read_csv(filepath, encoding=encoding)
    # Cap the sample size: DataFrame.sample raises when asked for more rows
    # than the frame holds (sampling is without replacement).
    num_samples = min(max_infer, len(df))
    if num_samples <= 0:
        raise ValueError(f"No samples to evaluate from {filepath}")
    df = df.sample(n=num_samples)

    total_correct = 0
    probabilities = {
        "correct": [],
        "incorrect": []
    }
    for _, item in tqdm(df.iterrows()):
        abstract = item["Abstract"]
        # Remove the marker *words*; slicing the string with [1:-1] would only
        # drop one character at each end ("start ... end" -> "tart ... en").
        reference = _strip_start_end_markers(item["True Title"])
        prediction = _strip_start_end_markers(item["Predicted Title"])
        correct_flag, probability = discriminate_title_pair_with_llm(
            abstract,reference,prediction,model_id
        )
        total_correct += correct_flag
        if correct_flag==1:
            probabilities["correct"].append(probability)
        elif correct_flag==0:
            probabilities["incorrect"].append(probability)
            if break_if_incorrect is True:
                print(
                    "Confused example:\n"
                    f"Abstract: {abstract}\n"
                    f"Reference: {reference}\n"
                    f"Prediction: {prediction}"
                )
                return
        else:
            raise RuntimeError(f"{correct_flag} is unexpected")

    # An empty bucket has no mean; report nan without numpy's empty-slice warning.
    mean_correct = np.mean(probabilities['correct']) if probabilities['correct'] else float('nan')
    mean_incorrect = np.mean(probabilities['incorrect']) if probabilities['incorrect'] else float('nan')
    # Divide by the number of rows actually evaluated, not the requested maximum.
    print(f"Discrimination accuracy: {100*total_correct/num_samples}")
    print(f"Average confidence of correct discrimination: {mean_correct}")
    print(f"Average confidence of incorrect discrimination: {mean_incorrect}")

## Title Discriminator without Fine-tuning
By default, GPT 3.5 isn't an expert title discriminator out of the box, unless it has been shown some examples of machine-generated titles. The following results show the weakness of zero-shot prompting.

In [27]:
gpt35_id = "gpt-3.5-turbo"

In [43]:
llm_as_discriminator("LSTM_attention.csv", model_id=gpt35_id)

1000it [06:15,  2.67it/s]

Discrimination accuracy: 56.3
Average confidence of correct discrimination: 0.2063006265721911
Average confidence of incorrect discrimination: 0.1118991451035638





In [44]:
llm_as_discriminator("LSTM_attention_stopwords.csv", model_id=gpt35_id)

0it [00:00, ?it/s]

1000it [05:50,  2.85it/s]

Discrimination accuracy: 56.9
Average confidence of correct discrimination: 0.21359381651518153
Average confidence of incorrect discrimination: 0.12883935566022092





In [45]:
llm_as_discriminator("LSTM_pointer_generator.csv", model_id=gpt35_id)

0it [00:00, ?it/s]

1000it [05:44,  2.90it/s]

Discrimination accuracy: 60.3
Average confidence of correct discrimination: 0.22049910418284183
Average confidence of incorrect discrimination: 0.14561893402279072





In [46]:
llm_as_discriminator("LSTM_vanilla_data.csv", model_id=gpt35_id)

0it [00:00, ?it/s]

1000it [05:41,  2.93it/s]

Discrimination accuracy: 56.9
Average confidence of correct discrimination: 0.20492478326689284
Average confidence of incorrect discrimination: 0.14349475479734827





## Expert LLM Title Discriminator Results
Once fine-tuned on 20 samples, the LLM Title Discriminator model identifies machine-generated titles at an accuracy over 90% for our LSTM models.

In [47]:
llm_as_discriminator("LSTM_attention.csv", model_id=finetuned_model_id)

Discrimination accuracy: 92.9
Average confidence of correct discrimination: 0.9999971073556101
Average confidence of incorrect discrimination: 0.9994713166898006


In [48]:
llm_as_discriminator("LSTM_attention_stopwords.csv", model_id=finetuned_model_id)

Discrimination accuracy: 96.9
Average confidence of correct discrimination: 0.9999416524272656
Average confidence of incorrect discrimination: 0.9714346741342155


In [49]:
llm_as_discriminator("LSTM_pointer_generator.csv", model_id=finetuned_model_id)

Discrimination accuracy: 98.1
Average confidence of correct discrimination: 0.9999960779608249
Average confidence of incorrect discrimination: 0.999126042695444


In [50]:
llm_as_discriminator("LSTM_vanilla_data.csv", model_id=finetuned_model_id)

Discrimination accuracy: 97.9
Average confidence of correct discrimination: 0.9999895465456974
Average confidence of incorrect discrimination: 0.9993808599113149


## Our GPT-based Title Predictor
The following cells show that fine-tuned LLMs can generate good quality titles at a much better rate. Discriminator Accuracy is only 73% in the run shown below, which is much lower than for the LSTM models.

In [16]:
llm_as_discriminator(
    gpt_title_predictor_result_path,
    model_id=gpt35_id,
    max_infer=100,
    encoding='utf8'
)

100it [00:42,  2.35it/s]

Discrimination accuracy: 52.0
Average confidence of correct discrimination: 0.3273687028695172
Average confidence of incorrect discrimination: 0.3380004666446201





In [17]:
llm_as_discriminator(
    gpt_title_predictor_result_path,
    model_id=finetuned_model_id,
    max_infer=100,
    encoding='utf8'
)

100it [00:36,  2.77it/s]

Discrimination accuracy: 73.0
Average confidence of correct discrimination: 0.9997739809611879
Average confidence of incorrect discrimination: 0.9999539307236794





# Examples of some "good titles"
In the following cells, we'll see for which reference-prediction pairs the Title Discriminator failed. We'll see that vanilla GPT-3.5-Turbo often fails even when the predicted title clearly reads wrong, incomplete, or unprofessional. On the other hand, when the expert / fine-tuned Title Discriminator fails, the titles are hard to distinguish quality-wise.

In [25]:
llm_as_discriminator("LSTM_attention.csv", model_id=gpt35_id, break_if_incorrect=True)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Confused example:
Abstract: report domestic intelligence agency listed 400 instances soldiers police officers intelligence officials suspected extremist actions posing “a significant danger ” 
Reference: right extremism taints german security services hundreds cases 
Prediction:  mexico police arrest





In [26]:
llm_as_discriminator("LSTM_attention_stopwords.csv", model_id=gpt35_id, break_if_incorrect=True)

2it [00:01,  1.47it/s]

Confused example:
Abstract: top official in russia’s air force said the government was considering whether to base strategic bombers out of cuban territory or on venezuelan island 
Reference: russia is weighing latin bases general says 
Prediction:  russia and russia agree to discuss nato ties





In [35]:
llm_as_discriminator("LSTM_pointer_generator.csv", model_id=gpt35_id, break_if_incorrect=True)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Confused example:
Abstract: lawsuit filed in march by <unk> <unk> <unk> card reader claims that her mother had <unk> love <unk> with the painter in the 1950s 
Reference: lawsuit in spain seeks recognition of another <unk> creation daughter 
Prediction:  the man who survived the death of the death





In [28]:
llm_as_discriminator("LSTM_vanilla_data.csv", model_id=gpt35_id, break_if_incorrect=True)

0it [00:00, ?it/s]

Confused example:
Abstract: a court backed decision by the bangladeshi central bank to remove muhammad yunus the nobel laureate from bank the institution that he founded 
Reference:  removal of bank founder upheld 
Prediction:  pakistani court rules for killing of ex premier





In [30]:
llm_as_discriminator("LSTM_attention.csv", model_id=finetuned_model_id, break_if_incorrect=True)

0it [00:00, ?it/s]

12it [00:02,  4.22it/s]

Confused example:
Abstract: read latest updates mr christie 
Reference: chris christie climate change 
Prediction:  chris christie immigration





In [31]:
llm_as_discriminator("LSTM_attention_stopwords.csv", model_id=finetuned_model_id, break_if_incorrect=True)

0it [00:00, ?it/s]

Confused example:
Abstract: prime minister yasuo fukuda of japan made the move in bid to raise his low approval ratings 
Reference: japanese premier reshuffles cabinet 
Prediction:  japan’s prime minister resigns over cabinet





In [33]:
llm_as_discriminator("LSTM_pointer_generator.csv", model_id=finetuned_model_id, break_if_incorrect=True)

0it [00:00, ?it/s]

11it [00:02,  4.51it/s]

Confused example:
Abstract: the case of two families in siberia has stirred complaints about russia’s <unk> houses high volume operations where women are often treated <unk> 
Reference: russia stays transfixed by switch at birth 
Prediction:  russian court orders release of russian woman





In [36]:
llm_as_discriminator("LSTM_vanilla_data.csv", model_id=finetuned_model_id, break_if_incorrect=True)

41it [00:08,  4.81it/s]

Confused example:
Abstract: with no to the fighting in yemen the united nations and other aid agencies said the humanitarian crisis there was worsening 
Reference:  yemeni army tries to oil fields as qaeda fighters advance 
Prediction:  u n says it will send aid to rebels in yemen





In [41]:
llm_as_discriminator(
    gpt_title_predictor_result_path,
    model_id=finetuned_model_id,
    break_if_incorrect=True,
    encoding='utf8',
    max_infer=100
)

0it [00:00, ?it/s]

2it [00:00,  3.29it/s]

Confused example:
Abstract: A giant lobster named Larry is at the center of a small town’s identity. What happens if he leaves? And what does a dispute about its future say about the culture of Australia’s “Big Things”?
Reference: For Sale: 55-Foot-Tall Lobster. Owners in a Pinch. Can You Help?
Prediction: Who Owns Larry? Debating the Glories of Australia’s ‘Big Things’





The last example shown above reveals why we need such a Title Discriminator metric instead of relying on ROUGE, BLEU, or other such metrics. The LLM-predicted title is quite good. It's completely different from the reference title, but it's compelling and conveys the point of the article well. Although this sample would generate a very bad ROUGE or BLEU score, it should be considered an excellent predicted sample, which our LLM Title Discriminator confirms.

# Effect of Synthetic Data

## Without synthetic data

In [3]:
compute_metrics("lstm/LSTM_attention_stopwords_10k.csv")

{'rouge1': 0.08348140517989147, 'rouge2': 0.014349147942793569, 'rougeL': 0.08001952359935977, 'rougeLsum': 0.0799971308931128}
{'bleu': 0.0, 'precisions': [0.06989139179404666, 0.011364954192276469, 0.0002738975623116954, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0049519959575544, 'translation_length': 9944, 'reference_length': 9895}
{'meteor': 0.048710093053581115}


In [12]:
llm_as_discriminator("lstm/LSTM_attention_stopwords_10k.csv", model_id=finetuned_model_id)

1000it [04:34,  3.64it/s]

Discrimination accuracy: 99.7
Average confidence of correct discrimination: 0.9999991512053142
Average confidence of incorrect discrimination: 1.0





## With synthetic data

In [9]:
compute_metrics("lstm/LSTM_attention_stopwords_synthetic20k.csv")

{'rouge1': 0.06097515982152964, 'rouge2': 0.010941300818169103, 'rougeL': 0.059457546833253744, 'rougeLsum': 0.059268668747649134}
{'bleu': 0.0, 'precisions': [0.06699958385351644, 0.011427596793450453, 0.0006641576267434137, 0.0], 'brevity_penalty': 0.6681477531377876, 'length_ratio': 0.7126334519572953, 'translation_length': 7209, 'reference_length': 10116}
{'meteor': 0.03325435038898777}


In [10]:
llm_as_discriminator("lstm/LSTM_attention_stopwords_synthetic20k.csv", model_id=finetuned_model_id)

1000it [04:56,  3.37it/s]

Discrimination accuracy: 98.9
Average confidence of correct discrimination: 0.9999496258726065
Average confidence of incorrect discrimination: 0.9770476898962854





# Effect of Embeddings

In [25]:
compute_metrics("word2vec_output.csv", encoding='utf8')

Removing start and end
Removing start and end
{'rouge1': 0.17377642099043022, 'rouge2': 0.04582447776681522, 'rougeL': 0.1632700850061531, 'rougeLsum': 0.16395885096155988}
{'bleu': 0.027436297784275653, 'precisions': [0.18723404255319148, 0.0512396694214876, 0.015841584158415842, 0.007407407407407408], 'brevity_penalty': 0.8422897475758973, 'length_ratio': 0.8535108958837773, 'translation_length': 705, 'reference_length': 826}
{'meteor': 0.1190630275488683}


In [26]:
llm_as_discriminator(
    "word2vec_output.csv",
    model_id=finetuned_model_id,
    encoding='utf8',
    max_infer=100
)

100it [00:23,  4.35it/s]

Discrimination accuracy: 95.0
Average confidence of correct discrimination: 0.9999997597620491
Average confidence of incorrect discrimination: 0.9999997169473731





In [28]:
llm_as_discriminator(
    "word2vec_output.csv",
    model_id=gpt35_id,
    encoding='utf8',
    max_infer=100
)

100it [00:39,  2.53it/s]

Discrimination accuracy: 51.0
Average confidence of correct discrimination: 0.15412516972964677
Average confidence of incorrect discrimination: 0.12357379883057334



