In [31]:
from pandas import read_csv
import evaluate

# Load each metric once at module level; compute_metrics() reuses these objects
# for every results file.
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
# NOTE(review): the recorded output of this cell shows nltk data downloads
# (wordnet, punkt, omw-1.4) failing with HTTP 429 - presumably triggered by
# loading METEOR. Confirm the nltk data is available locally before trusting
# the METEOR scores.
meteor = evaluate.load('meteor')

[nltk_data] Error loading wordnet: HTTP Error 429: Too Many Requests
[nltk_data] Error loading punkt: HTTP Error 429: Too Many Requests
[nltk_data] Error loading omw-1.4: HTTP Error 429: Too Many Requests


In [25]:
def compute_metrics(filepath):
    """
    Print ROUGE, BLEU and METEOR results for one file of generated titles.

    The CSV at ``filepath`` is read with cp1252 encoding and must contain the
    columns "True Title" (references) and "Predicted Title" (predictions).
    Each metric's result dict is printed on its own line; nothing is returned.
    """
    titles = read_csv(filepath, encoding='cp1252')
    gold_titles = titles["True Title"]
    generated_titles = titles["Predicted Title"]

    rouge_scores = rouge.compute(predictions=generated_titles, references=gold_titles)
    print(rouge_scores)

    # BLEU is given a list of references per prediction, so wrap each title.
    wrapped_gold_titles = [[title] for title in gold_titles]
    bleu_scores = bleu.compute(predictions=generated_titles, references=wrapped_gold_titles)
    print(bleu_scores)

    meteor_scores = meteor.compute(predictions=generated_titles, references=gold_titles)
    print(meteor_scores)

In [21]:
# Print ROUGE, BLEU and METEOR for the titles stored in LSTM_attention.csv.
# NOTE(review): the recorded output of this cell starts with two bare counts
# (10182) that the current compute_metrics() does not print - the cell was
# presumably last run against an earlier version of the function; re-run it.
compute_metrics("LSTM_attention.csv")

10182
10182
{'rouge1': 0.1366115222470731, 'rouge2': 0.04254364167446087, 'rougeL': 0.13179486331567025, 'rougeLsum': 0.1318208908662652}
{'bleu': 0.036445868433343596, 'precisions': [0.1688180085410283, 0.06265792419638573, 0.034361192065431985, 0.03463917525773196], 'brevity_penalty': 0.6118439469006668, 'length_ratio': 0.6705657752107298, 'translation_length': 41447, 'reference_length': 61809}
{'meteor': 0.09324901248878967}


In [22]:
# Print ROUGE, BLEU and METEOR for the titles stored in LSTM_attention_stopwords.csv.
# NOTE(review): the recorded output of this cell starts with two bare counts
# (9838) that the current compute_metrics() does not print - the cell was
# presumably last run against an earlier version of the function; re-run it.
compute_metrics("LSTM_attention_stopwords.csv")

9838
9838
{'rouge1': 0.15158954967586952, 'rouge2': 0.04889780780488909, 'rougeL': 0.14419852127153018, 'rougeLsum': 0.14434043878556196}
{'bleu': 0.036447028298634054, 'precisions': [0.15408003706492152, 0.04847205807867635, 0.023869779118498573, 0.01753720971369368], 'brevity_penalty': 0.8667631546259996, 'length_ratio': 0.8748986623429267, 'translation_length': 69068, 'reference_length': 78944}
{'meteor': 0.11078987532467596}


In [26]:
# Print ROUGE, BLEU and METEOR for the titles stored in LSTM_pointer_generator.csv.
compute_metrics("LSTM_pointer_generator.csv")

{'rouge1': 0.1429085850744704, 'rouge2': 0.04106810454799074, 'rougeL': 0.13470933453489325, 'rougeLsum': 0.1348892319191301}
{'bleu': 0.03166583165349626, 'precisions': [0.14564057307538505, 0.04218493030385285, 0.021559815689174585, 0.016730836541827093], 'brevity_penalty': 0.8207110764935666, 'length_ratio': 0.835014392630973, 'translation_length': 72521, 'reference_length': 86850}
{'meteor': 0.09750387387678204}


In [27]:
# Print ROUGE, BLEU and METEOR for the titles stored in LSTM_vanilla_data.csv.
compute_metrics("LSTM_vanilla_data.csv")

{'rouge1': 0.1535061909525322, 'rouge2': 0.04740998192119289, 'rougeL': 0.14479962254003736, 'rougeLsum': 0.14479650082217893}
{'bleu': 0.03548716771466826, 'precisions': [0.15497928703804645, 0.0461686792691408, 0.02268793739182783, 0.016938156034942187], 'brevity_penalty': 0.8714666574092829, 'length_ratio': 0.8790608528988979, 'translation_length': 73384, 'reference_length': 83480}
{'meteor': 0.10596625515984237}


In [12]:
import json
import math

import openai
import random


# Token ids passed as `logit_bias` keys in discriminate_title_pair_with_llm() so
# that the single generated token is pushed towards one of two answers.
# NOTE(review): presumably the tokenizer ids of "first" and "second" for the
# gpt-3.5-turbo model family - TODO confirm against the tokenizer.
first_as_token_id = 3983
second_as_token_id = 5686

# System prompt shared by the fine-tuning examples and the inference calls.
# The text (including its whitespace) is sent to the model verbatim, so any
# edit here changes what the discriminator was trained and queried with.
instruction = """
    You are given an abstract of a published real world news and a pair of possible titles. 
    One of the titles is the official title used by the news agency, which is generated 
    by a professional human. The other one is generated by a machine learning algorithm 
    and can be incorrect, incomplete, vague, or sound unprofessional or unnatural. 
    Your job is to identify whether the first or the second title 
    is the original human-generated title. 
    
    Answer with a single word: "first" or "second", nothing else.
"""


# Result CSVs (one per title-generation model) that
# generate_discriminator_training_data() samples from. Each file must provide
# the "Abstract", "True Title" and "Predicted Title" columns.
result_files = [
    "LSTM_attention_stopwords.csv",
    "LSTM_attention.csv",
    "LSTM_pointer_generator.csv",
    "LSTM_vanilla_data.csv"
]

def generate_discriminator_training_data(filenames=None,
                                         output_path="discriminator_examples.jsonl",
                                         samples_per_file=5,
                                         seed=None):
    """
    Write chat-format fine-tuning examples for the title discriminator.

    For every results CSV, ``samples_per_file`` rows are sampled. Each row
    becomes one JSON line holding a system message (``instruction``), a user
    message with the abstract and the two titles in random order, and an
    assistant message naming the position ("first" or "second") of the
    human-written title.

    Args:
        filenames: CSV paths to sample from; defaults to the module-level
            ``result_files``. Each file is read with cp1252 encoding and must
            contain "Abstract", "True Title" and "Predicted Title" columns.
        output_path: JSONL file to (over)write.
        samples_per_file: Number of rows sampled from each CSV.
        seed: Optional seed. When given, both the row sampling and the title
            ordering are reproducible; when ``None`` the global random state
            is used, as before.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a file has fewer
            rows than ``samples_per_file``.
    """
    if filenames is None:
        filenames = result_files
    # Without a seed keep drawing from the global `random` module so that the
    # default call behaves exactly as it always has.
    rng = random if seed is None else random.Random(seed)

    with open(output_path, "w", encoding="utf-8") as f:
        for filename in filenames:
            df = read_csv(filename, encoding='cp1252')
            df = df.sample(n=samples_per_file, random_state=seed)

            for _, item in df.iterrows():
                abstract = item["Abstract"]
                reference = item["True Title"]
                prediction = item["Predicted Title"]

                # Randomise the position of the human title so the model
                # cannot learn a positional shortcut.
                correct_answer = rng.choice(["first","second"])
                if correct_answer == "first":
                    first, second = reference, prediction
                else:
                    first, second = prediction, reference

                # NOTE(review): this literal is indented deeper than the
                # prompt built in discriminate_title_pair_with_llm(), so the
                # training and inference prompts differ in leading whitespace.
                # Left byte-identical here; confirm whether they should match.
                prompt = f"""
                    Abstract: {abstract}
                    First title: {first}
                    Second title: {second}
                """

                training_sample = {
                    "messages": [
                        {"role": "system", "content": instruction},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": correct_answer}
                    ]
                }

                f.write(json.dumps(training_sample)+"\n")


# Write the fine-tuning examples to discriminator_examples.jsonl
# (5 sampled rows from each file in result_files).
generate_discriminator_training_data()

In [26]:
import time


# Shared OpenAI client used by OpenAIFineTuner and by
# discriminate_title_pair_with_llm().
# NOTE(review): no credentials are passed here - presumably the API key is
# picked up from the environment; confirm before running.
client = openai.OpenAI()

class OpenAIFineTuner:
    """
    Drive an OpenAI fine-tuning run end to end.

    The steps are: upload the training file, wait until it is processed,
    create the fine-tuning job, poll the job until it finishes, and fetch the
    id of the resulting model. All API calls go through the module-level
    ``client``.
    """
    def __init__(self, training_file_path, model_name, suffix):
        """
        Store the run configuration; no API call is made here.

        Args:
            training_file_path: Path of the JSONL training file to upload.
            model_name: Base model to fine-tune.
            suffix: Suffix passed to the fine-tuning job.
        """
        self.training_file_path = training_file_path
        self.model_name = model_name
        self.suffix = suffix
        # Populated by create_openai_file().
        self.file_object = None
        # Populated by create_fine_tuning_job().
        self.fine_tuning_job = None
        # Populated by retrieve_fine_tuned_model().
        self.model_id = None

    def create_openai_file(self):
        """Upload the training file and keep the returned file object."""
        # Use a context manager so the local file handle is closed once the
        # upload call returns.
        with open(self.training_file_path, "rb") as training_file:
            self.file_object = client.files.create(
                file=training_file,
                purpose="fine-tune",
            )

    def wait_for_file_processing(self, sleep_time=20):
        """
        Block until the uploaded file reports status 'processed'.

        Args:
            sleep_time: Seconds to wait between status checks.

        Raises:
            RuntimeError: If the file reports status 'error'.
        """
        print(self.file_object.status)
        while self.file_object.status != 'processed':
            if self.file_object.status == 'error':
                raise RuntimeError(
                    f"Processing of file {self.file_object.id} failed"
                )
            time.sleep(sleep_time)
            # Re-fetch the file: the locally held object never changes on its
            # own, so without this the loop could not terminate.
            self.file_object = client.files.retrieve(self.file_object.id)
            print("File Status: ", self.file_object.status)

    def create_fine_tuning_job(self):
        """Start a fine-tuning job on the uploaded file."""
        self.fine_tuning_job = client.fine_tuning.jobs.create(
            training_file=self.file_object.id,
            model=self.model_name,
            suffix=self.suffix,
        )

    def wait_for_fine_tuning(self, sleep_time=45):
        """
        Block until the fine-tuning job reports status 'succeeded'.

        Args:
            sleep_time: Seconds to wait between status checks.

        Raises:
            RuntimeError: If the job ends as 'failed' or 'cancelled', which
                would otherwise be polled forever.
        """
        self.fine_tuning_job = client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id)
        while self.fine_tuning_job.status != 'succeeded':
            if self.fine_tuning_job.status in ('failed', 'cancelled'):
                raise RuntimeError(
                    f"Fine-tuning job {self.fine_tuning_job.id} ended with "
                    f"status '{self.fine_tuning_job.status}'"
                )
            time.sleep(sleep_time)
            # One retrieve per poll; the same object feeds the log line and
            # the loop condition.
            self.fine_tuning_job = client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id)
            print("Job Status: ", self.fine_tuning_job.status)

    def retrieve_fine_tuned_model(self):
        """Fetch, store and return the id of the fine-tuned model."""
        self.model_id = client.fine_tuning.jobs.retrieve(self.fine_tuning_job.id).fine_tuned_model
        return self.model_id

    def fine_tune_model(self):
        """Run every step in order and return the fine-tuned model id."""
        self.create_openai_file()
        self.wait_for_file_processing()
        self.create_fine_tuning_job()
        self.wait_for_fine_tuning()
        return self.retrieve_fine_tuned_model()

# Configure the fine-tuning run. Constructing the object only stores these
# settings; nothing is uploaded or started until fine_tune_model() is called.
fine_tuner = OpenAIFineTuner(
    # JSONL training file (here the one written by
    # generate_discriminator_training_data())
    training_file_path="discriminator_examples.jsonl",
    model_name="gpt-3.5-turbo-1106",
    suffix="discriminator"
)

In [27]:
# Run the whole upload -> fine-tune pipeline; fine_tune_model() returns the
# same id it stores on fine_tuner.model_id. The id is kept in `model_id` for
# the discriminator calls.
model_id = fine_tuner.fine_tune_model()
print(model_id)

processed
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  running
Job Status:  succeeded
ft:gpt-3.5-turbo-1106:worcester-polytechnic-institute:discriminator:9CX6Fawj


In [28]:
def discriminate_title_pair_with_llm(abstract,reference,prediction):
    """
    Ask the fine-tuned model which of two titles is the human-written one.

    The reference and predicted titles are presented in random order. The
    model is limited to one output token, with ``logit_bias`` pushing it
    towards the two token ids in ``first_as_token_id`` and
    ``second_as_token_id``.

    Args:
        abstract: Abstract text shown to the model.
        reference: The human-written title.
        prediction: The machine-generated title.

    Returns:
        A ``(correct_flag, probability)`` tuple: ``correct_flag`` is 1 when
        the model picked the position of the reference title and 0 otherwise;
        ``probability`` is ``exp(logprob)`` of the returned token.

    Raises:
        RuntimeError: If the response carries no token log-probabilities or
            the returned token is neither "first" nor "second".
    """
    correct_answer = random.choice(["first","second"])
    if correct_answer == "first":
        first, second = reference, prediction
        incorrect_answer = "second"
    else:
        first, second = prediction, reference
        incorrect_answer = "first"

    # NOTE(review): this literal is indented less deeply than the prompt built
    # in generate_discriminator_training_data(), so the inference prompt
    # differs from the training prompt in leading whitespace. Left
    # byte-identical here; confirm whether they should match.
    prompt = f"""
        Abstract: {abstract}
        First title: {first}
        Second title: {second}
    """

    completion = client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt}
        ],
        logprobs=True,
        logit_bias={first_as_token_id: 100, second_as_token_id: 100},
        max_tokens=1,
        seed=42
    )

    # Read the token log-probability straight from the response object rather
    # than serialising it to JSON and parsing it back.
    logprobs = completion.choices[0].logprobs
    if logprobs is None or not logprobs.content:
        raise RuntimeError(f"{logprobs} is unexpected")
    token_info = logprobs.content[0]
    # Normalise case and surrounding whitespace before comparing.
    token = token_info.token.strip().lower()
    probability = math.exp(token_info.logprob)

    if token == correct_answer:
        correct_flag = 1
    elif token == incorrect_answer:
        correct_flag = 0
    else:
        raise RuntimeError(f"{logprobs} is unexpected")

    return correct_flag, probability

In [15]:
import numpy as np 
from tqdm import tqdm


def llm_as_discriminator(filepath, max_infer=100):
    """
    Measure how often the LLM discriminator spots the human-written title.

    Reads the CSV at ``filepath`` (cp1252 encoding; columns "Abstract",
    "True Title", "Predicted Title"), runs
    ``discriminate_title_pair_with_llm`` on the leading rows, and prints the
    accuracy plus the mean token probability of correct and incorrect answers.

    NOTE(review): generate_discriminator_training_data() samples its
    fine-tuning examples from the same result files, so rows evaluated here
    may overlap with the training rows - confirm whether that is acceptable.

    Args:
        filepath: Path of the results CSV.
        max_infer: Upper bound on the number of rows evaluated; fewer rows are
            used when the file is shorter.

    Raises:
        ValueError: If there is no row to evaluate.
        RuntimeError: If the discriminator returns a flag other than 0 or 1.
    """
    df = read_csv(filepath, encoding='cp1252')
    # Clamp to the rows actually available so a short file does not raise
    # IndexError, and so accuracy is divided by the number of rows evaluated.
    n_infer = min(max_infer, len(df))
    if n_infer <= 0:
        raise ValueError(
            f"No rows to evaluate (max_infer={max_infer}, rows in file={len(df)})"
        )

    total_correct = 0
    probabilities = {
        "correct": [],
        "incorrect": []
    }
    for idx in tqdm(range(n_infer)):
        item = df.iloc[idx]
        abstract = item["Abstract"]
        reference = item["True Title"]
        prediction = item["Predicted Title"]
        correct_flag, probability = discriminate_title_pair_with_llm(abstract,reference,prediction)
        total_correct += correct_flag
        if correct_flag==1:
            probabilities["correct"].append(probability)
        elif correct_flag==0:
            probabilities["incorrect"].append(probability)
        else:
            raise RuntimeError(f"{correct_flag} is unexpected")

    def _mean_or_nan(values):
        # np.mean([]) yields nan with a RuntimeWarning; report nan quietly.
        return np.mean(values) if values else float("nan")

    print(f"Discrimination accuracy: {100*total_correct/n_infer}")
    print(f"Average confidence of correct discrimination: {_mean_or_nan(probabilities['correct'])}")
    print(f"Average confidence of incorrect discrimination: {_mean_or_nan(probabilities['incorrect'])}")

In [29]:
# Evaluate the fine-tuned discriminator on the leading rows of
# LSTM_attention.csv (max_infer defaults to 100).
llm_as_discriminator("LSTM_attention.csv")

  0%|                                                                                    | 0/100 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:42<00:00,  2.34it/s]

Discrimination accuracy: 91.0
Average confidence of correct discrimination: 0.9999954968193302
Average confidence of incorrect discrimination: 0.9750113348767333



