In [1]:
import os
import openai
import json
import numpy as np
import pandas as pd
from autolabel import LabelingAgent, AutolabelDataset
from datasets import load_dataset as ld
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import time

In [2]:
# Read the key from the environment so that no secret is ever saved inside the
# notebook (or its git history). Export OPENAI_API_KEY before starting the
# kernel; a missing variable raises KeyError here instead of failing later on
# the first API call.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
from datasets import load_dataset as ld

def load_dataset_as_pandas_dataframe(
    dataset_name: str="trec",
    seed: "int | None" = None,
):
    """Load a dataset and return its shuffled train and test splits.

    Args:
        dataset_name: Name handed to ``ld`` (``datasets.load_dataset``).
        seed: Optional ``random_state`` used for both shuffles. With the
            default ``None`` the row order differs on every call (the
            historical behaviour); pass an integer to make downstream
            results reproducible.

    Returns:
        tuple: ``(train_df, test_df)`` pandas DataFrames. Rows are shuffled
        but keep their original index labels.
    """
    dataset = ld(dataset_name)

    # sample(frac=1) returns every row exactly once, in random order.
    df = dataset["train"].to_pandas().sample(frac=1, random_state=seed)
    df_test = dataset["test"].to_pandas().sample(frac=1, random_state=seed)

    return df, df_test

In [4]:
# Load the shuffled train and test splits of the default dataset ("trec").
df, df_test = load_dataset_as_pandas_dataframe()

In [5]:
# Integer codes of the `coarse_label` column and their category names.
# The reverse lookup and the category list are derived from the same mapping
# so the three structures cannot drift apart.
id_to_label = dict(enumerate(['ABBR', 'ENTY', 'DESC', 'HUM', 'LOC', 'NUM']))
label_to_id = {name: code for code, name in id_to_label.items()}
categories = list(label_to_id)

# Map the numeric `coarse_label` codes in the DataFrames to category names

In [6]:
# Translate integer class codes into category names.
# Values that are not keys of `id_to_label` (e.g. names produced by an earlier
# run of this cell) are passed through unchanged, so re-running the cell is
# safe; a plain `.map(id_to_label)` would turn already-mapped labels into NaN.
df["coarse_label"] = df["coarse_label"].map(lambda code: id_to_label.get(code, code))
df_test["coarse_label"] = df_test["coarse_label"].map(lambda code: id_to_label.get(code, code))

# Let's create a `text_to_openai_json` function

In [18]:
def text_to_openai_json(data, filename, text_col="text", label_col="coarse_label", category_names=None):
    """
    Write a labelled dataset as a JSON Lines (JSONL) chat fine-tuning file.

    Each row becomes one line holding a three-message conversation: a system
    instruction listing the categories, the row's text as the user message,
    and the row's label as the assistant message.

    Args:
        data (DataFrame): Input rows; must contain `text_col` and `label_col`.
        filename (str): Path of the JSONL file to create (overwritten if present).
        text_col (str): Column holding the input text. Defaults to "text",
            the column name used by the frames built in this notebook.
        label_col (str): Column holding the expected category name.
            Defaults to "coarse_label".
        category_names (list, optional): Categories quoted in the system
            message. Defaults to the module-level `categories` list.

    Raises:
        KeyError: If `text_col` or `label_col` is missing from `data`.
    """
    # Fail early with a readable message instead of part-way through the rows.
    missing = [col for col in (text_col, label_col) if col not in data.columns]
    if missing:
        raise KeyError(f"text_to_openai_json: missing column(s) {missing}; available: {list(data.columns)}")

    if category_names is None:
        category_names = categories

    # The instruction is identical for every row, so build it once.
    system_message = {
        "role": "system",
        "content": f"given the following text: find the category in: {category_names} that is most closely associated with it. Return only the category name"
    }

    # One conversation per row: system instruction, user text, assistant label.
    message_list = [
        {
            "messages": [
                system_message,
                {"role": "user", "content": row[text_col]},
                {"role": "assistant", "content": row[label_col]},
            ]
        }
        for _, row in data.iterrows()
    ]

    # Write the conversation data to a JSON Lines (JSONL) file, one object per line
    with open(filename, "w") as json_file:
        for message in message_list:
            json.dump(message, json_file)
            json_file.write("\n")




# Few Shot Prompting


In [20]:
# System prompt sent with every classification request made by
# few_shot_prompting and ft_accuracy. The pieces below concatenate to a
# single sentence pair; only the first piece interpolates `categories`.
system_content = (
    f"given the following text: find the category in: {categories} "
    "that is most closely associated with it. "
    "Return only the category name only in following format"
)

In [23]:
from sklearn.metrics import precision_recall_fscore_support

In [24]:
def few_shot_prompting(data,model_id):
    """Ask `model_id` to classify every entry of ``data["text"]``.

    Each request consists of the shared `system_content` instruction followed
    by the text as the user message (no worked examples are added). The text
    and the raw reply are printed as the loop progresses.

    Args:
        data: DataFrame with "text" and "coarse_label" columns.
        model_id: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        DataFrame with columns "text", "coarse_label" and
        "few-shot predictions" (the unmodified reply of the model).
    """
    replies = []
    for question in data["text"]:
        chat = [
            {"role": "system", "content": system_content },
            {"role": "user", "content": question },
        ]
        response = openai.ChatCompletion.create(model=model_id, messages=chat)
        answer = response.choices[0].message.content

        print(f'text: {question}')
        print(answer)
        replies.append(answer)

    # The Series keep their index; the list of replies is aligned by position.
    return pd.DataFrame({
        'text': data["text"],
        'coarse_label' : data['coarse_label'],
        'few-shot predictions' : replies,
    })

In [25]:
# Notebook-level defaults.
# NOTE(review): in the cells visible here `cumulative_increment` and
# `all_model_id` are never read again, and `model_id` is rebound by the
# evaluation loops — confirm whether these are still needed.
cumulative_increment = 0
model_id = 'gpt-3.5-turbo-0301'
all_model_id = []

In [27]:
def fine_tune_model(data, model_id,num_label, base_model="gpt-3.5-turbo"):
    """Start a fine-tuning job on examples that `model_id` misclassifies.

    Steps: classify `data` with `few_shot_prompting`, keep the first
    `num_label` rows whose prediction differs from the true label, write them
    to ``ft_increment_<num_label>.jsonl`` with `text_to_openai_json`, upload
    that file and create the fine-tuning job.

    Args:
        data: DataFrame with "text" and "coarse_label" columns.
        model_id: Model used to produce the predictions that are filtered.
        num_label: Maximum number of misclassified rows to train on.
        base_model: Model to fine-tune. Defaults to "gpt-3.5-turbo", the
            value that used to be hard-coded.

    Returns:
        The id of the created fine-tuning job.

    Raises:
        ValueError: If no misclassified row is selected (for example when
            `num_label` is 0); an empty training file would only be rejected
            later by the API.
    """
    pred_df = few_shot_prompting(data=data, model_id=model_id)

    is_wrong = pred_df['coarse_label'] != pred_df['few-shot predictions']
    incorrection_pred_df = pred_df[is_wrong].iloc[:num_label]
    if incorrection_pred_df.empty:
        raise ValueError(
            f"fine_tune_model: no misclassified examples selected (num_label={num_label})"
        )

    filename = f'ft_increment_{num_label}.jsonl'
    text_to_openai_json(incorrection_pred_df, filename)

    # Close the handle as soon as the upload call returns.
    with open(filename, "rb") as training_file:
        loader = openai.File.create(file=training_file, purpose='fine-tune')

    fine_tuning_job = openai.FineTuningJob.create(training_file=loader.id, model=base_model)
    return fine_tuning_job.id

In [28]:
def wait_for_fine_tuning(job_id, poll_interval=60):
    """Poll a fine-tuning job until it produces a model or ends without one.

    Args:
        job_id: Id of the fine-tuning job to watch.
        poll_interval: Seconds to sleep between polls (default 60).

    Returns:
        The name of the fine-tuned model, or ``None`` when the job reports a
        terminal "failed" or "cancelled" status. Without that check a failed
        job would keep this loop polling forever.
    """
    while True:
        response = openai.FineTuningJob.retrieve(job_id)

        fine_tuned_model = response["fine_tuned_model"]
        if fine_tuned_model:
            print(fine_tuned_model)
            return fine_tuned_model

        status = response.get("status")
        if status in ("failed", "cancelled"):
            print(f"Fine-tuning job {job_id} ended with status {status!r}")
            return None

        # Still in progress: report where the job is and try again later.
        print(f"Fine-tuning job {job_id}: {status}")
        time.sleep(poll_interval)

In [65]:
def ft_accuracy(data,model_id):
    """Classify ``data["text"]`` with `model_id` and score the replies.

    Every text is sent together with the shared `system_content` prompt; the
    raw reply is compared verbatim with ``data["coarse_label"]``. Each text and
    reply is printed while the loop runs, followed by the overall accuracy.

    Macro precision/recall/F1 are averaged over the classes that occur in the
    true labels only, with undefined ratios counted as 0. Replies that are not
    an exact category name (e.g. "Category: DESC") therefore count as errors
    for the true class instead of becoming extra classes that score 1.0, which
    is what `zero_division=1` without `labels` used to do.

    Args:
        data: DataFrame with "text" and "coarse_label" columns.
        model_id: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1, pred)`` where `pred` is the
        list of raw replies in row order.
    """
    pred = []
    for row in data["text"]:
        completion = openai.ChatCompletion.create(
            model= model_id ,
            messages=[
                {"role": "system", "content": system_content },
                {"role": "user", "content": row }
            ])
        answer = completion.choices[0].message.content

        print(f'text: {row}')
        print(answer)
        pred.append(answer)

    y_true = data['coarse_label']
    accuracy = accuracy_score(y_true, pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # Score only real categories; free-text replies must not add classes.
    known_labels = list(y_true.unique())
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, pred, labels=known_labels, average='macro', zero_division=0
    )

    return accuracy, precision, recall, f1, pred



In [37]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,text,coarse_label,fine_label
21,What is a biosphere ?,DESC,24
274,What is peyote ?,DESC,24
312,How many hearts does an octopus have ?,NUM,38
94,What is the oldest university in the US ?,HUM,28
384,How old was the youngest president of the Unit...,NUM,44


# Fine-Tuning GPT-3.5 Turbo on Iterative Label Increase

In [50]:
# Fine-tune five models on 10, 20, ..., 50 misclassified examples.
# NOTE(review): the training examples are drawn from df_test, the same split
# the evaluation cells score on — consider using the training split instead.
model_ids = []
label_count = 0
for i in range(5):
    # Grow the training set by 10 labels per round (the increment was missing,
    # so every round asked for 0 examples).
    label_count += 10
    ft_id = fine_tune_model(data=df_test,model_id='gpt-3.5-turbo-0301',num_label=label_count)
    # Wait once and reuse the result instead of polling the finished job again.
    fine_tuned_model = wait_for_fine_tuning(ft_id)
    if fine_tuned_model is not None:
        model_ids.append(fine_tuned_model)
        

text: What is a biosphere ?
Category: DESC
text: What is peyote ?
Category: DESC
text: How many hearts does an octopus have ?
Category: NUM
text: What is the oldest university in the US ?
Sorry, I cannot perform web searches as it goes beyond my capabilities as a language AI model. However, I can help you with any question that can be answered through my pre-existing knowledge.
text: How old was the youngest president of the United States ?
DESC
text: What is a prism ?
Category: DESC
text: What is the population of Venezuela ?
DESC
text: Where are the National Archives ?
DESC
text: What is a parasite ?
Category: DESC
text: When was the first liver transplant ?
Category: NUM
text: What is cryogenics ?
Category: DESC
text: What is compounded interest ?
Category: DESC
text: What is desktop publishing ?
The category most closely associated with the text "What is desktop publishing?" is DESC (Description).
text: What are the colors of the German flag ?
Category: DESC
text: What is solar win

In [83]:
# Result accumulators filled by the evaluation loops: each loop iteration
# appends one entry per evaluated model to every list.
accs = [] 
recalls = [] 
precisions = []
f1s = []
# Number of training labels associated with each evaluated model.
labels = []

In [82]:
# Show the fine-tuned model names collected in `model_ids`.
model_ids

['ft:gpt-3.5-turbo-0613:personal::8Lx9sXvQ',
 'ft:gpt-3.5-turbo-0613:personal::8LxJN5An',
 'ft:gpt-3.5-turbo-0613:personal::8LxSHvL9',
 'ft:gpt-3.5-turbo-0613:personal::8LxcpVnm',
 'ft:gpt-3.5-turbo-0613:personal::8LxoRznC']

# Evaluation of Fine-Tuned Models with Increasing Label Counts

In [1]:
# Score every model in `model_ids` on the test split. The i-th model is
# recorded with a label count of 10 * i (10, 20, ...).
label_count = 0
for model_id in model_ids:
    label_count = label_count + 10
    labels.append(label_count)
    accuracy, precision, recall, f1, pred = ft_accuracy(data=df_test, model_id=model_id)
    accs.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

'label_count = 0\nfor model_id in model_ids:\n    label_count += 10\n    labels.append(label_count)\n    accuracy, precision, recall, f1, pred = ft_accuracy(data=df_test,model_id=model_id)\n    accs.append(accuracy), precisions.append(precision), recalls.append(recall), f1s.append(f1)'

In [None]:
## Fine-Tuning GPT-3.5 Turbo on Iterative Label Increase
# Second batch: five models trained on 60, 70, ..., 100 misclassified examples.
# NOTE(review): the training examples are drawn from df_test, the same split
# the evaluation cells score on — consider using the training split instead.
model_ids2 = []
label_count = 50
for i in range(5):
    label_count += 10 
    ft_id = fine_tune_model(data=df_test,model_id='gpt-3.5-turbo-0301',num_label=label_count)
    # Wait once and reuse the result instead of polling the finished job again.
    fine_tuned_model = wait_for_fine_tuning(ft_id)
    if fine_tuned_model is not None:
        model_ids2.append(fine_tuned_model)

text: What is a biosphere ?
Category: DESC
text: What is peyote ?
CATEGORY: DESC
text: How many hearts does an octopus have ?
The category associated with the given text is 'NUM'.
text: What is the oldest university in the US ?
I'm sorry, but I cannot provide current information as my training data only goes up until 2021. As of 2021, the oldest university in the US is considered to be Harvard University, founded in 1636.
text: How old was the youngest president of the United States ?
The category associated with the text is 'NUM'.
text: What is a prism ?
Category: DESC
text: What is the population of Venezuela ?
DESC
text: Where are the National Archives ?
DESC
text: What is a parasite ?
Category: DESC
text: When was the first liver transplant ?
Category: NUM
text: What is cryogenics ?
DESC
text: What is compounded interest ?
Category: DESC
text: What is desktop publishing ?
Category: DESC
text: What are the colors of the German flag ?
DESC
text: What is solar wind ?
Category: DESC
te

In [81]:
# Show the fine-tuned model names collected in `model_ids2`.
model_ids2

['ft:gpt-3.5-turbo-0613:personal::8LzCuve6',
 'ft:gpt-3.5-turbo-0613:personal::8LzPDDw6',
 'ft:gpt-3.5-turbo-0613:personal::8LzYCZrj',
 'ft:gpt-3.5-turbo-0613:personal::8LzkAk4W',
 'ft:gpt-3.5-turbo-0613:personal::8M00ibGj']

In [1]:
# Evaluation of Fine-Tuned Models with Increasing Label Counts
# Second batch: the i-th model of `model_ids2` is recorded with a label
# count of 50 + 10 * i (60, 70, ...).
label_count = 50
for model_id in model_ids2:
    label_count = label_count + 10
    labels.append(label_count)
    accuracy, precision, recall, f1, pred = ft_accuracy(data=df_test, model_id=model_id)
    accs.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

'label_count = 50\nfor model_id in model_ids2:\n    label_count += 10\n    labels.append(label_count)\n    accuracy, precision, recall, f1, pred = ft_accuracy(data=df_test,model_id=model_id)\n    accs.append(accuracy), precisions.append(precision), recalls.append(recall), f1s.append(f1)'

In [None]:
## Fine-Tuning GPT-3.5 Turbo on Iterative Label Increase
# Third batch: five models trained on 110, 120, ..., 150 misclassified examples.
# NOTE(review): the training examples are drawn from df_test, the same split
# the evaluation cells score on — consider using the training split instead.
model_ids3 = []
label_count = 100
for i in range(5):
    label_count += 10 
    ft_id = fine_tune_model(data=df_test,model_id='gpt-3.5-turbo-0301',num_label=label_count)
    # Wait once and reuse the result instead of polling the finished job again.
    fine_tuned_model = wait_for_fine_tuning(ft_id)
    if fine_tuned_model is not None:
        model_ids3.append(fine_tuned_model)

In [None]:
# Evaluation of Fine-Tuned Models with Increasing Label Counts
# Third batch: the i-th model of `model_ids3` is recorded with a label
# count of 100 + 10 * i (110, 120, ...).
label_count = 100
for model_id in model_ids3:
    label_count += 10
    labels.append(label_count)
    # The test split is named `df_test`; `test_df` was never defined and
    # raised NameError on the first iteration.
    accuracy, precision, recall, f1, pred = ft_accuracy(data=df_test,model_id=model_id)
    accs.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)