In [1]:
import os
import openai
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import time

In [2]:
# Read the API key from the OPENAI_API_KEY environment variable so the real
# secret never has to be typed into (and saved with) the notebook. The
# placeholder string is kept as the fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Enter API KEY")

In [2]:
# Load seed.csv as the training frame and test.csv as the test frame.
train_df = pd.read_csv('seed.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# Show the test frame; the notebook renders a truncated preview of it.
test_df

Unnamed: 0,example,label
0,Seller: Hi are you interested in buying my Pin...,furniture
1,Buyer: Hello I am interested in your property ...,housing
2,Buyer: Hello . How long have you owned the dre...,furniture
3,Buyer: I am very interested place you have for...,housing
4,"Buyer: Hey, nice car you have here, how long h...",car
...,...,...
995,Buyer: Hi! Seller: Hello. How are you? Buyer:...,furniture
996,Seller: Hi how are you? Buyer: I'm wonderful! ...,electronics
997,Buyer: hello I am interested in the yukon you ...,car
998,"Seller: Hi there, are you interested in my pro...",housing


## Combining DataFrames

In the following operation, the `train_df` and `test_df` DataFrames are combined by stacking them vertically along the rows (axis=0).

```python
combine_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
```

Combining the two DataFrames ensures I'm able to get all categories.


In [5]:
# Stack the train and test rows into one frame so that every label occurring
# in either split is available to later cells. ignore_index=True renumbers the
# rows 0..n-1; without it each split keeps its own index and the combined
# frame can contain repeated index labels.
combine_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [6]:
# Distinct values of the 'label' column of the combined frame.
categories = combine_df["label"].unique()

# Let's create a `text_to_openai_json` function

In [8]:
def text_to_openai_json(data,filename):
    """
    Write `data` as a chat-format JSON Lines (JSONL) file, one conversation per row.

    Args:
        data: DataFrame-like object exposing ``iterrows()``; each row must
            provide an 'example' entry (the text to classify) and a 'label'
            entry (the expected category).
        filename: Path of the JSONL file to write. It is opened in "w" mode,
            so any existing content is replaced.

    Every output line is a JSON object of the form
    ``{"messages": [system, user, assistant]}``: the system message carries the
    classification instruction built from the module-level ``categories``, the
    user message holds the row's 'example', and the assistant message holds the
    row's 'label'.
    """
    def build_conversation(row):
        # Instruction that lists the known categories and asks for a bare category name
        instruction = {
            "role": "system",
            "content": f"given the following text: find the category in: {categories} that is most closely associated with it. Return only the category name"
        }
        # The text to classify comes from the 'example' column
        prompt = {
            "role": "user",
            "content": row['example']
        }
        # The expected answer comes from the 'label' column
        answer = {
            "role": 'assistant',
            "content": row['label']
        }
        return {"messages": [instruction, prompt, answer]}

    # Build every conversation before touching the output file
    conversations = [build_conversation(row) for _, row in data.iterrows()]

    # One JSON document per line
    with open(filename, "w") as out_file:
        for conversation in conversations:
            json.dump(conversation, out_file)
            out_file.write("\n")



# Few Shot Prompting


In [9]:
# System prompt sent with every inference request. It is word-for-word the
# system message that text_to_openai_json writes into the fine-tuning file, so
# fine-tuned models are queried with the same instruction they were trained on,
# and it no longer announces a "following format" that is never given.
system_content = f"given the following text: find the category in: {categories} that is most closely associated with it. Return only the category name"

In [10]:
from sklearn.metrics import precision_recall_fscore_support

In [20]:
def zero_shot_model(data,model_id):
    """
    Classify every text in ``data["example"]`` with a chat model, one request per text.

    Args:
        data: DataFrame with 'example' and 'label' columns.
        model_id: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        DataFrame with the columns 'example', 'label' and
        'few-shot predictions' (the raw reply text for each row).

    Each text and the model's reply are printed as they are processed.
    """
    replies = []
    for text in data["example"]:
        # System prompt plus the text to classify as the single user turn
        response = openai.ChatCompletion.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": text},
            ],
        )
        answer = response.choices[0].message.content

        print(f'text: {text}')
        print(answer)
        replies.append(answer)

    return pd.DataFrame({
        'example': data["example"],
        'label': data['label'],
        'few-shot predictions': replies,
    })

In [21]:
# Name of the base chat model.
model_id = "gpt-3.5-turbo-0301"
# Empty list, presumably meant to collect model ids (not referenced in the
# cells visible here).
all_model_id = list()
# Counter starting at zero (not referenced in the cells visible here).
cumulative_increment = 0

In [22]:
def fine_tune_model(model_id,num_label,pred_df):
    """
    Start a fine-tuning job on the first `num_label` misclassified examples.

    Args:
        model_id: Accepted for call compatibility but not used; the job is
            always created on the "gpt-3.5-turbo" base model.
        num_label: Maximum number of misclassified rows to train on.
        pred_df: DataFrame with 'example', 'label' and 'few-shot predictions'
            columns (the shape returned by zero_shot_model).

    Returns:
        The id of the created fine-tuning job.

    Side effects:
        Writes ``ft_increment_{num_label}.jsonl`` to the working directory and
        uploads it with purpose 'fine-tune'.
    """
    # A row is a mistake when the model's answer differs from the gold label.
    # (Comparing 'example' with 'label' would select every row, because the
    # conversation text never equals its category name.)
    is_wrong = pred_df['few-shot predictions'] != pred_df['label']
    incorrect_pred_df = pred_df[is_wrong][:num_label]

    filename = f'ft_increment_{num_label}.jsonl'
    text_to_openai_json(incorrect_pred_df, filename)

    # Use a context manager so the upload handle is closed once the file is sent.
    with open(filename, "rb") as training_file:
        loader = openai.File.create(file=training_file, purpose='fine-tune')

    fine_tuning_job = openai.FineTuningJob.create(training_file=loader.id, model="gpt-3.5-turbo")
    return fine_tuning_job.id

In [23]:
def wait_for_fine_tuning(job_id, poll_interval=60):
    """
    Block until a fine-tuning job finishes and return the resulting model name.

    Args:
        job_id: Id of the fine-tuning job to poll.
        poll_interval: Seconds to sleep between polls (default 60).

    Returns:
        The fine-tuned model name once the job reports one, or None if the job
        ends with status 'failed' or 'cancelled'.
    """
    while True:
        response = openai.FineTuningJob.retrieve(job_id)

        fine_tuned_model = response["fine_tuned_model"]
        if fine_tuned_model:
            print(fine_tuned_model)
            return fine_tuned_model

        # A failed or cancelled job never gets a model name; stop polling
        # instead of looping forever.
        status = response.get("status")
        if status in ("failed", "cancelled"):
            print(f"Fine-tuning job {job_id} ended with status: {status}")
            return None

        time.sleep(poll_interval)

In [24]:
def ft_accuracy(data,model_id):
    """
    Query `model_id` for every text in ``data["example"]`` and score the replies
    against ``data['label']``.

    Args:
        data: DataFrame with 'example' and 'label' columns.
        model_id: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, pred)`` where precision,
        recall and f1 are macro-averaged (``zero_division=1``) and ``pred`` is
        the list of raw reply texts, one per row.

    Each text, the model's reply and the final accuracy are printed.
    """
    predictions = []
    for text in data["example"]:
        # System prompt plus the text to classify as the single user turn
        response = openai.ChatCompletion.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": text},
            ],
        )
        answer = response.choices[0].message.content

        print(f'example: {text}')
        print(answer)
        predictions.append(answer)

    gold = data['label']

    accuracy = accuracy_score(gold, predictions)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, predictions, average='macro', zero_division=1
    )

    return accuracy, precision, recall, f1, predictions



In [25]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,example,label
0,Seller: Hi are you interested in buying my Pin...,furniture
1,Buyer: Hello I am interested in your property ...,housing
2,Buyer: Hello . How long have you owned the dre...,furniture
3,Buyer: I am very interested place you have for...,housing
4,"Buyer: Hey, nice car you have here, how long h...",car


In [1]:
# Baseline run: classify every row of the test frame with the base chat model
# (one API request per row).
pred_df = zero_shot_model(data=test_df, model_id='gpt-3.5-turbo-0301')

"pred_df = zero_shot_model(model_id='gpt-3.5-turbo-0301', data = test_df)"

# Fine-Tuning GPT-3.5 Turbo with an Iteratively Increasing Number of Labels

In [27]:
# Fine-tune with a growing example budget (10, 20, ..., 150) and evaluate each
# resulting model on the test frame. Results are collected in parallel lists,
# one entry per round that produced a fine-tuned model.
model_ids = []
accs = []
precisions = []
recalls = []
f1s = []
label_count = 0
for i in range(15):
    label_count += 10
    ft_id = fine_tune_model(model_id = 'gpt-3.5-turbo-0301', num_label=label_count, pred_df=pred_df)
    # Resolve the fine-tuned model name once and reuse it: every call to
    # wait_for_fine_tuning polls the API again.
    ft_model = wait_for_fine_tuning(ft_id)
    if ft_model is not None:
        model_ids.append(ft_model)
        accuracy, precision, recall, f1, pred = ft_accuracy(data=test_df, model_id=ft_model)
        accs.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
        

None
