In [1]:
import json
import os

import numpy as np
import openai
import pandas as pd


In [4]:
# Textual description of the contest 866 cartoon, inserted into the prompts sent to the model.
# The adjacent literals below are concatenated into one single-line string.
description_866 = (
    "The image is a black and white cartoon featuring an anthropomorphic Earth "
    "with a worried expression, gesturing to a bald man who looks on with a "
    "neutral face. Both are standing at a bar, separated by the counter, and a "
    "small cup sits next to Earth."
)


# Sample a fixed number of captions per label and save them as a chat-format JSONL file
def process_before_fine_tune(data, file_path='jsonl_data/jsonl_data_866_100_GPT3_5.jsonl', funny=10, somewhat_funny=30, not_funny=60):
    """Draw a per-label sample of captions and write it as fine-tuning JSONL.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'caption' column and a 'label' column. Rows are
        selected by the label values 'funny', 'somewhat_funny' and
        'not_funny'.
    file_path : str
        Destination of the JSONL file. Missing parent directories are created.
    funny, somewhat_funny, not_funny : int
        Number of rows to sample (without replacement) from each label.

    Returns
    -------
    str or None
        ``file_path`` once the file has been written. If any label has fewer
        rows than requested, a message naming the first such label is printed,
        nothing is written and None is returned.
    """
    requested = {'funny': funny, 'somewhat_funny': somewhat_funny, 'not_funny': not_funny}
    rows_by_label = {label: data[data['label'] == label] for label in requested}

    # Validate every request before sampling; only the first shortfall is reported.
    for label, count in requested.items():
        if count > len(rows_by_label[label]):
            print(f'{label} captions is not enough')
            return None

    # A local RandomState seeded with 42 keeps the draw reproducible without
    # reseeding NumPy's global generator as a side effect of calling this function.
    rng = np.random.RandomState(42)
    sampled = pd.concat(
        [rows_by_label[label].sample(count, random_state=rng) for label, count in requested.items()]
    )

    # Prompt wording (including its spelling) is kept verbatim: rate_funniness
    # sends the same text at inference time, so the two must stay identical.
    description_866 = "The image is a black and white cartoon featuring an anthropomorphic Earth with a worried expression, gesturing to a bald man who looks on with a neutral face. Both are standing at a bar, separated by the counter, and a small cup sits next to Earth."
    system_prompt = "You are a AI assitant to rating the funniness of a caption for a cartoon"

    # Build one JSON document per sampled row. Iterating over the columns (rather
    # than DataFrame.apply(axis=1)) also behaves correctly when the sample is empty.
    jsonl_lines = []
    for caption, label in zip(sampled['caption'], sampled['label']):
        record = {"messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"This the description for a cartoon: {description_866} This is the caption for the catoon: {caption}. Please rate the funniness of the caption."},
            {"role": "assistant", "content": f"The caption is {label}"},
        ]}
        jsonl_lines.append(json.dumps(record))

    # Make sure the target directory exists so a fresh checkout does not fail on open().
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in jsonl_lines)

    return file_path


# Approximate the token count of the user prompts in a JSONL fine-tuning file
def token_count(jsonl_file_path):
    """Print and return a rough token count for a chat-format JSONL file.

    The figure is the number of whitespace-separated words in the second
    message (index 1) of every record. It is an approximation, not the
    output of a model tokenizer, and the other messages in each record are
    not counted.

    Parameters
    ----------
    jsonl_file_path : str
        Path to a file with one JSON object per line, each holding a
        'messages' list.

    Returns
    -------
    int
        The total word count, which is also printed as "Token count: N".
    """
    total_words = 0
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline added by an editor) carry no record.
            if not line.strip():
                continue
            user_content = json.loads(line)['messages'][1]['content']
            total_words += len(user_content.split())
    print(f"Token count: {total_words}")
    return total_words

    
# Calculate the per-label accuracy of the predicted ratings
def accuracy(file_path):
    """Compute, for each label, the share of rows whose prediction matches.

    Parameters
    ----------
    file_path : str
        CSV file with a 'label' column and a 'pred_rating' column. Each
        prediction is reduced to a bare label by removing the
        'The caption is ' prefix, any '.' characters and surrounding
        whitespace.

    Returns
    -------
    dict
        Maps 'funny', 'somewhat_funny' and 'not_funny' to the fraction of
        rows with that true label that were predicted correctly. A label
        with no rows in the file maps to NaN instead of raising
        ZeroDivisionError.
    """
    data = pd.read_csv(file_path)

    # str() guards against missing predictions, which read_csv loads as float NaN;
    # they become 'nan' and are simply counted as wrong.
    data['pred_rating_label'] = data['pred_rating'].apply(
        lambda x: str(x).replace('The caption is ', '').replace('.', '').strip()
    )

    accuracy_by_label = {}
    for label in ('funny', 'somewhat_funny', 'not_funny'):
        label_rows = data[data['label'] == label]
        if len(label_rows) == 0:
            # No ground-truth rows for this label: accuracy is undefined.
            accuracy_by_label[label] = float('nan')
            continue
        correct = int((label_rows['label'] == label_rows['pred_rating_label']).sum())
        accuracy_by_label[label] = correct / len(label_rows)

    return accuracy_by_label


# Use the fine-tuned model to rate the funniness of captions in the test set
def rate_funniness(test_file_path, model, output_file_path):
    """Ask a chat model to rate every test caption and save the answers.

    Parameters
    ----------
    test_file_path : str
        CSV file with a 'caption' column.
    model : str
        Model identifier passed to ``openai.chat.completions.create``.
    output_file_path : str
        Where the input rows plus a 'pred_rating' column (the raw text of
        the first choice of each response) are written, without the index.

    Raises
    ------
    RuntimeError
        If no API key is available from the OPENAI_API_KEY environment
        variable or from an already configured ``openai.api_key``.

    Notes
    -----
    Reads the module-level ``description_866`` for the cartoon description.
    """
    # Never hard-code credentials: take the key from the environment, falling
    # back to one that was already configured on the openai module.
    api_key = os.environ.get("OPENAI_API_KEY") or openai.api_key
    if not api_key:
        raise RuntimeError("No OpenAI API key found: set the OPENAI_API_KEY environment variable")
    openai.api_key = api_key

    # Load the test data
    test_data = pd.read_csv(test_file_path)

    # Prompt wording (including its spelling) is kept verbatim so it matches
    # the text the training file was built with.
    system_message = {"role": "system", "content": "You are a AI assitant to rating the funniness of a caption for a cartoon"}

    # Iterate over the captions themselves instead of positional labels, so the
    # loop does not depend on the frame having a default 0..n-1 index.
    predictions = []
    for caption in test_data['caption']:
        response = openai.chat.completions.create(
            model=model,
            messages=[
                system_message,
                {"role": "user", "content": f"This the description for a cartoon: {description_866} This is the caption for the catoon: {caption}. Please rate the funniness of the caption."},
            ],
        )
        predictions.append(response.choices[0].message.content)

    test_data['pred_rating'] = predictions

    # Save the result
    test_data.to_csv(output_file_path, index=False)
        

In [2]:
# Load the training data for contest 866
train_file_path = '../../data/data_of_contest_866/866_train.csv'  # Replace with your file path
data = pd.read_csv(train_file_path)

In [87]:
# Sample captions from the contest 866 training data and write them as fine-tuning JSONL.
# The destination must be passed as `file_path`; the function has no `file_name` parameter.
# NOTE(review): the file name says 200, but 40 + 30 + 30 = 100 captions are sampled — confirm the intended name.
process_before_fine_tune(data, file_path='jsonl_data/jsonl_data_866_200_GPT3_5.jsonl', funny=40, somewhat_funny=30, not_funny=30)

In [10]:
# Report the approximate token count of the sampled training file
token_count(jsonl_file_path='jsonl_data/jsonl_data_866_200_GPT3_5.jsonl')

Token count: 15383
Cost: $0.3830367


### Fine-tune the model on the OpenAI website

In [5]:
# Rate the funniness of the test-set captions with the fine-tuned model
rate_funniness(
    test_file_path='../../data/data_of_contest_866/866_test.csv',
    model='ft:gpt-3.5-turbo-1106:personal::8O8xIVLy',
    output_file_path='result/866_100_pred_GPT3_5_433.csv',
)

In [6]:
# Store the result under its own name: assigning it to `accuracy` would replace
# the function with a dict and make this cell fail when it is run a second time.
accuracy_by_label = accuracy('result/866_100_pred_GPT3_5_433.csv')
print(accuracy_by_label)

{'funny': 0.6, 'somewhat_funny': 0.16176470588235295, 'not_funny': 0.17708333333333334}
