<a href="https://colab.research.google.com/github/UrologyUnbound/SIOP_ML_2024_Discord/blob/main/colabs/Tune_Clarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rating Item Clarity Model Fine Tuning

The goal of this notebook is to fine tune the OpenAI GPT 3.5 model to be able to predict the average clarity rating for each personality item based on responses.

## Inputs and Setup

In [1]:
!pip install --upgrade tiktoken openai

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.16.1-py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.9/266.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.

In [2]:
!pip install git+https://github.com/wandb/wandb.git@e688ecc9a816e12aef82878e2ab12befe678a3e6

Collecting git+https://github.com/wandb/wandb.git@e688ecc9a816e12aef82878e2ab12befe678a3e6
  Cloning https://github.com/wandb/wandb.git (to revision e688ecc9a816e12aef82878e2ab12befe678a3e6) to /tmp/pip-req-build-qb96crph
  Running command git clone --filter=blob:none --quiet https://github.com/wandb/wandb.git /tmp/pip-req-build-qb96crph
  Running command git rev-parse -q --verify 'sha^e688ecc9a816e12aef82878e2ab12befe678a3e6'
  Running command git fetch -q https://github.com/wandb/wandb.git e688ecc9a816e12aef82878e2ab12befe678a3e6
  Running command git checkout -q e688ecc9a816e12aef82878e2ab12befe678a3e6
  Resolved https://github.com/wandb/wandb.git to commit e688ecc9a816e12aef82878e2ab12befe678a3e6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb==0.16.6.dev1)
 

In [3]:
import json
import openai
import os
import pandas as pd
from pprint import pprint
import tiktoken
from sklearn.model_selection import train_test_split
from google.colab import userdata
import numpy as np
from collections import defaultdict
from wandb.integration.openai.fine_tuning import WandbLogger


client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
encoding = tiktoken.get_encoding("cl100k_base")

WANDB_PROJECT = "OpenAI-Clarity-Fine-Tune"

In [4]:
#Estimated token counter
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

## Data Import and Prep

In [5]:
df_clarity = pd.read_csv("https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/train/clarity_train.csv")
df_clarity.clarity = df_clarity.clarity.astype(str)
df_clarity.head()

Unnamed: 0,_id,personality_item,clarity
0,0,Am considered well-off financially.,3.421052631578947
1,1,Make problems bigger than they are.,6.545454545454546
2,2,Judge people by their appearance.,6.545454545454546
3,3,"Did not feel like eating, even though I should...",3.75
4,4,Feel that very few merchants take advantage of...,5.210526315789473


In [6]:
df_clarity.dtypes

_id                  int64
personality_item    object
clarity             object
dtype: object

In [7]:
df_clarity.iloc[df_clarity.personality_item.str.len().idxmax()].personality_item

'Believe that one needs to show their talents and abilities in order to get opportunities and make progress.'

In [9]:
x_train,x_test,y_train,y_test = train_test_split(df_clarity["personality_item"],df_clarity["clarity"],random_state=42,test_size = 0.2, shuffle = True)
train_data = pd.concat([x_train , y_train], axis = 1)
test_data = pd.concat([x_train , y_train], axis = 1)
train_data.head()


Unnamed: 0,personality_item,clarity
28,Work longer hours than most people.,6.375
24,Am able to work hard to achieve results that I...,3.538461538461538
12,Believe that one needs to show their talents a...,5.25
0,Am considered well-off financially.,3.421052631578947
4,Feel that very few merchants take advantage of...,5.210526315789473


In [10]:
system_message = """
Your task is to predict the average clarity rating for each item based on the responses. Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. The output should not make up information and not reference these given instructions or context; only output the answer.
"""

def create_user_message(row):
    return row['personality_item']


def prepare_example_conversation(row):
    messages = []
    messages.append({"role": "system", "content": system_message})

    user_message = create_user_message(row)
    messages.append({"role": "user", "content": user_message})

    messages.append({"role": "assistant", "content": row["clarity"]})

    return {"messages": messages}


pprint(prepare_example_conversation(train_data.iloc[0]))

{'messages': [{'content': '\n'
                          'Your task is to predict the average clarity rating '
                          'for each item based on the responses. Respondents '
                          'rated the clarity of personality test items using a '
                          '7-point scale from 1 = extremely unclear to 7 = '
                          'extremely clear. The output should not make up '
                          'information and not reference these given '
                          'instructions or context; only output the answer.\n',
               'role': 'system'},
              {'content': 'Work longer hours than most people.',
               'role': 'user'},
              {'content': '6.375', 'role': 'assistant'}]}


In [11]:
training_json = train_data.apply(prepare_example_conversation, axis=1).tolist()
test_json = test_data.apply(prepare_example_conversation, axis=1).tolist()


for example in training_json[:5]:
    print(example)

{'messages': [{'role': 'system', 'content': '\nYour task is to predict the average clarity rating for each item based on the responses. Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. The output should not make up information and not reference these given instructions or context; only output the answer.\n'}, {'role': 'user', 'content': 'Work longer hours than most people.'}, {'role': 'assistant', 'content': '6.375'}]}
{'messages': [{'role': 'system', 'content': '\nYour task is to predict the average clarity rating for each item based on the responses. Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. The output should not make up information and not reference these given instructions or context; only output the answer.\n'}, {'role': 'user', 'content': 'Am able to work hard to achieve results that I will only get at a time far in

In [12]:
def write_jsonl(data_list: list, filename: str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)


In [13]:
training_file_name = "tmp_clarity_finetune_training.jsonl"
write_jsonl(training_json, training_file_name)

testing_file_name = "tmp_clarity_finetune_testing.jsonl"
write_jsonl(test_json, testing_file_name)

In [14]:
!head -n 5 tmp_clarity_finetune_training.jsonl

{"messages": [{"role": "system", "content": "\nYour task is to predict the average clarity rating for each item based on the responses. Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. The output should not make up information and not reference these given instructions or context; only output the answer.\n"}, {"role": "user", "content": "Work longer hours than most people."}, {"role": "assistant", "content": "6.375"}]}
{"messages": [{"role": "system", "content": "\nYour task is to predict the average clarity rating for each item based on the responses. Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. The output should not make up information and not reference these given instructions or context; only output the answer.\n"}, {"role": "user", "content": "Am able to work hard to achieve results that I will only get at a time far in

### Pre-Tuning Checks

In [15]:
# Format error checks - Training set
with open("/content/tmp_clarity_finetune_testing.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [16]:
# Format error checks - Training set
with open("/content/tmp_clarity_finetune_training.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [17]:
# Warnings and tokens counts
with open("/content/tmp_clarity_finetune_testing.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 90, 110
mean / median: 96.25, 95.5
p5 / p95: 91.3, 103.5

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 7
mean / median: 5.375, 7.0
p5 / p95: 3.0, 7.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [18]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print(f"Estimated training cost ~${((n_epochs * n_billing_tokens_in_dataset)/1000)*.0080}")

Dataset has ~2310 tokens that will be charged for during training
By default, you'll train for 4 epochs on this dataset
By default, you'll be charged for ~9240 tokens
Estimated training cost ~$0.07392


In [19]:
with open(training_file_name, "rb") as training_fd:
    training_response = client.files.create(
        file=training_fd, purpose="fine-tune"
    )

training_file_id = training_response.id

with open(testing_file_name, "rb") as validation_fd:
    validation_response = client.files.create(
        file=validation_fd, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-IjNga4e46wKd36Y5iQIQISa8
Validation file ID: file-szRh8KZx8j316qStjAkQDCHe


## Fine Tuning

In [20]:
# Only Run this cell when wanting to create a new fine-tuning job, otherwise you will be paying to redo work

# Uncomment the below code when wanting to run a new fine-tuning job
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-3.5-turbo",
      hyperparameters = {"n_epochs":3, "batch_size":"auto", "learning_rate_multiplier":2},
    suffix="clarity_tuned_v2",
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-AKLbMNJ7ApVamfmk9xpW4PXN
Status: validating_files


In [21]:
WandbLogger.sync(fine_tune_job_id=job_id, project=WANDB_PROJECT, openai_client=client)

[34m[1mwandb[0m: Retrieving fine-tune job...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mz-markofsky[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to finish training...
[34m[1mwandb[0m: To avoid blocking, you can call `WandbLogger.sync` with `wait_for_job_success=False` after OpenAI training completes.
[34m[1mwandb[0m: Fine-tuning finished, logging metrics, model metadata, and run metadata to Weights & Biases
[34m[1mwandb[0m: Logging training/validation files...


VBox(children=(Label(value='0.084 MB of 0.084 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▂▁▃▄▂▄▅▅▇▆▆▄▅▆▆▅▆▆▆▄▅▆█▆▇▄▅▇▄▇▆▆▄▇▆▆█▆▄█
train_loss,█▇▆▂▇▄▇▄▂▃▂▂▂▃▂▃▃▁▂▂▃▂▂▂▂▂▃▂▃▂▂▁▂▂▃▁▁▂▂▁
valid_loss,▇██▂▆▄▅▄▁▄▃▃▂▂▃▂▂▁▂▂▂▂▃▁▁▂▁▂▁▂▁▂▂▂▂▁▁▁▂▁
valid_mean_token_accuracy,▁▃▅▄▃▆▃▄▇▅▅▄▇▆▅▆▄▇█▆▄▇▅█▆▇▆▆▇█▇█▆▄▄▆█▆▇▆

0,1
fine_tuned_model,ft:gpt-3.5-turbo-012...
status,succeeded
train_accuracy,0.88889
train_loss,0.7146
valid_loss,0.5747
valid_mean_token_accuracy,0.77778


'🎉 wandb sync completed successfully'

In [25]:
WandbLogger.sync(fine_tune_job_id="ftjob-nZm5N643P9pHXqzQphu5KrwE", project=WANDB_PROJECT, openai_client=client)

[34m[1mwandb[0m: Retrieving fine-tune job...


[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to finish training...
[34m[1mwandb[0m: To avoid blocking, you can call `WandbLogger.sync` with `wait_for_job_success=False` after OpenAI training completes.
[34m[1mwandb[0m: Fine-tuning finished, logging metrics, model metadata, and run metadata to Weights & Biases
[34m[1mwandb[0m: Logging training/validation files...


VBox(children=(Label(value='0.084 MB of 0.084 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▃▂▅▁▆▆▅▅▄▄▄▅▅▄▇█▄█▆▆▄▇▄▆▆█▆▆▄▆▆▆▆▄▆▄▄██▇
train_loss,████▇▄▄▄▅▄▂▃▄▃▂▁▄▂▂▂▃▂▂▂▂▂▁▂▂▂▂▂▁▃▂▂▂▂▁▂
valid_loss,█▄▆▆▅▄▁▂▃▂▃▂▂▂▂▂▂▂▂▂▃▂▂▂▂▂▁▁▂▂▂▂▂▂▂▂▁▁▂▂
valid_mean_token_accuracy,▁▄▆▅▆▁▄█▅▄▁▄▄▆▆▁▄█▆▆▁▅▄█▆▁▆█▁▆▆▅▄█▆▄▆▆▁▆

0,1
fine_tuned_model,ft:gpt-3.5-turbo-012...
status,succeeded
train_accuracy,0.77778
train_loss,0.53928
valid_loss,0.52384
valid_mean_token_accuracy,0.55556


'🎉 wandb sync completed successfully'

In [22]:
#Check Job Status
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
print("Trained Tokens:", response.trained_tokens)


Job ID: ftjob-AKLbMNJ7ApVamfmk9xpW4PXN
Status: succeeded
Trained Tokens: 6786


In [23]:
#Track Fine-Tuning Endpoints
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Step 55/72: training loss=0.83, validation loss=0.68
Step 56/72: training loss=0.71, validation loss=0.80
Step 57/72: training loss=0.60, validation loss=0.94
Step 58/72: training loss=0.86, validation loss=0.48
Step 59/72: training loss=0.95, validation loss=0.81
Step 60/72: training loss=0.56, validation loss=0.85
Step 61/72: training loss=0.77, validation loss=1.02
Step 62/72: training loss=1.26, validation loss=0.95
Step 63/72: training loss=0.98, validation loss=0.91
Step 64/72: training loss=0.73, validation loss=0.55
Step 65/72: training loss=0.70, validation loss=1.55
Step 66/72: training loss=0.48, validation loss=0.72
Step 67/72: training loss=0.93, validation loss=0.72
Step 68/72: training loss=1.20, validation loss=0.67
Step 69/72: training loss=1.00, validation loss=1.24
Step 70/72: training loss=0.98, validation loss=0.97
Step 71/72: training loss=1.55, validation loss=1.24
Step 72/72: training loss=0.71, validation loss=0.57
New fine-tuned model created: ft:gpt-3.5-turbo

In [24]:
# When job is done, run to fets fine-tuned model id
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

if fine_tuned_model_id is None:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:clarity-tuned-v2:99lDHEKR


## Fine-Tuned Model Testing

In [None]:
df_clarity_dev = pd.read_csv("https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/dev/clarity_val_public.csv")

In [None]:
test_row = df_clarity_dev.iloc[2]
test_messages = []
test_messages.append({"role": "system", "content": system_message})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": user_message})

pprint(test_messages)

[{'content': '\n'
             'Your task is to predict the average clarity rating for each item '
             'based on the responses. Respondents rated the clarity of '
             'personality test items using a 7-point scale from 1 = extremely '
             'unclear to 7 = extremely clear. The output should not make up '
             'information and not reference these given instructions or '
             'context; only output the answer.\n',
  'role': 'system'},
 {'content': 'Have no sympathy for rule-breakers.', 'role': 'user'}]


In [None]:
response = client.chat.completions.create(
    model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=500
)
print(response.choices[0].message.content)

5.0
