<a href="https://colab.research.google.com/github/UrologyUnbound/SIOP_ML_2024_Discord/blob/main/colabs/Tune_Fairness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Identifying Fairness Perceptions Model Fine Tuning

The goal of this notebook is to fine tune the OpenAI GPT 3.5 model to be able to correctly identify which of two organizational policies received the majority vote as the fairer option.

## Inputs and Setup

In [None]:
!pip install --upgrade tiktoken openai

In [None]:
!pip install git+https://github.com/wandb/wandb.git@e688ecc9a816e12aef82878e2ab12befe678a3e6

In [2]:
import json
import openai
import os
import pandas as pd
from pprint import pprint
import tiktoken
from sklearn.model_selection import train_test_split
from google.colab import userdata
import numpy as np
from collections import defaultdict

# from wandb.integration.openai.fine_tuning import WandbLogger

client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
encoding = tiktoken.get_encoding("cl100k_base")

# WANDB_PROJECT = "OpenAI-Fairness-Fine-Tune"

In [3]:
#Estimated token counter
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

## Data Import and Prep

In [4]:
df_fairness = pd.read_csv("https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/train/fairness_train.csv")
df_fairness.head()

Unnamed: 0,_id,first_option,second_option,majority_vote
0,0,Conflict Resolution Workshops: We conduct regu...,Conflict Resolution Workbooks: Resources are p...,first
1,1,Conflict Resolution Peer Mentoring: Experience...,Diversity and Inclusion Training: Programs tha...,second
2,2,Mediation sessions are scheduled outside of re...,Employee Conflict Coaches: Coaches work one-on...,second
3,3,Our organization encourages employees to parti...,Conflict Simulation Exercises: Role-playing he...,first
4,4,Grievance Resolution Committee: A committee ad...,We provide employees with a comprehensive hand...,first


In [5]:
# Split the df_fairness dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df_fairness[["first_option", "second_option"]], df_fairness["majority_vote"], random_state=42, test_size=0.2, shuffle = True, stratify= df_fairness["majority_vote"])

train_data = pd.concat([x_train, y_train], axis=1)
test_data = pd.concat([x_test, y_test], axis=1)

train_data.head()

Unnamed: 0,first_option,second_option,majority_vote
21,We have an anonymous suggestion box that allow...,Our organization encourages employees to parti...,second
20,Employee Advocacy Programs: We have advocates ...,Managers are trained to identify and address c...,second
19,We recommend employees seek advice from their ...,We use technology-based solutions to facilitat...,first
23,We recommend employees start a support group t...,Grievance Resolution Committee: A committee ad...,second
2,Mediation sessions are scheduled outside of re...,Employee Conflict Coaches: Coaches work one-on...,second


In [6]:
system_message = """
Respondents compared two organizational policies (first_option and second_option) and voted on which was fairest.
Given the first and second options, your task is to predict which option received the majority vote as the perceived fairer option.
The output should not make up information and not reference these given instructions or context; only output the answer ("first" or "second").
"""

def create_user_message(row):
    # Create the user message with the first and second options
    user_message = f"First Option: {row['first_option']}\nSecond Option: {row['second_option']}"
    return user_message

def prepare_example_conversation(row):
    messages = []
    messages.append({"role": "system", "content": system_message})

    user_message = create_user_message(row)
    messages.append({"role": "user", "content": user_message})

    # Use the majority_vote column as the assistant's message
    messages.append({"role": "assistant", "content": row["majority_vote"]})

    return {"messages": messages}

pprint(prepare_example_conversation(train_data.iloc[0]))

{'messages': [{'content': '\n'
                          'Respondents compared two organizational policies '
                          '(first_option and second_option) and voted on which '
                          'they perceived as the fairest. \n'
                          'Given the first and second options, your task is to '
                          'predict which option received the majority vote as '
                          'the perceived fairer option. \n'
                          'The output should not make up information and not '
                          'reference these given instructions or context; only '
                          'output the answer ("first" or "second").\n',
               'role': 'system'},
              {'content': 'First Option: We have an anonymous suggestion box '
                          'that allows employees to express their concerns, '
                          'providing a confidential outlet for resolving '
                          'is

In [7]:
training_json = train_data.apply(prepare_example_conversation, axis=1).tolist()
test_json = test_data.apply(prepare_example_conversation, axis=1).tolist()


for example in training_json[:5]:
    print(example)

{'messages': [{'role': 'system', 'content': '\nRespondents compared two organizational policies (first_option and second_option) and voted on which they perceived as the fairest. \nGiven the first and second options, your task is to predict which option received the majority vote as the perceived fairer option. \nThe output should not make up information and not reference these given instructions or context; only output the answer ("first" or "second").\n'}, {'role': 'user', 'content': 'First Option: We have an anonymous suggestion box that allows employees to express their concerns, providing a confidential outlet for resolving issues.\nSecond Option: Our organization encourages employees to participate in leadership development programs, enhancing their ability to interact with supervisors.'}, {'role': 'assistant', 'content': 'second'}]}
{'messages': [{'role': 'system', 'content': '\nRespondents compared two organizational policies (first_option and second_option) and voted on which 

In [8]:
def write_jsonl(data_list: list, filename: str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)

In [9]:
training_file_name = "tmp_fairness_finetune_training.jsonl"
write_jsonl(training_json, training_file_name)

testing_file_name = "tmp_fairness_finetune_testing.jsonl"
write_jsonl(test_json, testing_file_name)

In [10]:
!head -n 5 tmp_fairness_finetune_training.jsonl

{"messages": [{"role": "system", "content": "\nRespondents compared two organizational policies (first_option and second_option) and voted on which they perceived as the fairest. \nGiven the first and second options, your task is to predict which option received the majority vote as the perceived fairer option. \nThe output should not make up information and not reference these given instructions or context; only output the answer (\"first\" or \"second\").\n"}, {"role": "user", "content": "First Option: We have an anonymous suggestion box that allows employees to express their concerns, providing a confidential outlet for resolving issues.\nSecond Option: Our organization encourages employees to participate in leadership development programs, enhancing their ability to interact with supervisors."}, {"role": "assistant", "content": "second"}]}
{"messages": [{"role": "system", "content": "\nRespondents compared two organizational policies (first_option and second_option) and voted on wh

### Pre-Tuning Checks

In [11]:
# Format error checks - Training set
with open("/content/tmp_fairness_finetune_testing.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [12]:
# Format error checks - Training set
with open("/content/tmp_fairness_finetune_training.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [13]:
# Warnings and tokens counts
with open("/content/tmp_fairness_finetune_testing.jsonl", 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 130, 142
mean / median: 133.2, 131.0
p5 / p95: 130.4, 138.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 1, 1
mean / median: 1.0, 1.0
p5 / p95: 1.0, 1.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [14]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print(f"Estimated training cost ~${((n_epochs * n_billing_tokens_in_dataset)/1000)*.0080}")

Dataset has ~666 tokens that will be charged for during training
By default, you'll train for 20 epochs on this dataset
By default, you'll be charged for ~13320 tokens
Estimated training cost ~$0.10656


In [15]:
with open(training_file_name, "rb") as training_fd:
    training_response = client.files.create(
        file=training_fd, purpose="fine-tune"
    )

training_file_id = training_response.id

with open(testing_file_name, "rb") as validation_fd:
    validation_response = client.files.create(
        file=validation_fd, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-nAW0mBN178wkrXc33PLvkneT
Validation file ID: file-advv813Rr3RQpiPViQm8w9rT


## Fine Tuning

In [20]:
# Only Run this cell when wanting to create a new fine-tuning job, otherwise you will be paying to redo work

#Uncomment the below code when wanting to run a new fine-tuning job
# response = client.fine_tuning.jobs.create(
#     training_file=training_file_id,
#     validation_file=validation_file_id,
#     model="gpt-3.5-turbo",
#     hyperparameters = {
#         "n_epochs":5,
#         "batch_size":"auto",
#         "learning_rate_multiplier":2
#         },
#     suffix="fairness_tuned_v3",
# )

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-eXdn7H8ACLnKczlAdntyMjgo
Status: validating_files


In [None]:
# WandbLogger.sync(fine_tune_job_id=job_id, project=WANDB_PROJECT, openai_client=client)

In [21]:
#Check Job Status
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
print("Trained Tokens:", response.trained_tokens)


Job ID: ftjob-eXdn7H8ACLnKczlAdntyMjgo
Status: succeeded
Trained Tokens: 12345


In [22]:
#Track Fine-Tuning Endpoints
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Step 81/95: training loss=0.00, validation loss=0.00
Step 82/95: training loss=0.00, validation loss=0.00
Step 83/95: training loss=0.00, validation loss=0.00
Step 84/95: training loss=0.00, validation loss=5.84
Step 85/95: training loss=0.00, validation loss=0.00
Step 86/95: training loss=0.00, validation loss=0.00
Step 87/95: training loss=0.00, validation loss=0.00
Step 88/95: training loss=0.00, validation loss=0.00
Step 89/95: training loss=0.00, validation loss=5.73
Step 90/95: training loss=0.00, validation loss=0.00
Step 91/95: training loss=0.00, validation loss=0.00
Step 92/95: training loss=0.00, validation loss=0.00
Step 93/95: training loss=0.00, validation loss=0.00
Step 94/95: training loss=0.00, validation loss=5.73
Step 95/95: training loss=0.00, validation loss=0.00, full validation loss=0.63
New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:fairness-tuned-v3:9AQZOWrp
Checkpoint created at step 57 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:fairness-tu

In [23]:
# When job is done, run to fets fine-tuned model id
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

if fine_tuned_model_id is None:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:fairness-tuned-v3:9AQZOWrp


## Fine-Tuned Model Testing

In [24]:
df_fairness_dev = pd.read_csv("https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/dev/fairness_val_public.csv")

In [25]:
test_row = df_fairness_dev.iloc[2]
test_messages = []
test_messages.append({"role": "system", "content": system_message})
user_message = create_user_message(test_row)
test_messages.append({"role": "user", "content": user_message})

pprint(test_messages)

[{'content': '\n'
             'Respondents compared two organizational policies (first_option '
             'and second_option) and voted on which they perceived as the '
             'fairest. \n'
             'Given the first and second options, your task is to predict '
             'which option received the majority vote as the perceived fairer '
             'option. \n'
             'The output should not make up information and not reference '
             'these given instructions or context; only output the answer '
             '("first" or "second").\n',
  'role': 'system'},
 {'content': 'First Option: Employee Support Groups: Peer-led groups offer '
             'support for those experiencing similar issues.\n'
             'Second Option: We encourage employees to use email communication '
             'to address issues with their supervisor, maintaining a paper '
             'trail.',
  'role': 'user'}]


In [26]:
response = client.chat.completions.create(
    model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=500
)
print(response.choices[0].message.content)

first
