In [1]:
!pip install -U openai

Collecting openai
  Downloading openai-1.33.0-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.33.0-py3-none-any.whl (325 kB)
   ---------------------------------------- 0.0/325.5 kB ? eta -:--:--
   ---------- ----------------------------- 81.9/325.5 kB 2.3 MB/s eta 0:00:01
   ---------------------------------------- 325.5/325.5 kB 5.1 MB/s eta 0:00:00
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.30.3
    Uninstalling openai-1.30.3:
      Successfully uninstalled openai-1.30.3
Successfully installed openai-1.33.0


In [2]:
import pandas as pd
# Load the labelled support queries and preview the first five rows
# (five is the default row count for head()).
df = pd.read_csv("bank_support_train.csv")
df.head()

Unnamed: 0,Support Query,Top Category,Sub Category
0,Can you explain the monthly maintenance fee on...,Fees and Charges,Understanding Fees
1,"I was charged a fee for an ATM withdrawal, why?",Fees and Charges,Understanding Fees
2,How do I dispute a transaction fee I believe i...,Fees and Charges,Dispute Charges
3,Are there any fees for using online banking?,Fees and Charges,Understanding Fees
4,What are the charges for a wire transfer?,Fees and Charges,Understanding Fees


In [4]:
# Format Dataset

def convert_to_gpt35_format(dataset):
    """Turn labelled support queries into chat-format fine-tuning examples.

    Each row of ``dataset`` becomes one example made of a ``user`` message
    holding the row's 'Support Query' and an ``assistant`` message holding a
    JSON object string with the row's 'Top Category' and 'Sub Category'.

    Args:
        dataset: pandas DataFrame with 'Support Query', 'Top Category' and
            'Sub Category' columns.

    Returns:
        list[dict]: one ``{"messages": [...]}`` dict per row, in row order.

    Raises:
        ValueError: if a category value is NaN (e.g. an empty CSV cell),
            which has no valid JSON representation.
    """
    # Imported locally: the notebook's top-level ``import json`` is in a later cell.
    import json

    fine_tuning_data = []
    for _, row in dataset.iterrows():
        # json.dumps escapes quotes, backslashes and control characters, so the
        # assistant content is always parseable JSON. ensure_ascii=False keeps
        # non-ASCII text as-is; allow_nan=False rejects missing labels loudly.
        json_response = json.dumps(
            {"Top Category": row['Top Category'], "Sub Category": row['Sub Category']},
            ensure_ascii=False,
            allow_nan=False,
        )
        fine_tuning_data.append({
            "messages": [
                {"role": "user", "content": row['Support Query']},
                {"role": "assistant", "content": json_response}
            ]
        })
    return fine_tuning_data

# Read the training CSV, convert every row to a chat example, then show the
# messages of the first example as a quick visual check.
dataset = pd.read_csv('bank_support_train.csv')
converted_data = convert_to_gpt35_format(dataset)
first_example = converted_data[0]
first_example['messages']

[{'role': 'user',
  'content': 'Can you explain the monthly maintenance fee on my account?'},
 {'role': 'assistant',
  'content': '{"Top Category": "Fees and Charges", "Sub Category": "Understanding Fees"}'}]

In [5]:
import json
# Sanity check: the last (assistant) message of the first example must parse as JSON.
first_assistant_message = converted_data[0]['messages'][-1]
json.loads(first_assistant_message['content'])

{'Top Category': 'Fees and Charges', 'Sub Category': 'Understanding Fees'}

In [7]:
# Create Train and Val Set

from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation. The split is stratified on
# 'Top Category' (assumed suitable as the stratification label); the label
# column comes from `dataset`, which has one row per entry of `converted_data`.
train_data, val_data = train_test_split(
    converted_data,
    stratify=dataset['Top Category'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [8]:
# Confirm each split element is still a plain dict before it is serialized with json.
type(train_data[0])

dict

In [9]:
# Create JSONL file

def write_to_jsonl(data, file_path):
    """Write ``data`` to ``file_path`` in JSON Lines form.

    Args:
        data: iterable of JSON-serializable objects; each becomes one line.
        file_path: destination path; an existing file is overwritten.
    """
    with open(file_path, 'w') as out:
        for record in data:
            # One JSON document per record, terminated by a newline.
            json.dump(record, out)
            out.write('\n')


# Output paths for the two splits; the same names are used for the upload later on.
training_file_name = "train.jsonl"
validation_file_name = "val.jsonl"

# Persist each split as its own JSONL file.
write_to_jsonl(data=train_data, file_path=training_file_name)
write_to_jsonl(data=val_data, file_path=validation_file_name)

In [10]:
from openai import OpenAI
import os
# Build the API client from the key stored in the GPT_OPEN_AI_KEY environment
# variable (raises KeyError if the variable is not set).
client = OpenAI(
    api_key=os.environ['GPT_OPEN_AI_KEY'],
)

In [11]:
# Upload Training and Validation File

# Upload both JSONL files for fine-tuning. The files are opened in context
# managers so the handles are closed as soon as each upload call returns,
# instead of being left open for the lifetime of the kernel.
with open(training_file_name, "rb") as training_handle:
    training_file = client.files.create(file=training_handle, purpose="fine-tune")

with open(validation_file_name, "rb") as validation_handle:
    validation_file = client.files.create(file=validation_handle, purpose="fine-tune")

# The returned ids are what the fine-tuning job refers to.
print("Training file id:", training_file.id)
print("Validation file id:", validation_file.id)

Training file id: file-3oWT7MzsmDW2KcucRpFCwn9v
Validation file id: file-Ai56SXBxMnDoVJNM0gsGBtXM


In [12]:
# Create Finetuning Job
suffix_name = "yt_tutorial"

# Start the job from the two uploaded file ids. No hyperparameters are passed,
# so the job is created with whatever the API chooses for them.
response = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    suffix=suffix_name,
    training_file=training_file.id,
    validation_file=validation_file.id,
)
response

FineTuningJob(id='ftjob-XtE6rmWMjwCWl5BAdsKvZC2E', created_at=1718166812, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-UPSLssLNNasLkMhOlzKKCvfo', result_files=[], seed=1351744511, status='validating_files', trained_tokens=None, training_file='file-3oWT7MzsmDW2KcucRpFCwn9v', validation_file='file-Ai56SXBxMnDoVJNM0gsGBtXM', estimated_finish=None, integrations=[], user_provided_suffix='yt_tutorial')

In [15]:
# All Finetuning Jobs
# Lists fine-tuning jobs, requesting at most 10 entries; the displayed page
# includes each job's current status.

client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-XtE6rmWMjwCWl5BAdsKvZC2E', created_at=1718166812, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-UPSLssLNNasLkMhOlzKKCvfo', result_files=[], seed=1351744511, status='running', trained_tokens=None, training_file='file-3oWT7MzsmDW2KcucRpFCwn9v', validation_file='file-Ai56SXBxMnDoVJNM0gsGBtXM', estimated_finish=1718167405, integrations=[], user_provided_suffix='yt_tutorial')], object='list', has_more=False)

In [17]:
# Retrieve Specific Job

# Re-fetch the job created earlier to see its current state. Using response.id
# instead of a job id pasted from a previous run keeps this cell valid after
# Restart & Run All; the retrieved job carries the same id, so the cell can be
# re-run to check progress again.
response = client.fine_tuning.jobs.retrieve(response.id)
response

FineTuningJob(id='ftjob-XtE6rmWMjwCWl5BAdsKvZC2E', created_at=1718166812, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-UPSLssLNNasLkMhOlzKKCvfo', result_files=[], seed=1351744511, status='running', trained_tokens=None, training_file='file-3oWT7MzsmDW2KcucRpFCwn9v', validation_file='file-Ai56SXBxMnDoVJNM0gsGBtXM', estimated_finish=1718167402, integrations=[], user_provided_suffix='yt_tutorial')

In [18]:
# Read the resulting model name off the retrieved job. In the recorded run the
# job was still 'running', so fine_tuned_model was None at this point.
fine_tuned_model_id = response.fine_tuned_model
print(
    "\nFine-tuned model id:",
    fine_tuned_model_id,
)


Fine-tuned model id: None


In [None]:
# Test Finetuned Model
