In [2]:
# We import the API key and necessary variables 
import sys
sys.path.append("..")
import config, os, openai, tiktoken, json
import numpy as np

In [3]:
os.environ['OPENAI_API_KEY'] = config.OPENAI_API_KEY
os.environ['ORGANIZATION_ID'] = config.ORG_ID

In [14]:
with open('dataset_jobs_driver.txt', 'r') as file:
    content = file.read()

# Divide the dataset between questions and answers
conversations = content.strip().split('\nQ: ')

# Add "user:" y "assistant:" in each questions
formatted_content = ''
for conversation in conversations:
    parts = conversation.split('\nA: ')
    if len(parts) == 2:
        formatted_content += f"user: {parts[0]}\nassistant: {parts[1]}\n-\n"
    else:
        formatted_content += conversation + '\n'

# Save the new one
with open('dataset_jobs_driver_refactor.txt', 'w') as file:
    file.write(formatted_content)

In [15]:
# We check the dataset
with open('dataset_jobs_driver_refactor.txt', encoding='latin-1') as f:
    text = [line for line in f]

In [16]:
# We see how difference each example in the dataset
text[:10]

["user: Q: Will you continue running my campaign even if it doesn't reach the minimum target number of leads? \n",
 'assistant: Yes, we will continue running the campaign until we reach the minimum target number of leads, or we can issue a pro-rated credit.\n',
 '-\n',
 "user: Will you continue running my campaign even if it doesn't reach the minimum target number of leads? \n",
 "assistant: The LATS is a convenient tool for clients who don't have an ATS or prefer a more straightforward solution. It functions as a centralized hub where clients can view and manage all of their leads in one place, making it easy for them to track their job search efforts and follow up with interested candidates. With our LATS, clients can experience the benefits of an ATS without the added complexity, making the job search process more streamlined and efficient.\n",
 '-\n',
 "user: What is TheJobsDriver's client retention rate? \n",
 "assistant: The exact same thing. So, how do you stand out and get the 

We need to ensure that we follow the next format, according to Open IA documentation
```
{
  "messages": [
    { "role": "system", "content": "You are an assistant that occasionally misspells words" },
    { "role": "user", "content": "Tell me a story." },
    { "role": "assistant", "content": "One day a student went to schoool." }
  ]
}
```

In [7]:
# Function to refactor each example as the properly format
def refactor_example(example_list, system_message=None):
    messages = []

    # Include the general system message
    if system_message:
        messages.append({
            "role": "system",
            "content": system_message
        })

    # Iterate for each example
    for message in example_list:
        example = message.split(': ', maxsplit=1)

        #Control if each example dont follow the format
        if len(example) < 2:
            continue

        role = example[0].strip()
        content = example[1].strip()

        message_format = {
            "role": role,
            "content": content
        }

        messages.append(message_format)

    dict_final = {
        "messages": messages
    }

    return dict_final

In [17]:
# We apply the function in each example
system_message = "As a representative of TheJobsDriver, your role is to prioritize friendly, \
elevated conversation while anticipating that questioners may challenge your points. \
Approach these conversations like a game of chess, not checkers, in your responses emphasize generating high-quality leads while \
adhering to our proven processes. Highlight \
TheJobsDrivers distinctive approach and competitive edge. Emphasize delivering high-quality leads to clients \
through a single KPI, the number of interested job seekers."

dataset = []

example = []
for line in text:
  if line == '-\n':
    format_example = refactor_example(example_list=example,
                                            system_message=system_message)

    dataset.append(format_example)
    example = []
    continue

  example.append(line)

In [18]:
dataset[-1]

{'messages': [{'role': 'system',
   'content': 'As a representative of TheJobsDriver, your role is to prioritize friendly, elevated conversation while anticipating that questioners may challenge your points. Approach these conversations like a game of chess, not checkers, in your responses. Emphasize generating high-quality leads while adhering to our proven processes. Highlight TheJobsDrivers distinctive approach and competitive edge. Emphasize delivering high-quality leads to clients through a single KPI, the number of interested job seekers.'},
  {'role': 'user', 'content': 'What can TheJobsDriver for me?'},
  {'role': 'assistant',
   'content': 'is able to offer a comprehensive and effective solution for companies facing challenges in filling hard-to-find positions or experiencing high turnover with the help of a wide range of productive media sources for recruiting, including social networks, search engines, digital billboards, news sources and even programmatic audio to generate 

We check for errors and estimate the price using the guide [provided by OpenAI](https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset)

In [19]:
from collections import defaultdict
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
            print(message)

        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1
            print(message)

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
            print(message)

        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1
            print(message)

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1
        print(message)

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


We estimate the number of tokens and a aproximate of the cost of this fine tunning, based on [OpenIA GitHub](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb)

In [20]:
encoding = tiktoken.get_encoding("cl100k_base")
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"average / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [21]:
# Warnings and tokens counts by each type of user
n_messages, convo_lens, assistant_message_lens = [], [], []

for ex in dataset:
    messages = ex["messages"]
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print_distribution(n_messages, "Number of messages for example")
print_distribution(convo_lens, "Number of token for example")
print_distribution(assistant_message_lens, "Number of tokens by assistant for example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples that exceed the token limit of 4096, they will be truncated during fine-tuning.")


#### Distribution of Number of messages for example:
min / max: 3, 3
average / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of Number of token for example:
min / max: 127, 422
average / median: 185.95108695652175, 154.5
p5 / p95: 136.0, 281.40000000000003

#### Distribution of Number of tokens by assistant for example:
min / max: 11, 289
average / median: 66.1195652173913, 34.0
p5 / p95: 18.0, 157.40000000000003

0 examples that exceed the token limit of 4096, they will be truncated during fine-tuning.


In [22]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 4
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"The dataset have ~{n_billing_tokens_in_dataset} tokens to use in the training phase")
print(f"By default, we will use {n_epochs} epochs in this training")
print(f"So, will be ~{n_epochs * n_billing_tokens_in_dataset} tokens in total")

The dataset have ~34215 tokens to use in the training phase
By default, we will use 4 epochs in this training
So, will be ~136860 tokens in total


Taking into account this official pricing in [Open IA Fine Tunnig - Fees](https://openai.com/pricing)

| Model         | Training            | Input usage         | Output usage        |   
|---------------|---------------------|---------------------|---------------------|
| GPT-3.5 Turbo | $0.0080 / 1K tokens | $0.0120 / 1K tokens | $0.0160 / 1K tokens |

In [23]:
training_cost = '${:,.2f}'.format(((n_epochs * n_billing_tokens_in_dataset)/1000)*0.008)
print(f'The estimate cost of the training is ~{training_cost}')

The estimate cost of the training is ~$1.09


In [24]:
# We save the dataset in the properly format (json)
with open('jobs_driver_train_full.jsonl', 'w') as file:
    for ejemplo in dataset:
        json_line = json.dumps(ejemplo)
        file.write(json_line + '\n')

Fine Tunning Job in Open IA

In [25]:
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("ORGANIZATION_ID")

In [26]:
# We save the dataset and we save the id
train_full_response_file = openai.File.create(
    file=open('jobs_driver_train_full.jsonl','rb'),
    purpose='fine-tune'
)


print(f'id: {train_full_response_file.id}')

id: file-QIqUQ2qXks5UOkKTA3sHS6Ca


In [30]:
with open('jobs_driver_train_full.jsonl', 'r') as file:
    a = file.read()

In [29]:
response = openai.FineTuningJob.create(training_file=train_full_response_file.id,
                                       model="gpt-3.5-turbo",
                                       suffix='jobs_driver_v2',
                                       hyperparameters={'n_epochs':4})

In [37]:
id_job = response.id
id_job

'ftjob-ixU6IoY60DwyPvzhLUz6CVbA'

In [53]:
response = openai.FineTuningJob.list_events(id=response.id)

events = response["data"]
events.reverse()

for event in events:
    print(event["message"])


Step 441/620: training loss=0.75
Step 451/620: training loss=0.83
Step 461/620: training loss=1.05
Step 471/620: training loss=0.89
Step 481/620: training loss=0.31
Step 491/620: training loss=0.42
Step 501/620: training loss=2.94
Step 511/620: training loss=0.45
Step 521/620: training loss=1.58
Step 531/620: training loss=0.06
Step 541/620: training loss=1.52
Step 551/620: training loss=1.41
Step 561/620: training loss=1.84
Step 571/620: training loss=0.49
Step 581/620: training loss=0.44
Step 591/620: training loss=0.85
Step 601/620: training loss=0.85
Step 611/620: training loss=0.66
New fine-tuned model created: ft:gpt-3.5-turbo-0613:thejobsdriver:english-teacher:8EgXprcU
The job has successfully completed
