# Loading the dataset

In [1]:
import pandas as pd

In [2]:
recipe_df=pd.read_csv("cookbook_recipes_nlg_10k.csv")
recipe_df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,www.cookbooks.com,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,www.cookbooks.com,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,www.cookbooks.com,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,www.cookbooks.com,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,www.cookbooks.com,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [3]:
recipe_df.shape

(10000, 6)

In [4]:
recipe_df.head().T

Unnamed: 0,0,1,2,3,4
title,No-Bake Nut Cookies,Jewell Ball'S Chicken,Creamy Corn,Chicken Funny,Reeses Cups(Candy)
ingredients,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""1 small jar chipped beef, cut up"", ""4 boned ...","[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""1 c. peanut butter"", ""3/4 c. graham cracker ..."
directions,"[""In a heavy 2-quart saucepan, mix brown sugar...","[""Place chipped beef on bottom of baking dish....","[""In a slow cooker, combine all ingredients. C...","[""Boil and debone chicken."", ""Put bite size pi...","[""Combine first four ingredients and press in ..."
link,www.cookbooks.com/Recipe-Details.aspx?id=44874,www.cookbooks.com/Recipe-Details.aspx?id=699419,www.cookbooks.com/Recipe-Details.aspx?id=10570,www.cookbooks.com/Recipe-Details.aspx?id=897570,www.cookbooks.com/Recipe-Details.aspx?id=659239
source,www.cookbooks.com,www.cookbooks.com,www.cookbooks.com,www.cookbooks.com,www.cookbooks.com
NER,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""beef"", ""chicken breasts"", ""cream of mushroom...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."


# Creating messages

In [5]:
training_data = []
system_message = "You are a helpful recipe assistant.\
You are to extract the generix ingredients from each of the recipes provided."

def create_user_message(row):
    return f"""Title: {row['title']}\n\nIngredients: {row['ingredients']}\n\nGeneric ingredients:"""

def prepare_example_conversation(row):
    messages = []
    messages.append({"role":"system","content": system_message})
    user_message = create_user_message(row)
    messages.append({"role":"user", "content": user_message})
    messages.append({"role":"assistant", "content": row["NER"]})
    return {"messages": messages}

In [6]:
prepare_example_conversation(recipe_df.iloc[0])

{'messages': [{'role': 'system',
   'content': 'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'},
  {'role': 'user',
   'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients:'},
  {'role': 'assistant',
   'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]}

In [7]:
from pprint import pprint

pprint(prepare_example_conversation(recipe_df.iloc[0]))

{'messages': [{'content': 'You are a helpful recipe assistant.You are to '
                          'extract the generix ingredients from each of the '
                          'recipes provided.',
               'role': 'system'},
              {'content': 'Title: No-Bake Nut Cookies\n'
                          '\n'
                          'Ingredients: ["1 c. firmly packed brown sugar", '
                          '"1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 '
                          'c. broken nuts (pecans)", "2 Tbsp. butter or '
                          'margarine", "3 1/2 c. bite size shredded rice '
                          'biscuits"]\n'
                          '\n'
                          'Generic ingredients:',
               'role': 'user'},
              {'content': '["brown sugar", "milk", "vanilla", "nuts", '
                          '"butter", "bite size shredded rice biscuits"]',
               'role': 'assistant'}]}


In [8]:
training_df = recipe_df.loc[0:16]

In [9]:
training_data = training_df.apply(
    prepare_example_conversation,axis=1).tolist()

In [10]:
for example in training_data[:3]:
    print(example)

{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'}, {'role': 'user', 'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients:'}, {'role': 'assistant', 'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]}
{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'}, {'role': 'user', 'content': 'Title: Jewell Ball\'S Chicken\n\nIngredients: ["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]\n\nGeneric ingredients:'}, {'role': 'assistant', 'content': '["beef", 

In [11]:
validation_df = recipe_df.loc[17:20]
validation_data = validation_df.apply(
    prepare_example_conversation, axis=1
).tolist()

# Data preprocessing

In [12]:
import json

In [13]:
def write_json(data_list: list, filename: str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)

In [14]:
training_file_name = "tmp_recipe_finetune_training.json"

write_json(training_data, training_file_name)

In [15]:
validation_file_name = "tmp_recipe_finetune_validation.json"

write_json(validation_data, validation_file_name)

# Uploading files to OpenAI

In [17]:
from dotenv import load_dotenv

load_dotenv(override=True)

True

In [18]:
from openai import OpenAI 

client = OpenAI()

In [20]:
training_response = client.files.create(
    file = open(training_file_name, "rb"), 
    purpose = "fine-tune"
)

In [21]:
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-FXSWIsrh1jquruuBRmX8MW9i', bytes=9888, created_at=1705328259, filename='tmp_recipe_finetune_training.json', object='file', purpose='fine-tune', status='processed', status_details=None)], object='list', has_more=False)

In [23]:
training_file_id = training_response.id 
training_file_id 

'file-FXSWIsrh1jquruuBRmX8MW9i'

In [24]:
validation_response = client.files.create(
    file = open(validation_file_name, "rb"), 
    purpose = "fine-tune"
)

validation_file_id = validation_response.id 
validation_file_id

'file-sRzbZPfvf4QfgiJhy26XXItV'

In [25]:
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-sRzbZPfvf4QfgiJhy26XXItV', bytes=2444, created_at=1705328555, filename='tmp_recipe_finetune_validation.json', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-FXSWIsrh1jquruuBRmX8MW9i', bytes=9888, created_at=1705328259, filename='tmp_recipe_finetune_training.json', object='file', purpose='fine-tune', status='processed', status_details=None)], object='list', has_more=False)

# Fine-tuning

In [26]:
response = client.fine_tuning.jobs.create(
    training_file = training_file_id,
    validation_file = validation_file_id,
    model = "gpt-3.5-turbo",
    suffix = "recipe-ner",
    hyperparameters = {"n_epochs":2}
)

In [27]:
job_id = response.id 
print("Job ID: ", job_id)

Job ID:  ftjob-3DbDNybPdxEnHDKYsNBKMTkl


In [None]:
# client.fine_tuning.jobs.cancel("ftjob-abc123")

In [29]:
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID: ", response.id)
print("Status: ", response.status)
print("Trained Tokens: ", response.trained_tokens)

Job ID:  ftjob-3DbDNybPdxEnHDKYsNBKMTkl
Status:  running
Trained Tokens:  None


In [31]:
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data 
events.reverse()

for event in events:
    print(event.message)

Step 17/34: training loss=0.40, validation loss=0.03
Step 18/34: training loss=0.12, validation loss=0.32
Step 19/34: training loss=0.31, validation loss=0.17
Step 20/34: training loss=0.12, validation loss=0.00
Step 21/34: training loss=0.11, validation loss=0.02
Step 22/34: training loss=0.15, validation loss=0.30
Step 23/34: training loss=0.19, validation loss=0.17
Step 24/34: training loss=0.41, validation loss=0.00
Step 25/34: training loss=0.01, validation loss=0.02
Step 26/34: training loss=0.04, validation loss=0.27
Step 27/34: training loss=0.12, validation loss=0.19
Step 28/34: training loss=0.35, validation loss=0.00
Step 29/34: training loss=0.00, validation loss=0.02
Step 30/34: training loss=0.35, validation loss=0.25
Step 31/34: training loss=0.23, validation loss=0.20
Step 32/34: training loss=0.24, validation loss=0.00
Step 33/34: training loss=0.12, validation loss=0.02
Step 34/34: training loss=0.00, validation loss=0.24
New fine-tuned model created: ft:gpt-3.5-turbo

In [32]:
response = client.fine_tuning.jobs.retrieve(job_id)

fine_tuned_model_id = response.fine_tuned_model

print("Fine-tuned model ID: ", fine_tuned_model_id)

Fine-tuned model ID:  ft:gpt-3.5-turbo-0613:personal:recipe-ner:8hIVChxc


In [33]:
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID: ", response.id)
print("Status: ", response.status)
print("Trained Tokens: ", response.trained_tokens)

Job ID:  ftjob-3DbDNybPdxEnHDKYsNBKMTkl
Status:  succeeded
Trained Tokens:  4406


# Inference

In [34]:
fine_tuned_model_id

'ft:gpt-3.5-turbo-0613:personal:recipe-ner:8hIVChxc'

In [35]:
recipe_df.iloc[0]

title                                        No-Bake Nut Cookies
ingredients    ["1 c. firmly packed brown sugar", "1/2 c. eva...
directions     ["In a heavy 2-quart saucepan, mix brown sugar...
link              www.cookbooks.com/Recipe-Details.aspx?id=44874
source                                         www.cookbooks.com
NER            ["brown sugar", "milk", "vanilla", "nuts", "bu...
Name: 0, dtype: object

In [36]:
test_row = recipe_df.iloc[0]

In [37]:
system_message

'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'

In [38]:
test_messages = []

test_messages.append({"role":"system","content": system_message})
test_messages

[{'role': 'system',
  'content': 'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'}]

In [39]:
user_message = create_user_message(test_row)
print(user_message)

Title: No-Bake Nut Cookies

Ingredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]

Generic ingredients:


In [40]:
test_messages.append({"role":"user", "content":user_message})
test_messages

[{'role': 'system',
  'content': 'You are a helpful recipe assistant.You are to extract the generix ingredients from each of the recipes provided.'},
 {'role': 'user',
  'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients:'}]

In [41]:
pprint(test_messages)

[{'content': 'You are a helpful recipe assistant.You are to extract the '
             'generix ingredients from each of the recipes provided.',
  'role': 'system'},
 {'content': 'Title: No-Bake Nut Cookies\n'
             '\n'
             'Ingredients: ["1 c. firmly packed brown sugar", "1/2 c. '
             'evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts '
             '(pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size '
             'shredded rice biscuits"]\n'
             '\n'
             'Generic ingredients:',
  'role': 'user'}]


In [42]:
response = client.chat.completions.create(
    model = fine_tuned_model_id,
    messages = test_messages,
    temperature = 0,
    max_tokens = 200,
)

In [43]:
print(response.choices[0].message.content)

["brown sugar", "evaporated milk", "vanilla", "nuts", "butter", "rice biscuits"]


Let's connect [YouTube](http://youtube.com/tirendazacademy) | [Medium](http://tirendazacademy.medium.com) | [X](http://x.com/tirendazacademy) | [Linkedin](https://www.linkedin.com/in/tirendaz-academy) 😎