## GMR Fine-tuning

<pre>
1. Make a dataframe (from excel,csv) having the query
2. Convert the dataframe to gpt35_format, which is a JSON.
    eg: {messages:[{role:"" , content: ""}, {role:"", content: ""}]
3. Do STRATIFIED train_test_split on the gpt35 formatted data
4. Convert train and Val set -> JSONL format
5. Upload train and validation file using -> client.files.create()
6. Create Fine-Tuning Job using -> client.fine_tuning.jobs.create() and
execute it and monitor the training status for completion.
use -> client.fine_tuning.jobs.list(limit = 2).
Once completed , the model is ready to use.
Can get model name from the output of above function with parameter name "fine_tuned_model

In [None]:
!pip install -U openai

In [2]:
your_open_ai_key = "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" # gpt 3.5 turbo 1106 works wtih this key

In [4]:
import json
import pandas as pd
df = pd.read_csv("/content/GMR-Fine-tuning-DATA-utf.csv")
df.head()

Unnamed: 0,Customer Query,Intent,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,How much baggage am I allowed to carry?,Baggage,,,,,,,
1,What if I exceed my free baggage weight allowa...,Baggage,,,,,,,
2,What items are not permitted on flight?,Baggage,,,,,,,
3,What items are considered as special baggage a...,Baggage,,,,,,,
4,How much luggage is allowed on flights?,Baggage,,,,,,,


In [5]:
# Delete all unnamed columns
df = df.loc[:, ~df.columns.str.startswith('Unnamed: ')]
df.head()

Unnamed: 0,Customer Query,Intent
0,How much baggage am I allowed to carry?,Baggage
1,What if I exceed my free baggage weight allowa...,Baggage
2,What items are not permitted on flight?,Baggage
3,What items are considered as special baggage a...,Baggage
4,How much luggage is allowed on flights?,Baggage


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Customer Query  121 non-null    object
 1   Intent          121 non-null    object
dtypes: object(2)
memory usage: 2.0+ KB


In [7]:
df['Intent'].value_counts()

Baggage    45
Parking    33
Bus        23
Flight     20
Name: Intent, dtype: int64

In [8]:
dataset = df

In [9]:
def convert_to_gpt35_format(dataset):
    fine_tuning_data = []
    for _, row in dataset.iterrows():
        json_response = '{"Intent": "' + row['Intent'] + '"}'
        fine_tuning_data.append({
            "messages": [
                {"role": "user", "content": row['Customer Query']},
                {"role": "assistant", "content": row['Intent'] +' '}     # can use END to indicate end of the content
            ]
        })
    return fine_tuning_data
converted_data = convert_to_gpt35_format(dataset)

In [10]:
converted_data[0]

{'messages': [{'role': 'user',
   'content': 'How much baggage am I allowed to carry?'},
  {'role': 'assistant', 'content': 'Baggage '}]}

In [11]:
from sklearn.model_selection import train_test_split

# Stratified splitting. Assuming 'Top Category' can be used for stratification
train_data, val_data = train_test_split(
    converted_data,
    test_size=0.2,
    stratify=dataset['Intent'],
    random_state=42  # for reproducibility
)

In [12]:
# Making JSON to JSON Lines for fine tuning
def write_to_jsonl(data, file_path):
    with open(file_path, 'w') as file:
        for entry in data:
            json.dump(entry, file)
            file.write('\n')

training_file_name = "train.jsonl"
validation_file_name = "val.jsonl"

write_to_jsonl(train_data, training_file_name)
write_to_jsonl(val_data, validation_file_name)


<pre><b>Estimate costs<b>
Please refer to the pricing page for details on cost per 1k input
 and output tokens (we do to charge for tokens that are part of the validation data). To estimate the costs for a specific
 fine-tuning job, use the following formula:

base cost per 1k tokens * number of tokens in the input file * number of epochs trained

For a training file with 100,000 tokens trained over 3 epochs,
the expected cost would be ~$2.40 USD.

<b>
0.0080$ X 1260 X 3
For training file = 1260 tokens , price fine-tuning = 0.024 Dollars

In [13]:
from openai import OpenAI
client = OpenAI(api_key=your_open_ai_key)

# Upload Training and Validation Files
training_file = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
validation_file = client.files.create(
    file=open(validation_file_name, "rb"), purpose="fine-tune"
)

# Create Fine-Tuning Job
suffix_name = "gmr_intent"
response = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)

In [26]:
response

FineTuningJob(id='ftjob-gsAGjhwUHNCf7aeg1NJC5znN', created_at=1705054755, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-DIrIHSX189AR148czuV557NT', result_files=[], status='validating_files', trained_tokens=None, training_file='file-X1al5tOGArnZs9oECUiMXkmw', validation_file='file-03YhdVW10fBGnnFStMkRhy2p')

In [25]:
client.fine_tuning.jobs.list(limit = 2)  #----------- Listing the fine-tuning jobs ------------------

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-gsAGjhwUHNCf7aeg1NJC5znN', created_at=1705054755, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:arthink-ai:gmr-intent:8g961QBn', finished_at=1705055433, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-DIrIHSX189AR148czuV557NT', result_files=['file-H6YLYYCylaLsw7Vpy9NstrtH'], status='succeeded', trained_tokens=6714, training_file='file-X1al5tOGArnZs9oECUiMXkmw', validation_file='file-03YhdVW10fBGnnFStMkRhy2p'), FineTuningJob(id='ftjob-p7Rkmof6nuBh2GbmvbzN1UYF', created_at=1705054187, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:arthink-ai:gmr-intent:8g8whLAD', finished_at=1705054854, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-DIrIHSX189AR148czuV557NT', result_files=['file-N4Sev3OWsYwqLM

In [30]:
fine_tuned_model_id = response.fine_tuned_model
print(fine_tuned_model_id)

None


In [None]:
fine_tuned_model_id = 'ft:gpt-3.5-turbo-0613:arthink-ai:gmr-intent:8g7vS2C5' # first wrong output was given by model due to not including any "assistant" message

In [27]:
fine_tuned_model_id = 'ft:gpt-3.5-turbo-0613:arthink-ai:gmr-intent:8g961QBn'

In [28]:
#-------Testing the Fine-tuned Model-------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def format_test(row):
    formatted_message = [{"role": "user", "content": row['Customer Query']}]
    return formatted_message

def predict(test_messages, fine_tuned_model_id):
    response = client.chat.completions.create(
        model=fine_tuned_model_id, messages=test_messages, temperature=0, max_tokens=50
    )
    return response.choices[0].message.content

def store_predictions(test_df, fine_tuned_model_id):
    test_df['Prediction'] = None
    for index, row in test_df.iterrows():
        test_message = format_test(row)
        prediction_result = predict(test_message, fine_tuned_model_id)
        test_df.at[index, 'Prediction'] = prediction_result

    test_df.to_csv("predictions.csv")


In [29]:
test_df = pd.read_csv("/content/GMR-Fine-tuning-Test.csv")
store_predictions(test_df,fine_tuned_model_id)