# Fine-Tuning the GPT-4.1 Model

In [66]:
!pip install openai



In [67]:
!pip install --upgrade openai



### Loading Data

In [3]:
# Libraries
import pandas as pd
import json
import os
import openai
from openai import OpenAI
import time
from pathlib import Path
from typing import Tuple, List, Dict
import tiktoken

In [None]:
# Initialize the OpenAI client used by the upload and fine-tuning cells below.
# With no arguments the client reads the API key from the OPENAI_API_KEY
# environment variable, so no credential is stored in the notebook.
client = OpenAI()

In [70]:
# Fetch the training split of 20 Newsgroups, restricted to the two sports groups
from sklearn.datasets import fetch_20newsgroups

sports_data = fetch_20newsgroups(
    subset='train',
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
    shuffle=True,
)

### Data exploration

In [71]:
# Show the raw text of the first post (.data holds the texts)
print(sports_data.data[0])

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Products Sector
Distribution: usa
Nntp-Posting-Host: 145.1.146.35
Lines: 17

In article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:

|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.
|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?

The tribe will be in town from April 16 to the 19th.
There are ALWAYS tickets available! (Though they are playing Toronto,
and many Toronto fans make the trip to Cleveland as it is easier to
get tickets in Cleveland than in Toronto.  Either way, I seriously
doubt they will sell out until the end of the season.)

-- 
Doug Bank                       Private Systems Division
dougb@ecs.comm.mot.com          Motorola Communications Sector
dougb@nwu.edu       

In [72]:
sports_data.target # .target holds the integer class label of each post

array([0, 1, 0, ..., 0, 0, 1])

In [73]:
sports_data.target_names # .target_names lists the category names; a label is an index into this list

['rec.sport.baseball', 'rec.sport.hockey']

In [74]:
# Dataset size overall and per class (label 0 = baseball, label 1 = hockey)
len_all = len(sports_data['data'])
len_baseball = sum(1 for label in sports_data.target if label == 0)
len_hockey = sum(1 for label in sports_data.target if label == 1)
print(f'Total examples: {len_all}, Baseball examples: {len_baseball}, Hockey examples: {len_hockey}')

Total examples: 1197, Baseball examples: 597, Hockey examples: 600


### Data preparation

We transform the dataset into a DataFrame with two columns: `prompt` (the email from the mailing list) and `completion` (the name of the sport). To speed up fine-tuning you can subsample the data (e.g. 300 examples); note that the cells below use all 1197 examples.

In [75]:
# create prompt and completion pairs
def create_prompt_completion_pairs(data, max_chars=2000):
    """
    Convert raw newsgroup data into prompt-completion format.

    For text classification:
    - Prompt: a fixed instruction followed by the stripped, truncated text
    - Completion: the last dotted component of the category name, preceded
      by a single space (e.g. 'rec.sport.hockey' -> ' hockey')

    Parameters:
    -----------
    data : object with `data`, `target` and `target_names` attributes
        `data` holds the texts, `target` the integer label of each text and
        `target_names` the category name for each label.
    max_chars : int or None
        Maximum number of characters of each text kept in the prompt.
        Defaults to 2000; None disables truncation.

    Returns:
    --------
    DataFrame with 'prompt' and 'completion' columns. When `data` holds no
    examples the DataFrame is empty but still has both columns.
    """
    prompts = []
    completions = []

    for text, label_idx in zip(data.data, data.target):
        # Strip surrounding whitespace, then cap the length so that a single
        # long post cannot dominate the size of a training example.
        text = text.strip()[:max_chars]

        # Instruction + text; the trailing "\n\n:" marks the end of the input
        prompt = f"Classify the following text into a category:\n\n{text}\n\n:"

        # Expected output: short category name with a leading space
        completion = f" {data.target_names[label_idx].split('.')[-1]}"

        prompts.append(prompt)
        completions.append(completion)

    df = pd.DataFrame({
        'prompt': prompts,
        'completion': completions
    })

    print(f"\nCreated {len(df)} prompt-completion pairs")
    # Only preview a sample when there is one; indexing row 0 of an empty
    # frame would raise IndexError.
    if len(df) > 0:
        print("\nSample prompt-completion pair:")
        print(f"Prompt: {df.iloc[0]['prompt'][:200]}...")
        print(f"Completion: {df.iloc[0]['completion']}")

    return df

In [76]:
# Build the prompt/completion DataFrame from the loaded newsgroup posts
sports_df = create_prompt_completion_pairs(sports_data)


Created 1197 prompt-completion pairs

Sample prompt-completion pair:
Prompt: Classify the following text into a category:

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Pr...
Completion:  baseball


In [77]:
# create training data
from sklearn.model_selection import train_test_split

def split_train_validation(df, test_size=0.2, random_state=42):
    """
    Shuffle a prompt/completion DataFrame and split it into a training part
    and a validation part (80% / 20% with the default arguments).

    Parameters:
    -----------
    df : DataFrame
        DataFrame with 'prompt' and 'completion' columns
    test_size : float
        Proportion of data to use for validation (0.1 to 0.2 is typical)
    random_state : int
        Seed passed to the splitter so the shuffle is repeatable

    Returns:
    --------
    train_df, val_df : DataFrames
    """
    train_part, val_part = train_test_split(
        df, test_size=test_size, random_state=random_state, shuffle=True
    )

    # Percentages reported below are derived from the requested test_size
    train_pct = (1 - test_size) * 100
    val_pct = test_size * 100

    print(f"\nTraining set size: {len(train_part)} samples")
    print(f"Validation set size: {len(val_part)} samples")
    print(f"Split ratio: {train_pct:.0f}% train, {val_pct:.0f}% validation")

    return train_part, val_part



In [78]:
# Split the prompt/completion pairs with the function defined above, holding out 20% for validation
train_df, val_df = split_train_validation(sports_df, test_size=0.2)


Training set size: 957 samples
Validation set size: 240 samples
Split ratio: 80% train, 20% validation


In [79]:
# save dataset as json file
# sports_df.to_json("dataset.json",orient = 'records',lines=True)

In [80]:
# prepare jsonl files
# using chunks for fast and for large datasets
def write_in_chunks(df, filename, chunk_size: int = 500):
    """
    Convert DataFrame to the JSONL chat format required by OpenAI fine-tuning.

    Each line of the output file is a JSON object with:
    - "messages": list of message objects with "role" and "content"

    Every example consists of three messages:
    - System message: fixed instructions for the model
    - User message: the row's 'prompt'
    - Assistant message: the row's 'completion'

    Rows are processed in slices of `chunk_size` and a progress line is
    printed for each slice.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with 'prompt' and 'completion' columns
    filename : str
        Output filename for JSONL file (overwritten if it exists)
    chunk_size : int
        Number of rows handled per progress step; must be positive

    Returns:
    --------
    filename : str
        Output filename for JSONL file

    Raises:
    -------
    ValueError
        If `chunk_size` is not a positive integer
    """
    # Validate before opening the file so a bad argument cannot truncate an
    # existing output file.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)
    system_message = {
        "role": "system",
        "content": "You are a text classifier that categorizes documents into specific categories.",
    }

    with open(filename, 'w', encoding='utf-8') as f:
        for start in range(0, total_rows, chunk_size):
            end = min(start + chunk_size, total_rows)
            chunk = df.iloc[start:end]

            print(f"processing rows : {start} to {end}")
            # Iterate the two columns directly instead of building a Series per row
            for prompt, completion in zip(chunk['prompt'], chunk['completion']):
                example = {
                    "messages": [
                        system_message,
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": completion},
                    ]
                }

                # Write as single line JSON
                f.write(json.dumps(example) + '\n')

    print(f"\nCreated {filename}")
    print(f"Number of examples: {len(df)}")

    return filename


In [81]:
# Write the training and validation splits as JSONL files in the working directory
train_file_path = write_in_chunks(train_df, 'train_data_chunks.jsonl')
val_file_path = write_in_chunks(val_df, 'validation_data_chunks.jsonl')

processing rows : 0 to 500
processing rows : 500 to 957

Created train_data_chunks.jsonl
Number of examples: 957
processing rows : 0 to 240

Created validation_data_chunks.jsonl
Number of examples: 240


### Data preparation tool

This tool improves the dataset and splits the data into training and validation sets.
Improvements include:
**Suffix separator** - a suffix between the prompt and the completion tells the model that the input text has ended and that it should now predict the class.
Because the same separator is used in every example, the model learns that it is meant to predict the class (baseball or hockey).
**Whitespace** - each completion starts with a space, because most words are tokenized with a space prefix.
**Here** - the tool recognizes the task as classification and prepares the training and validation sets.

In [90]:
# Upload the training JSONL to OpenAI for fine-tuning.
# The relative path matches the name the file was written under earlier in
# the notebook, and the `with` block closes the handle once the upload returns.
with open("train_data_chunks.jsonl", "rb") as train_file:
    file_response = client.files.create(
        file=train_file,
        purpose="fine-tune"
    )

file_response

FileObject(id='file-YaHs7G6Pk6J8FkdnTRoq1y', bytes=1429811, created_at=1765084226, filename='train_data_chunks.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [91]:
# Upload the validation JSONL to OpenAI for fine-tuning.
# The relative path matches the name the file was written under earlier in
# the notebook, and the `with` block closes the handle once the upload returns.
with open("validation_data_chunks.jsonl", "rb") as validation_file:
    file_response_valid = client.files.create(
        file=validation_file,
        purpose="fine-tune"
    )

file_response_valid

FileObject(id='file-6Epbyz9NVWdfaMtqmiK5d5', bytes=360049, created_at=1765084227, filename='validation_data_chunks.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [92]:
# Start the fine-tuning job on the uploaded training and validation files
fine_tune_response = client.fine_tuning.jobs.create(
    model="gpt-4.1-2025-04-14",
    training_file=file_response.id,
    validation_file=file_response_valid.id,
    hyperparameters=dict(
        n_epochs=1,
        batch_size=16,
        learning_rate_multiplier=0.8,
    ),
)

In [93]:
# Display the job object returned when the fine-tuning job was created
fine_tune_response

FineTuningJob(id='ftjob-DbWPLJtGNgQ6jyo47z3kpFVU', created_at=1765084230, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=16, learning_rate_multiplier=0.8, n_epochs=1), model='gpt-4.1-2025-04-14', object='fine_tuning.job', organization_id='org-XobQsGwDmCeEhaPAMiI7Q7cF', result_files=[], seed=1066803071, status='validating_files', trained_tokens=None, training_file='file-YaHs7G6Pk6J8FkdnTRoq1y', validation_file='file-6Epbyz9NVWdfaMtqmiK5d5', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=16, learning_rate_multiplier=0.8, n_epochs=1))), user_provided_suffix=None, usage_metrics=None, shared_with_openai=False, eval_id=None)

In [6]:
# Fetch the fine-tuning job again by its ID to see its current state
job = client.fine_tuning.jobs.retrieve("ftjob-DbWPLJtGNgQ6jyo47z3kpFVU")

In [7]:
# Display the retrieved job object
job

FineTuningJob(id='ftjob-DbWPLJtGNgQ6jyo47z3kpFVU', created_at=1765084230, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4.1-2025-04-14:sai-kesana::Ck1vuup9', finished_at=1765086552, hyperparameters=Hyperparameters(batch_size=16, learning_rate_multiplier=0.8, n_epochs=1), model='gpt-4.1-2025-04-14', object='fine_tuning.job', organization_id='org-XobQsGwDmCeEhaPAMiI7Q7cF', result_files=['file-9iiyqpPprZFRpxF8M6p7UH'], seed=1066803071, status='succeeded', trained_tokens=355531, training_file='file-YaHs7G6Pk6J8FkdnTRoq1y', validation_file='file-6Epbyz9NVWdfaMtqmiK5d5', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=16, learning_rate_multiplier=0.8, n_epochs=1))), user_provided_suffix=None, usage_metrics=None, shared_with_openai=False, eval_id=None)

In [13]:
# List the events recorded for the fine-tuning job and print the returned page object.
# NOTE(review): the result is a cursor page, so presumably only the first page of events is included — confirm if the full history is needed.
result = client.fine_tuning.jobs.list_events("ftjob-DbWPLJtGNgQ6jyo47z3kpFVU")
print(result)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-4prdxCbMgal5KlpX9GuKi8Jt', created_at=1765087323, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-XkaYlIcGaOyNHmMIY7PiMQlI', created_at=1765087318, level='info', message='Usage policy evaluations completed, model is now enabled for sampling', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-KAHJg6Ayv5mjuuXypMsHcqbn', created_at=1765087318, level='info', message='Moderation checks for snapshot ft:gpt-4.1-2025-04-14:sai-kesana::Ck1vuup9 passed.', object='fine_tuning.job.event', data={'blocked': False, 'results': [{'flagged': False, 'category': 'harassment/threatening', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'sexual', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'sexual/minors', 'enforcement': 'blocking'}, {'flagged': False, 'category': 'propagan

In [None]:
# pd.read_csv() requires a path and raised TypeError when called without one.
# Load the events CSV that the export cell below writes, if it already exists
# from an earlier run; otherwise leave `save` as None.
save = pd.read_csv("fine_tuning_events.csv") if os.path.exists("fine_tuning_events.csv") else None

In [15]:
import pandas as pd

# Turn each job event (a Pydantic model) into a plain dict, one dict per row
events_data = [event.model_dump() for event in result.data]
events_df = pd.DataFrame(events_data)

# Write the events table to a CSV file in the working directory
csv_filename = "fine_tuning_events.csv"
events_df.to_csv(csv_filename, index=False)
print(f"Fine-tuning job events saved to {csv_filename}")

# Preview the first rows of the table
display(events_df.head())

Fine-tuning job events saved to fine_tuning_events.csv


Unnamed: 0,id,created_at,level,message,object,data,type
0,ftevent-4prdxCbMgal5KlpX9GuKi8Jt,1765087323,info,The job has successfully completed,fine_tuning.job.event,{},message
1,ftevent-XkaYlIcGaOyNHmMIY7PiMQlI,1765087318,info,"Usage policy evaluations completed, model is n...",fine_tuning.job.event,{},message
2,ftevent-KAHJg6Ayv5mjuuXypMsHcqbn,1765087318,info,Moderation checks for snapshot ft:gpt-4.1-2025...,fine_tuning.job.event,"{'blocked': False, 'results': [{'flagged': Fal...",moderation_checks
3,ftevent-mIcOXtVIIMLXobsgmBxdTt7D,1765086555,info,Evaluating model against our usage policies,fine_tuning.job.event,{},message
4,ftevent-7SYIgY0HuyfrMdmCAc2qSRcj,1765086555,info,New fine-tuned model created,fine_tuning.job.event,{},message
