# Fine-Tuning an OpenAI Model

### Importing Libraries and configuration

In [22]:
import os 
from openai import OpenAI 
from dotenv import load_dotenv
import json
from time import sleep
# Read the environment variables stored in the project's env file
load_dotenv(dotenv_path="../apikey.env")
# Build the OpenAI API client used by the rest of the notebook
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
print("All configuration loaded")

All configuration loaded


### Listing the files stored in the OpenAI API

In [3]:
# Fetch the file listing from the Files API and show the first five entries
files = client.files.list()
for entry in files.data[:5]:
    print(
        "File ID: {}, File Name: {}, File Purpose: {}".format(
            entry.id, entry.filename, entry.purpose
        )
    )

File ID: file-JRDufToYrzP5ctcpeMtshh, File Name: data_eval_openai_v2.jsonl, File Purpose: fine-tune
File ID: file-2QLGfsoobRG7Kue3zeTQDU, File Name: data_training_openai_v2.jsonl, File Purpose: fine-tune
File ID: file-KZDxqUrJsrZhRLetWPGB5F, File Name: step_metrics.csv, File Purpose: fine-tune-results
File ID: file-MJDTLXTPf6QAmxQ5WZ7dXi, File Name: data_eval_openai.jsonl, File Purpose: fine-tune
File ID: file-823bMAAvCHFVPdWjWX2ta1, File Name: data_training_openai.jsonl, File Purpose: fine-tune


### Deleting Files 

In [39]:
# Delete the previously uploaded training/validation files whose IDs are
# recorded in file_id.json, then reset the recorded IDs to None.
json_file_path = "file_id.json"
if os.path.exists(json_file_path):
    with open(json_file_path, 'r') as id_file:
        file_ids = json.load(id_file)

    # The JSON may hold `null`, `{}` or a non-mapping value; treat every one of
    # those as "nothing recorded" instead of crashing on `.get` below.
    if not isinstance(file_ids, dict) or not file_ids:
        print("No file IDs found in the JSON file.")
        file_ids = {}

    training_file_id = file_ids.get("training_file_id")
    validation_file_id = file_ids.get("validation_file_id")

    if training_file_id:
        client.files.delete(training_file_id)
        print(f"Deleted training file with ID: {training_file_id}")

    if validation_file_id:
        client.files.delete(validation_file_id)
        print(f"Deleted validation file with ID: {validation_file_id}")

    # Reset the record so later cells do not reference files that no longer exist
    with open(json_file_path, 'w') as id_file:
        json.dump({"training_file_id": None, "validation_file_id": None}, id_file)
else:
    print(f"JSON file {json_file_path} does not exist. No files to delete.")

Deleted training file with ID: file-XtLAPa32S1wiFteeJnGpHt
Deleted validation file with ID: file-9LHN5VeXZisbio9VDbDTb5


### Uploading Files

In [None]:
# Upload the training and validation datasets with purpose "fine-tune"
training_file_name = "data_training_openai.jsonl"
validation_file_name = "data_eval_openai.jsonl"

# Open each dataset in a context manager so the local file handle is closed
# as soon as the upload call returns (the bare open() calls leaked both handles).
# NOTE: despite their names, these variables hold the objects returned by
# client.files.create; the ID is read from their `.id` attribute.
with open(training_file_name, "rb") as training_data:
    training_file_id = client.files.create(
        file=training_data,
        purpose="fine-tune"
    )

with open(validation_file_name, "rb") as validation_data:
    validation_file_id = client.files.create(
        file=validation_data,
        purpose="fine-tune"
    )

print("Files uploaded successfully, Processing...")

def wait_for_file_processing(file_id, file_type, poll_interval=5, timeout=None):
    """Poll the Files API until the given file is processed or has failed.

    Args:
        file_id: ID of the uploaded file to poll.
        file_type: Human-readable label used in the progress messages
            (e.g. "Training").
        poll_interval: Seconds to sleep between polls. Defaults to 5.
        timeout: Maximum number of seconds to wait, or None (the default)
            to wait indefinitely.

    Returns:
        The retrieved file object once its status is "processed"; None if the
        status becomes "error" or the timeout elapses first.
    """
    # Local import: only `sleep` is imported from `time` at the top of the notebook.
    from time import monotonic

    started_at = monotonic()
    while True:
        remote_file = client.files.retrieve(file_id)

        if remote_file.status == "processed":
            print(f"\r{' ' * 80}", end='')  # Clear the progress line
            print(f"\r✅ {file_type} file processed successfully! (ID: {remote_file.id})")
            return remote_file

        if remote_file.status == "error":
            print(f"\r{' ' * 80}", end='')  # Clear the progress line
            print(f"\r❌ {file_type} file processing failed!")
            return None

        # Give up instead of looping forever when the caller set a deadline
        if timeout is not None and monotonic() - started_at >= timeout:
            print(f"\r{' ' * 80}", end='')  # Clear the progress line
            print(f"\r❌ {file_type} file processing timed out after {timeout} seconds "
                  f"(last status: {remote_file.status})")
            return None

        print(f"\r⏳ {file_type} file status: {remote_file.status}, waiting...", end='', flush=True)
        sleep(poll_interval)

# Block until the API reports a final state for each uploaded file
training_file = wait_for_file_processing(training_file_id.id, "Training")
validation_file = wait_for_file_processing(validation_file_id.id, "Validation")

if not (training_file and validation_file):
    print("File processing failed!")
else:
    print("Both files processed successfully!")

    # Record both IDs on disk so later cells can look the files up again
    file_ids = dict(
        training_file_id=training_file_id.id,
        validation_file_id=validation_file_id.id,
    )
    with open("file_id.json", "w") as id_file:
        json.dump(file_ids, id_file)
        print("File IDs saved to file_id.json")

Files uploaded successfully, Processing...
✅ Training file processed successfully! (ID: file-2QLGfsoobRG7Kue3zeTQDU)       
✅ Validation file processed successfully! (ID: file-JRDufToYrzP5ctcpeMtshh)     
Both files processed successfully!
File IDs saved to file_id.json


### Accessing Previously Uploaded Files

In [4]:
# Look up the training/validation files recorded in file_id.json and show
# their current status as reported by the Files API.
json_file_path = "file_id.json"
if os.path.exists(json_file_path):
    with open(json_file_path, 'r') as id_file:
        file_ids = json.load(id_file)

    # The JSON may hold `null`, `{}` or a non-mapping value; treat every one of
    # those as "nothing recorded" instead of crashing on `.get` below.
    if not isinstance(file_ids, dict) or not file_ids:
        print("No file IDs found in the JSON file.")
        file_ids = {}

    training_file_id = file_ids.get("training_file_id")
    validation_file_id = file_ids.get("validation_file_id")
    if not training_file_id or not validation_file_id:
        print("Training or validation file ID is missing in the JSON file.")
    else:
        try:
            training_file = client.files.retrieve(training_file_id)
            validation_file = client.files.retrieve(validation_file_id)

            print(f"Training File ID: {training_file.id}")
            print(f"Validation File ID: {validation_file.id}")
            print(f"Training File Status: {training_file.status}")
            print(f"Validation File Status: {validation_file.status}")
        except Exception as e:
            print("An error occurred while retrieving file status:")
            # Not every exception carries an HTTP `.response` (for example a
            # connection failure), so look it up defensively rather than
            # raising AttributeError and hiding the real error.
            error_response = getattr(e, "response", None)
            if error_response is not None:
                print("Response Error Code:", error_response.status_code)
                print("Response Error Message:", error_response.text)
            else:
                print(f"{type(e).__name__}: {e}")
else:
    print(f"JSON file {json_file_path} does not exist. No files to retrieve.")

Training File ID: file-2QLGfsoobRG7Kue3zeTQDU
Validation File ID: file-JRDufToYrzP5ctcpeMtshh
Training File Status: processed
Validation File Status: processed


### Creating Fine Tuning job

In [5]:
from time import sleep, time

# Launch the fine-tuning job on the training and validation files
response = client.fine_tuning.jobs.create(
    model="gpt-4.1-mini-2025-04-14",
    training_file=training_file.id,
    validation_file=validation_file.id,
    hyperparameters=dict(
        n_epochs=4,
        batch_size=1,
        learning_rate_multiplier=0.1,
    ),
)

job_id = response.id
print("Fine-tuning job created with ID: {}".format(job_id))

# Reference point for the elapsed-time display
start_time = time()

def format_elapsed_time(elapsed_seconds):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractions of a second are dropped; the hours field is not capped, so it
    simply grows wider for durations of 100 hours or more.
    """
    whole_hours, remainder = divmod(elapsed_seconds, 3600)
    whole_minutes = remainder // 60
    whole_seconds = elapsed_seconds % 60
    return "{:02d}:{:02d}:{:02d}".format(
        int(whole_hours), int(whole_minutes), int(whole_seconds)
    )

# Poll the job every 10 seconds until it reaches a terminal status
while True:
    response = client.fine_tuning.jobs.retrieve(job_id)
    status = response.status

    elapsed_time = time() - start_time
    time_str = format_elapsed_time(elapsed_time)

    # Still running: refresh the single-line progress display and poll again
    if status not in ("succeeded", "failed", "cancelled"):
        print(f"\r⏳ Fine-tuning in progress... Status: {status} | Elapsed: {time_str}", end='', flush=True)
        sleep(10)
        continue

    # Terminal status: wipe the progress line, then print the outcome
    print("\r" + " " * 100, end='')
    if status == "succeeded":
        print("\r✅ Fine-tuning completed successfully!")
        print(f"Fine-tuned model: {response.fine_tuned_model}")
        print(f"Total time elapsed: {time_str}")
    elif status == "failed":
        print("\r❌ Fine-tuning failed.")
        print(f"Time elapsed: {time_str}")
    else:
        print("\r❌ Fine-tuning was cancelled.")
        print(f"Time elapsed: {time_str}")
    break

Fine-tuning job created with ID: ftjob-XvGo1qSBi20m0vpLTMR5HkdC
✅ Fine-tuning completed successfully!                                                               
Fine-tuned model: ft:gpt-4.1-mini-2025-04-14:greatify::BrMVNxDG
Total time elapsed: 00:23:34


### Cancelling a Job

In [30]:
# Ask the API to cancel the fine-tuning job identified by `job_id`
cancel_response = client.fine_tuning.jobs.cancel(job_id)
print("Fine-tuning job with ID {} has been cancelled.".format(job_id))

Fine-tuning job with ID ftjob-Lmd1QZYnleDuEDr7ZGlJHgZP has been cancelled.


### Getting Job Details

In [6]:
# Re-fetch the job and report its outcome, including error details on failure
response = client.fine_tuning.jobs.retrieve(job_id)
print("Job ID: {}".format(response.id))

if response.status == "succeeded":
    print("Fine-tuning completed successfully!")
    print("Fine-tuned model: {}".format(response.fine_tuned_model))
    # Keep the model name around for later use
    fine_tuned_model = response.fine_tuned_model
elif response.status == "cancelled":
    print("Fine-tuning was cancelled.")
elif response.status != "failed":
    print("Job status: {}".format(response.status))
else:
    print("Fine-tuning failed.")
    if not response.error:
        print("No specific error details available.")
    else:
        print("Error code: {}".format(response.error.code))
        print("Error message: {}".format(response.error.message))

Job ID: ftjob-XvGo1qSBi20m0vpLTMR5HkdC
Fine-tuning completed successfully!
Fine-tuned model: ft:gpt-4.1-mini-2025-04-14:greatify::BrMVNxDG


### Specifics of the Completed Fine-Tuning Job

In [9]:

from datetime import datetime
def _format_timestamp(unix_seconds):
    """Render a Unix timestamp as a local 'YYYY-MM-DD HH:MM:SS' string.

    Returns "N/A" when the timestamp is None, so the summary can still be
    printed for a job that has no finish time yet instead of raising TypeError.
    """
    if unix_seconds is None:
        return "N/A"
    return datetime.fromtimestamp(unix_seconds).strftime('%Y-%m-%d %H:%M:%S')


# Summary of the fine-tuning job currently held in `response`
print(f"Fine-tuned Model ID   : {response.fine_tuned_model}")
print(f"Base Model            : {response.model}")
print(f"Status                : {response.status}")
print(f"Training File ID      : {response.training_file}")
print(f"Validation File ID    : {response.validation_file}")
print(f"Number of Epochs      : {response.hyperparameters.n_epochs}")
print(f"Batch Size            : {response.hyperparameters.batch_size}")
print(f"Learning Rate Mult.   : {response.hyperparameters.learning_rate_multiplier}")
print(f"Created At            : {_format_timestamp(response.created_at)}")
print(f"Completed At          : {_format_timestamp(response.finished_at)}")

Fine-tuned Model ID   : ft:gpt-4.1-mini-2025-04-14:greatify::BrMVNxDG
Base Model            : gpt-4.1-mini-2025-04-14
Status                : succeeded
Training File ID      : file-2QLGfsoobRG7Kue3zeTQDU
Validation File ID    : file-JRDufToYrzP5ctcpeMtshh
Number of Epochs      : 4
Batch Size            : 1
Learning Rate Mult.   : 0.1
Created At            : 2025-07-09 15:57:05
Completed At          : 2025-07-09 16:09:51


### Running Inference with the Fine-Tuned Model

In [31]:
import json

# Sample grading request: the question, marks, rubric, student answer and
# difficulty are serialised to JSON and sent as the user message.
message_data = dict(
    question="How does development of suburbs helps to solve the problem of housing in metropolitan cities?",
    marks=2,
    rubric="Rubrics not provided",
    answer="Suburbs are built outside cities, and they help with housing because people can live there instead of in the crowded main city. It is more peaceful and less crowded, so people like it. This helps solve the problem of space in the city.",
    difficulty="hard",
)
message_content = json.dumps(message_data)

# Query the fine-tuned model with the evaluator system prompt plus the request
response = client.chat.completions.create(
    model="ft:gpt-4.1-mini-2025-04-14:greatify::BrMVNxDG",
    messages=[
        dict(
            role="system",
            content="You are an expert answer evaluator. Your job is to evaluate student answers fairly based on a flexible rubric and the specified difficulty level.\n\nInstructions:\n1. Return the score out of the total marks under the \"Score\" key.\n2. Give a brief explanation justifying the score, referencing key points from the rubric under the \"Explanation\" key.\n3. Suggest at least one specific way the student can improve their answer quality or overall academic performance as feedback.\n4. Use the rubric as a guideline, not a rigid checklist and if Rubric is not provided, then use your very own rubrics.\n5. Adjust the strictness of grading based on difficulty:\n   - 'easy' → lenient evaluation; minor issues can be overlooked.\n   - 'medium' → balanced and reasonable evaluation.\n   - 'hard' → stricter evaluation; all points must be well explained and accurate.",
        ),
        dict(role="user", content=message_content),
    ],
)
print(response.choices[0].message.content)

{"Score": "1/2", "Explanation": "The answer lacks formal content and misses key points like affordability, decongestion, or infrastructure development, which are expected in a hard-level answer.", "Feedback": "Explain how suburbs create planned communities or reduce pressure on city services using technical language and examples."}
