In [13]:
import yaml
import sys, os
import pandas as pd

from IPython.display import Image, SVG

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from src.utils import *
from src.svg_quality_checks import *
from src.gpt_wrappers import *
from src.langchain_database import *

# show svg 
def show_svg(file):
    """Render an SVG inline in the notebook output.

    `file` is passed unchanged to `IPython.display.SVG`, and the resulting
    object is shown with `display`.
    NOTE(review): presumably `file` is a path to an .svg file — confirm
    against callers.
    """
    svg_object = SVG(file)
    display(svg_object)

# Load the project configuration from the YAML file one directory above the
# current working directory. `yaml.safe_load` is used, so only plain YAML data
# types are constructed. A later cell reads the "OPENAI_KEY" entry of `config`.
with open("../config.yml", "r") as f:
    config = yaml.safe_load(f)

In [2]:
import openai
import wandb

# Hand the API key from the YAML config to the `openai` module and also export
# it as the OPENAI_API_KEY environment variable, so the key itself never
# appears in the notebook source.
# NOTE(review): the environment variable is presumably what the langchain
# wrappers imported in this notebook read — confirm.
openai.api_key = config["OPENAI_KEY"]
os.environ["OPENAI_API_KEY"] = config["OPENAI_KEY"]

## Create an OpenAI fine-tuning dataset

In [3]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAIChat, OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [18]:
# Collect the raw training SVGs through the project helper. The loop that
# follows reads a 'source' entry (used as a file path) and an 'svg' entry
# (used as the completion text) from every element of `sources`.
# NOTE(review): the exact meaning of `description` and `openai_document` is
# defined by create_sources_from_files, which is not visible here — confirm.
sources = create_sources_from_files(path = "../training_data_raw/", 
                                    description = "SVG for",
                                    openai_document = False
                                    )

def _svg_prompt(source_path):
    """Build the fine-tuning prompt for one training SVG.

    Args:
        source_path: path of the SVG file; the file name (without its final
            extension) becomes the object name and the name of the directory
            directly containing the file becomes the style.

    Returns:
        The prompt string for that file.
    """
    # splitext removes only the last extension, so a file name that itself
    # contains dots is kept whole instead of being cut at its first ".".
    # basename/dirname follow the platform's path-separator rules rather than
    # assuming "/".
    object_name = os.path.splitext(os.path.basename(source_path))[0]
    style = os.path.basename(os.path.dirname(source_path))
    return """Generate an SVG of {object_name} with the following style of {style}""".format(
        object_name = object_name,
        style = style
    )


# One prompt/completion record per source; the completion is the SVG text
# itself. The DataFrame is built once from the full list of records, so
# `dataset` is never bound to anything but the final table.
# NOTE(review): the prepare_data report further down flags that the prompts
# have no common trailing separator — decide whether to add one here.
dataset = pd.DataFrame(
    [
        {"prompt": _svg_prompt(source['source']), "completion": source['svg']}
        for source in sources
    ]
)

In [19]:
# Preview the prompt/completion table built in the previous cell.
dataset

Unnamed: 0,prompt,completion
0,Generate an SVG of input-text-disabled-lg with...,"<svg viewBox=""0.00,0.00,320.00,40.00""><g><rect..."
1,Generate an SVG of btn-danger-text-sm with the...,"<svg viewBox=""0.00,0.00,104.00,24.00""><g><rect..."
2,Generate an SVG of tootlip-top-right with the ...,"<svg viewBox=""0.00,0.00,166.00,68.00""><defs><f..."
3,Generate an SVG of tooltip-left-middle with th...,"<svg viewBox=""0.00,0.00,171.00,63.00""><defs><f..."
4,Generate an SVG of input-text-hover with the f...,"<svg viewBox=""0.00,0.00,320.00,32.00""><g><rect..."
...,...,...
655,Generate an SVG of Icon_Button_Filled_[Disable...,"<svg fill=""none"" viewBox=""0.00,0.00,48.00,48.0..."
656,Generate an SVG of Elevated_Filter_Chip_Off_[d...,"<svg fill=""none"" viewBox=""0.00,0.00,121.00,32...."
657,Generate an SVG of Avatar_Small_[disabled] wit...,"<svg fill=""none"" viewBox=""0.00,0.00,24.00,24.0..."
658,Generate an SVG of 3_Lines_Video_List_+_Contro...,"<svg fill=""none"" viewBox=""0.00,0.00,360.00,88...."


In [22]:
# Write the dataset as JSON Lines: one {"prompt": ..., "completion": ...}
# object per line. DATASET_FILE is the file a later cell opens and uploads.
DATASET_FILE = "../data/dataset.jsonl"
dataset.to_json(DATASET_FILE, orient='records', lines=True)

In [21]:
!openai tools fine_tunes.prepare_data -f ../data/dataset.jsonl -q

Analyzing...

- Your file contains 660 prompt-completion pairs
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- All prompts start with prefix `Generate an SVG of `. Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completion
- All completions start with prefix `<svg `. Most of the time you should only add the output data into the completion, without any prefix
- All completions end with suffix `></g></svg>`. This suffix seems very long. Consider replacing with a shorter suffix, such as `\n

In [23]:
# Upload the JSONL training file to OpenAI for fine-tuning. The file is opened
# in a `with` block so the handle is closed as soon as the upload call
# returns, instead of staying open for the life of the kernel.
# NOTE(review): this uploads DATASET_FILE exactly as written by the notebook.
# If the prepare_data step produced a separate prepared file, that file is not
# the one being uploaded — confirm this is intended.
with open(DATASET_FILE, "rb") as dataset_fh:
    upload_response = openai.File.create(
        file=dataset_fh,
        purpose='fine-tune'
    )

# Keep the uploaded file's id for later reference.
file_id = upload_response.id
upload_response

<File file id=file-xC7FHjj0O0pf6JBejDZSifJk at 0x1583f8270> JSON: {
  "bytes": 1228254,
  "created_at": 1681988250,
  "filename": "file",
  "id": "file-xC7FHjj0O0pf6JBejDZSifJk",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

## Fine-tune the model

In [56]:
# Start a fine-tune job on the uploaded training file. No `model` argument is
# passed, so the base model is whatever the API chooses by default.
fine_tune_response = openai.FineTune.create(
    training_file=upload_response['id']
    ) # alternative with an explicit base model: openai.FineTune.create(training_file=upload_response['id'], model="davinci")
# Show the events recorded on the newly created job.
fine_tune_response.events

[<OpenAIObject fine-tune-event at 0x1585aaf40> JSON: {
   "created_at": 1681988844,
   "level": "info",
   "message": "Created fine-tune: ft-6XOvgdZneEGlBPzhJNsXSH7V",
   "object": "fine-tune-event"
 }]

In [66]:
# List the fine-tune jobs via the OpenAI CLI to check their status.
! openai api fine_tunes.list

{
  "data": [
    {
      "created_at": 1681988279,
      "fine_tuned_model": null,
      "hyperparams": {
        "batch_size": null,
        "learning_rate_multiplier": null,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-ZBtyAGGRZgpM8LInVRJ9xHZk",
      "model": "curie",
      "object": "fine-tune",
      "organization_id": "org-4whisRJIhIJtFAsnpmbxhLSC",
      "result_files": [],
      "status": "cancelled",
      "training_files": [
        {
          "bytes": 1228254,
          "created_at": 1681988250,
          "filename": "file",
          "id": "file-xC7FHjj0O0pf6JBejDZSifJk",
          "object": "file",
          "purpose": "fine-tune",
          "status": "processed",
          "status_details": null
        }
      ],
      "updated_at": 1681988673,
      "validation_files": []
    },
    {
      "created_at": 1681988351,
      "fine_tuned_model": null,
      "hyperparams": {
        "batch_size": null,
        "learning_rate_multiplier