### Generate training data

In [2]:
import pandas as pd
import json
from ground_truth_aiopoke import PokemonGroundTruth

pokemon_names_path = "data/pokemon_names.json"
train_save_path = "data/finetune_data.csv"
train_data = await PokemonGroundTruth().generate_data(pokemon_names_path)
train_data.to_csv(train_save_path, index=False)
print(f"Data saved to {train_save_path}")

Appended Pokemon chuck: ['ferrothorn']
Appended Pokemon chuck: ['porygon', 'gothorita', 'buzzwole']
Appended Pokemon chuck: ['mamoswine', 'musharna', 'gastrodon', 'bunnelby']
Appended Pokemon chuck: ['uxie', 'wigglytuff', 'cinccino']
Appended Pokemon chuck: ['claydol']
Appended Pokemon chuck: ['spidops', 'carnivine']
Appended Pokemon chuck: ['suicune']


CancelledError: 

### Format training data

In [7]:
from prompt import messages_for_model, POKEMON_SYSTEM_MESSAGE


def format_csv_for_train(model, train_data_path):
    """Convert the generated CSV into a fine-tuning file for ``model``.

    Each row's ``Input`` and ``Output`` values are passed, together with
    ``POKEMON_SYSTEM_MESSAGE``, to ``messages_for_model``. The results are
    written to ``data/train.csv`` (a single ``text`` column) when ``model``
    is ``"llama"``, or to ``data/train.jsonl`` (one ``{"messages": ...}``
    JSON object per line) when ``model`` is ``"gpt"``.

    Args:
        model: Target model family, either ``"llama"`` or ``"gpt"``.
        train_data_path: Path to a UTF-8 CSV file with ``Input`` and
            ``Output`` columns.

    Raises:
        NotImplementedError: If ``model`` is neither ``"llama"`` nor
            ``"gpt"``. Raised before any file is read or written.
    """
    # Validate first so an unsupported model fails immediately instead of
    # after the whole CSV has been read and formatted.
    if model not in ("llama", "gpt"):
        raise NotImplementedError(f"Model '{model}' not supported.")

    with open(train_data_path, encoding="utf8") as data_file:
        data = pd.read_csv(data_file)

    # A plain list comprehension also works for a CSV that has a header but
    # no data rows (it simply yields an empty list).
    texts = [
        messages_for_model(
            model=model,
            system_message=POKEMON_SYSTEM_MESSAGE,
            user_message=user_message,
            assistant_message=assistant_message,
        )
        for user_message, assistant_message in zip(data["Input"], data["Output"])
    ]

    if model == "llama":
        pd.DataFrame({"text": texts}).to_csv("data/train.csv", index=False)
        print("Train data saved to data/train.csv")
    else:  # model == "gpt"
        with open("data/train.jsonl", "w", encoding="utf8") as jsonl_file:
            for messages in texts:
                jsonl_file.write(json.dumps({"messages": messages}) + "\n")
        print("Train data saved to data/train.jsonl")


# Build both training files from the same generated CSV: the GPT JSONL file
# first, then the Llama CSV file.
for target_model in ("gpt", "llama"):
    format_csv_for_train(model=target_model, train_data_path=train_save_path)

Train data saved to data/train.jsonl
Train data saved to data/train.csv


### Finetune GPT3.5

In [9]:
import os

import openai
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Fail here with a clear message instead of silently setting the key to None
# and getting an authentication error from the first API call later on.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Upload the GPT training file. The context manager guarantees the local file
# handle is closed once the upload call returns (or raises).
with open("data/train.jsonl", "rb") as train_file:
    file = openai.File.create(file=train_file, purpose="fine-tune")
print(file)

In [None]:
# The uploaded file usually needs a couple of minutes before it is ready;
# run this cell once it is.

openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    training_file=file["id"],
)

In [11]:
# The fine-tuning job typically needs around 30 minutes to complete.

ft_job = openai.FineTuningJob.list(limit=1)
print(ft_job)

# Report the resulting model name and the status of the first listed job.
latest_job = ft_job["data"][0]
for field in ("fine_tuned_model", "status"):
    print(latest_job[field])

{
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job",
      "id": "ftjob-U01WZSLIyVY00hyku7dgyi9Z",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1692969616,
      "finished_at": 1692971409,
      "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal::7rRUMYgA",
      "organization_id": "org-YUi2D6gnQ3ezE2TdIVKbFt1e",
      "result_files": [
        "file-aVuZ9RHu25CKO66ysrQSpF3V"
      ],
      "status": "succeeded",
      "validation_file": null,
      "training_file": "file-odhZzGyN37pPgrqLaez4dfzy",
      "hyperparameters": {
        "n_epochs": 3
      },
      "trained_tokens": 881070
    }
  ],
  "has_more": false
}
ft:gpt-3.5-turbo-0613:personal::7rRUMYgA
succeeded


### Finetune Llama 2

In [13]:
# An already trained LoRA adapter is available on Hugging Face at
# tolevi/Llama-2-7b-Chat-Pokemon-GPTQ.
# The command below points --data-path at data/ and --text-column at "text",
# matching data/train.csv as written by format_csv_for_train(model="llama").

!autotrain llm --train --model TheBloke/Llama-2-7b-Chat-GPTQ --project-name tolevi/Llama-2-7b-Chat-Pokemon-GPTQ --data-path data/ --text-column text --fp16 --use-peft --use-int4

^C
^C
