## Using LLMs for synthetic data generation

This notebook explores ways to generate synthetic data for training or fine-tuning other LLMs.

In [None]:
import os

import langchain
from langchain.llms import Ollama
from langchain.schema import StrOutputParser
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.pydantic_v1 import BaseModel
from langchain_community.chat_models import GigaChat
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX
from langchain_openai import ChatOpenAI

from api_keys import (HUGGINGFACEHUB_API_TOKEN, 
                      OPENAI_API_KEY, 
                      client_secret_sber, 
                      credentials_sber,
                      )

# Publish the API tokens imported from `api_keys` as environment variables.
# Presumably the Hugging Face / OpenAI clients read them from there — verify.
os.environ.update(
    {
        "HUGGINGFACEHUB_API_TOKEN": HUGGINGFACEHUB_API_TOKEN,
        "OPENAI_API_KEY": OPENAI_API_KEY,
    }
)

## Option 1

In this option, GPT-3.5-turbo is used to generate SQL code. Because OpenAI blocks requests that come from Russian IP addresses, the code must be run either on Google Colab or through a VPN that makes the requests appear to originate from a country other than Russia. Otherwise, `ChatOpenAI()` will raise an error.

In [None]:
# Alternative backend (disabled): the `codellama` model through Ollama.
# To use it, uncomment this block and comment out the ChatOpenAI call below.
# llama_parameters = {
#     "model": "codellama",
#     "top_p": 0.95,
#     "temperature": 0.0,
#     "repeat_penalty": 1.1,
#     "num_gpu": None,
#     "timeout": None,
#     "num_ctx": 4096,
# }
# llm_model = Ollama(**llama_parameters)

# Active backend: OpenAI chat model. The non-zero temperature is presumably
# there to get varied samples — confirm before changing it.
llm_model = ChatOpenAI(
    temperature=0.7,
    model="gpt-3.5-turbo",
)

In [None]:
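# Optional smoke test (kept commented out): ask `llm_model` for a single SQL query.
# NOTE(review): the [INST] / <<SYS>> markers look like the Llama instruction
# format, so this prompt is presumably meant for the Ollama model above — confirm.
# message = """[INST] <<SYS>>You are an SQL code generator. Do not write anything except an SQL query.<</SYS>>

# Please generate one random SQL query for me.[/INST]"""

# llm_model.invoke(message)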

In [None]:
# Output schema passed to the synthetic data generator: one generated record
# pairs a request (`query`) with the SQL that answers it (`answer`).
# NOTE(review): `#` comments are used here instead of a class docstring because
# a docstring may be picked up as the schema description sent to the model — confirm.
class SQLCode(BaseModel):
    query: str  # the request in plain language, as in the `examples` entries
    answer: str  # the SQL statement that answers `query`


# Few-shot examples shown to the model. Each entry has a single "example" key
# whose string holds one "query: ... answer: ..." pair.
# The SQL in every answer must cover all conditions stated in its query;
# an incomplete answer here teaches the generator to produce incomplete SQL.
examples = [
    {"example":
     """query: I need to select all users from database 'employees' whose 'work_category' is 'HR', and who have an annual salary higher than $50,000.,
     answer: SELECT * FROM employees WHERE work_category = 'HR' AND annual_salary > 50000"""},
    {"example":
     """query: How do I select all items from database 'menu' which are priced between $5 and $15?,
     answer: SELECT * FROM menu WHERE price >= 5 and price <= 15"""},
]

In [None]:
# Prompt used to render one few-shot example: it outputs the entry's "example"
# string unchanged. It is bound to its own name so that the OPENAI_TEMPLATE
# imported from langchain_experimental is not shadowed by a local redefinition.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")

# Few-shot prompt assembled from the library-provided prefix and suffix around
# the rendered examples. `subject` and `extra` are the input variables that are
# supplied when the generator is run.
prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=example_prompt,
)

In [None]:
# Build the generator from the chat model, the few-shot prompt and the
# SQLCode schema that describes the shape of each generated record.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=llm_model,
    output_schema=SQLCode,
)

In [None]:
# Run the generator 10 times. `subject` and `extra` provide the values for the
# prompt's input variables of the same names.
synthetic_results = synthetic_data_generator.generate(
    runs=10,
    subject="SQL_CODE",
    extra="Each query must be unique. Make up something interesting.",
)

In [None]:
# Show the value returned by `generate` as the cell's output.
synthetic_results

## Option 2

work in progress...