<a href="https://colab.research.google.com/github/SARA3SAEED/abu-LLM/blob/main/1_abu_part_02_output_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install 'litellm[proxy]'==1.44.9 openai==1.42.0

In [None]:
import os
from google.colab import userdata
# Copy the API keys stored as Colab secrets into the process environment under
# the variable names that llm.config refers to (GROQ_API_KEY / OPENAI_API_KEY).
for env_var, secret_name in (
    ("GROQ_API_KEY", "GROQ_API_KEY"),
    ("OPENAI_API_KEY", "openai-colab"),
):
    os.environ[env_var] = userdata.get(secret_name)

# ============ kill any litellm processes
# !pkill -f litellm

In [None]:
%%writefile llm.config
# Model registry for the LiteLLM proxy started from this file.
# Each entry pairs an alias (`model_name`) with the provider-prefixed model id
# and credentials under `litellm_params`.
# NOTE(review): `os.environ/<NAME>` is presumably resolved by LiteLLM to the
# value of that environment variable when the config is loaded - confirm.
model_list:

  # Groq-hosted Gemma 2 9B (instruction-tuned).
  - model_name: "groq-gemma9b"
    litellm_params:
      model: "groq/gemma2-9b-it"
      api_key: "os.environ/GROQ_API_KEY"

  # Groq-hosted Mixtral 8x7B; shares the Groq key with the entry above.
  - model_name: "groq-mixtral"
    litellm_params:
      model: "groq/mixtral-8x7b-32768"
      api_key: "os.environ/GROQ_API_KEY"

  # OpenAI GPT-4o mini; uses the separate OpenAI key.
  - model_name: "openai-gpt4o-mini"
    litellm_params:
      model: "openai/gpt-4o-mini"
      api_key: "os.environ/OPENAI_API_KEY"

In [None]:
# Launch the LiteLLM proxy in the background using llm.config; its output is
# expected in nohup.out, which the next line reads.
!nohup litellm --config llm.config &
# Wait 15 seconds, then show the last lines of the proxy output.
!sleep 15 && tail nohup.out

### Set LiteLLM Logs

In [None]:
import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm import completion, acompletion
import os
import json

# Directory that collects the JSONL log of LiteLLM calls; created on first run.
logs_dir = "./llm-logs"
os.makedirs(logs_dir, exist_ok=True)


def log_post_api_call(  kwargs,                 # kwargs to completion
                        completion_response,    # response from completion
                        start_time, end_time    # start/end time
                        ):
    """Append one JSON line describing a finished LiteLLM call.

    The four positional arguments are written to
    ``<logs_dir>/post-llm-call.jsonl`` as a single JSON object. Values that
    ``json`` cannot serialise are converted with ``str``.

    Args:
        kwargs: Keyword arguments that were passed to ``completion``.
        completion_response: Response object returned by ``completion``.
        start_time: When the call started.
        end_time: When the call finished.
    """
    record = {
        "kwargs": kwargs,
        "completion_response": completion_response,
        "start_time": start_time,
        "end_time": end_time,
    }
    line = json.dumps(record, default=str, ensure_ascii=False) + "\n"
    # ensure_ascii=False emits raw non-ASCII text (e.g. the Arabic prompts), so
    # the file is opened as UTF-8 explicitly instead of relying on the locale
    # default, which may not be able to encode it.
    with open(os.path.join(logs_dir, "post-llm-call.jsonl"), "a", encoding="utf-8") as dest:
        dest.write(line)

litellm.success_callback = [log_post_api_call]

## Output Parser - Basics

##### Example 1

In [None]:
import openai
import json
from pprint import pprint
from pydantic import BaseModel, Field
from litellm import completion, acompletion
from typing import List

def parse_json(text):
    """Decode a JSON document from an LLM reply, tolerating Markdown fences.

    Args:
        text: Raw reply text, optionally wrapped in ```json ... ``` fences.
            ``None`` or any other non-string value counts as "no reply".

    Returns:
        The decoded Python object, or ``None`` when there is no text or the
        text is not valid JSON.
    """
    # A reply without content arrives as None; treat it like unparsable text
    # instead of failing on `.replace`.
    if not isinstance(text, str):
        return None
    text = text.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Catching only this keeps
        # KeyboardInterrupt and genuine programming errors visible.
        return None

# Schema handed to the model in Example 1: one required field per fact to
# extract about a person. The `description` strings are part of the JSON
# schema that is pasted into the prompt.
class PersonalDetails(BaseModel):
    # Identity
    name: str = Field(..., description="The name of the person")
    age: int = Field(..., description="The age of the person")
    gender: str = Field(..., description="The gender of the person")
    nationality: str = Field(..., description="The nationality of the person")

    # Background
    graduated_in: str = Field(..., description="The university or college that the person graduated in")
    occupation: str = Field(..., description="The occupation of the person")
    interests: str = Field(..., description="The interests of the person")

    # Related people, as a list of names
    similiar_figures_names: List[str] = Field(..., description="The similiar figures names of the person")

# client = openai.OpenAI(
#     api_key="anything",
#     base_url="http://0.0.0.0:4000"
# )

# response = client.chat.completions.create(model="groq-gemma9b", messages = [])

In [None]:
# Example 1: extract `PersonalDetails` fields from an Arabic biography.
# The request is made with `litellm.completion` and a provider-prefixed model
# id; the proxy aliases declared in llm.config are not used by this call.
system_prompt = "\n".join([
    "You are an NLP data paraser. You have to parse natural text to specific data scheme.",
    "You will be provided by a text and a pydantic scheme.",
    "You have to generate a json object that matches the pydantic scheme, and filled with extracted data from text",
    "Do not translate values. Fill the values as they are in the text.",
    "Do not generate any introduction or conclusions. Just generate the JSON output.",
])

# The user turn ends with an opening ```json fence so the reply continues
# straight into the JSON body.
user_prompt = "\n".join([
    "### Input Text:",
    """ علي مصطفى مشرفة باشا (11 يوليو 1898 – 15 يناير 1950) هو عالم فيزياء نظرية مصري. من مواليد دمياط. يُلقّب بأينشتاين العرب لأن أبحاثه كانت في نفس المجال ونفس الموضوعات التي كانت أبحاث ألبرت أينشتاين تدور حولها. تخرج في مدرسة المعلمين العليا عام 1917، وحصل على دكتوراه فلسفة العلوم من جامعة لندن عام 1923، ثم كان أول مصري يحصل على درجة دكتوراه العلوم من إنجلترا من جامعة لندن عام 1924. عُيّن أستاذاً للرياضيات في مدرسة المعلمين العليا ثم للـرياضيات التطبيقية في كلية العلوم عام 1926. مُنح لقب أستاذ من جامعة القاهرة وهو دون الثلاثين من عمره. انتُخب في عام 1936 عميدا لكلية العلوم، فأصبح بذلك أول عميد مصري لها. حصل على لقب الباشاوية من الملك فاروق. تتلمذ على يده مجموعة من أشهر علماء مصر، ومن بينهم سميرة موسى.""",
    "",
    "### PyDantic Scheme:",
    PersonalDetails.schema_json(),
    "",
    "### Output JSON:",
    "```json"
])

response = completion(
    model="groq/gemma2-9b-it",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
)

# Print only when the provider returned at least one choice.
if response and response.choices:
    print("model:", response.model)
    resp_text = response.choices[0].message.content
    # parse_json yields None when the reply is not valid JSON.
    pprint(parse_json(resp_text))

model: groq/gemma2-9b-it
{'age': None,
 'gender': 'male',
 'graduated_in': 'University of London',
 'interests': 'Physics',
 'name': 'علي مصطفى مشرفة باشا',
 'nationality': 'Egyptian',
 'occupation': 'Professor of Mathematics & Applied Mathematics',
 'similiar_figures_names': ['ألبرت أينشتاين']}


##### Example 2

In [None]:
import openai
import json
from pprint import pprint
from litellm import completion, acompletion
from pydantic import BaseModel, Field
from typing import List
from enum import Enum

def parse_json(text):
    """Decode a JSON document from an LLM reply, tolerating Markdown fences.

    Args:
        text: Raw reply text, optionally wrapped in ```json ... ``` fences.
            ``None`` or any other non-string value counts as "no reply".

    Returns:
        The decoded Python object, or ``None`` when there is no text or the
        text is not valid JSON.
    """
    # A reply without content arrives as None; treat it like unparsable text
    # instead of failing on `.replace`.
    if not isinstance(text, str):
        return None
    text = text.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Catching only this keeps
        # KeyboardInterrupt and genuine programming errors visible.
        return None

# Enum of the intents that can be assigned to a message.
# It subclasses `str`, so members compare equal to their plain string values;
# every value is identical to its member name.
class MessageIntent(str, Enum):
    say_greeting = "say_greeting"
    ask_weather_status = "ask_weather_status"
    ask_light_off = "ask_light_off"
    ask_light_on = "ask_light_on"
    ask_light_status = "ask_light_status"
    ask_not_specified = "ask_not_specified"

# Enum of the sentiment labels for a message (string-valued, like
# MessageIntent). Exactly three labels: positive, negative, none detected.
class MessageSentiment(str, Enum):
    positive_sentiment = "positive_sentiment"
    negative_sentiment = "negative_sentiment"
    no_detected_sentiment = "no_detected_sentiment"

# Schema for one parsed user message: its intents plus a sentiment label.
class Message(BaseModel):
    # Required list of MessageIntent values, declared with min_items=1 and
    # max_items=3.
    intents: List[MessageIntent] = Field(..., description="The set of detected intents of the message", min_items=1, max_items=3)
    # Required single MessageSentiment value.
    sentiment: MessageSentiment = Field(..., description="The sentiment of the message")


In [None]:
# Example 2: classify the intents and sentiment of an Arabic message and
# validate the reply against the `Message` model.
system_prompt = "\n".join([
    "You are an NLP data paraser. You have to parse natural text to specific data scheme.",
    "You will be provided by a text and a pydantic scheme.",
    "You have to generate a json object that matches the pydantic scheme, and filled with extracted data from text",
    "Do not translate values. Fill the values as they are in the text.",
    "Do not generate any introduction or conclusions. Just generate the JSON output.",
])

user_prompt = "\n".join([
    "### Input Text:",
    "كم درجة الحرارة خارج الغرفة حاليا ؟ و يا ريت تضوي نور الغرفة",
    # Alternative input to try:
    # "ولعلي نور الصالة. فاهم يا غبي, الصالة مش الأوضة"
    "",
    "### PyDantic Scheme:",
    Message.schema_json(),
    "",
    "### Output JSON:",
    "```json"
])

response = completion(
    model="groq/gemma2-9b-it",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
)

if response and response.choices:
    print("model:", response.model)
    resp_text = response.choices[0].message.content
    resp_json = parse_json(resp_text)
    # Validate only when the reply parsed to a non-empty JSON value.
    if resp_json:
        filled_scheme = Message(**resp_json)
        pprint(filled_scheme.dict())

In [None]:
filled_scheme.intents[0].value

'ask_weather_status'

### Output Parser using LangChain

In [None]:
!pip install langchain==0.1.20

In [None]:
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.messages import HumanMessage

In [None]:
# LangChain chat model backed by LiteLLM, using the same Groq Gemma model id
# as the direct `completion` calls earlier in the notebook.
chat = ChatLiteLLM(model="groq/gemma2-9b-it", temperature=0.5)

In [None]:
# Smoke-test the chat model with a single human message.
messages = [
    HumanMessage(
        content="كيف يسطع القمر بالنور ؟"
    )
]
# Use `invoke` rather than calling the model object directly: `chat(messages)`
# is deprecated in langchain-core 0.1.x in favour of `invoke`.
chat.invoke(messages)

In [None]:
import json
from pprint import pprint
from litellm import completion, acompletion
from pydantic import BaseModel, Field
from typing import List
from enum import Enum

# Enum of the intents that can be assigned to a message.
# It subclasses `str`, so members compare equal to their plain string values;
# every value is identical to its member name.
class MessageIntent(str, Enum):
    say_greeting = "say_greeting"
    ask_weather_status = "ask_weather_status"
    ask_light_off = "ask_light_off"
    ask_light_on = "ask_light_on"
    ask_light_status = "ask_light_status"
    ask_not_specified = "ask_not_specified"

# Sentiment labels for the LangChain example. This replaces the earlier
# MessageSentiment definition: labels are spelled `message_with_*_sentiment`
# and the third label is "neutral" instead of "no_detected_sentiment".
class MessageSentiment(str, Enum):
    message_with_positive_sentiment = "message_with_positive_sentiment"
    message_with_negative_sentiment = "message_with_negative_sentiment"
    message_with_neutral_sentiment = "message_with_neutral_sentiment"

# Schema for one parsed user message; this is the model passed to
# PydanticOutputParser in the LangChain example.
class Message(BaseModel):
    # Required list of MessageIntent values, declared with min_items=1 and
    # max_items=3.
    intents: List[MessageIntent] = Field(..., description="The set of detected intents of the message", min_items=1, max_items=3)
    # Required single MessageSentiment value.
    sentiment: MessageSentiment = Field(..., description="The sentiment of the message")

In [None]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator

In [None]:
# Build the parser first, then feed its format instructions into the prompt
# as a pre-filled (partial) template variable.
parser = PydanticOutputParser(pydantic_object=Message)
format_instructions = parser.get_format_instructions()

prompt = PromptTemplate(
    template="أجب عن سؤال المستخدم.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Pipeline: render the prompt -> call the chat model -> parse into `Message`.
chain = prompt | chat | parser

user_query = "كم درجة الحرارة خارج الغرفة حاليا ؟ و يا ريت تضوي نور الغرفة"
chain.invoke({"query": user_query})

Message(intents=[<MessageIntent.ask_light_on: 'ask_light_on'>, <MessageIntent.ask_weather_status: 'ask_weather_status'>], sentiment=<MessageSentiment.message_with_positive_sentiment: 'message_with_positive_sentiment'>)

## Synthetic Data Generation

#### Example 1

In [None]:
import json
from pprint import pprint
from litellm import completion, acompletion
from pydantic import BaseModel, Field
from typing import List
from enum import Enum

# class Receipe(BaseModel):
#     receipe_name: str = Field(..., description="The name of the receipe")
#     ingredients: List[str] = Field(..., description="The ingredients of the receipe", min_items=1, max_items=20)
#     cooking_steps: List[str] = Field(..., description="The cooking steps of the receipe", min_items=1, max_items=20)
#     cook_time: int = Field(..., description="The cooking time of the receipe in minutes")
#     serving_instructions : List[str] = Field(..., description="The serving instructions of the receipe", min_items=1, max_items=20)

# Recipe schema with Arabic field descriptions; the descriptions are part of
# the JSON schema that is pasted into the generation prompt.
class Receipe(BaseModel):
    # Description (Arabic): "name of the recipe".
    receipe_name: str = Field(..., description="اسم الوصفة")
    # Description (Arabic): "ingredients of the recipe"; 1 to 10 items.
    ingredients: List[str] = Field(..., description="مكونات الوصفة", min_items=1, max_items=10)
    # Description (Arabic): "steps for cooking the recipe"; 1 to 10 items.
    cooking_steps: List[str] = Field(..., description="خطوات طهو الوصفة.", min_items=1, max_items=10)
    # Description (Arabic): "number of minutes needed to cook the recipe".
    cook_time: int = Field(..., description="عدد الدقائق اللازمة لطهو الوصفة.")
    # Description (Arabic): "instructions for serving the recipe after cooking"; 1 to 5 items.
    serving_instructions : List[str] = Field(..., description="إرشادات لتقديم الوصفة بعد الطهو.", min_items=1, max_items=5)


In [None]:
# Synthetic data, example 1: ask the model for one recipe in Arabic, returned
# as JSON that matches the `Receipe` schema, then validate and print it.
response = completion(model="groq/gemma2-9b-it", messages = [
    {
        "role": "system",
        "content": "\n".join([
            "You are an NLP data generator. You have to generate a popular reciepe in Middle East.",
            "You will be provided by a receipe name, you have to generate the receipe details in Arabic.",
            # The trailing comma matters: without it Python concatenates this
            # string with the next one and the two instructions run together
            # with no separator.
            "The receipe details must be in Arabic. Avoid English terms.",
            "You have to generate a json object that matches the pydantic scheme, and filled with extracted data from text",
            "Do not generate any introduction or conclusions. Just generate the JSON output.",
        ]),
    },
    {
        "role": "user",
        "content":"\n".join([
            "### Receipe Name:",
            "دولما عراقي",
            "",
            "### PyDantic Scheme:",
            Receipe.schema_json(),
            "",
            "### Receipe Details in JSON:",
            # Open the fence so the reply continues straight into the JSON body.
            "```json"
        ])
    }
], temperature=0.5, max_tokens=1024)

if response and response.choices:
    print("model:", response.model)
    resp_text = response.choices[0].message.content
    resp_json = parse_json(resp_text)
    # Validate only when the reply parsed to a non-empty JSON value.
    if resp_json:
        filled_scheme = Receipe(**resp_json)
        pprint(filled_scheme.dict())

#### Example 2

In [None]:
# Recipe schema with English field descriptions. This replaces the earlier
# `Receipe` definition and raises every list limit to 20 items.
class Receipe(BaseModel):
    receipe_name: str = Field(..., description="The name of the receipe")
    # 1 to 20 ingredient strings.
    ingredients: List[str] = Field(..., description="The ingredients of the receipe", min_items=1, max_items=20)
    # 1 to 20 step strings.
    cooking_steps: List[str] = Field(..., description="The cooking steps of the receipe", min_items=1, max_items=20)
    # Integer number of minutes, per the description.
    cook_time: int = Field(..., description="The cooking time of the receipe in minutes")
    # 1 to 20 serving-instruction strings.
    serving_instructions : List[str] = Field(..., description="The serving instructions of the receipe", min_items=1, max_items=20)

# One complete fine-tuning sample: the system and user messages plus the
# recipe the model is expected to produce, nested as a `Receipe`.
class GeneratedPrompt(BaseModel):
    system_message: str = Field(..., description="The system message to guide the language model about its rule.")
    user_message: str = Field(..., description="The user message to ask the language model for the receipe")
    # Nested model; the reply must contain a full recipe object under this key.
    generated_receipe: Receipe = Field(..., description="The generated receipe")

In [None]:
# Synthetic data, example 2: generate a whole training sample (system message,
# user message and recipe) as JSON matching `GeneratedPrompt`.
system_prompt = "\n".join([
    "You are an NLP data generator. You have to generate training data to finetune a language model on generating popular reciepes in Middle East.",
    "You will be provided by a receipe name, you have to generate the full training sample including system message, user message, and the generated model response for the receipe details in Arabic.",
    "You have to generate a json object that matches the pydantic scheme, and filled with extracted data from text",
    "Do not generate any introduction or conclusions. Just generate the JSON output.",
])

user_prompt = "\n".join([
    "### Receipe Name:",
    "دولما عراقي",
    "",
    "### PyDantic Scheme:",
    GeneratedPrompt.schema_json(),
    "",
    "### LLM Training Sample:",
    "```json"
])

response = completion(
    model="groq/gemma2-9b-it",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.5,
    max_tokens=1024,
)

if response and response.choices:
    print("model:", response.model)
    resp_text = response.choices[0].message.content
    resp_json = parse_json(resp_text)
    # Validate only when the reply parsed to a non-empty JSON value.
    if resp_json:
        filled_scheme = GeneratedPrompt(**resp_json)
        pprint(filled_scheme.dict())

# **DIY**

### Task:

Develop an LLM Assistant to generate valid data to finetune an LLM on a specific task.

Choose one of these tasks:

    - Assisting a customer support center in detecting the sentiment of user messages and identifying which products are mentioned.

    - Creating training data for an LLM that can identify and correct grammatical errors in sentences.

    - Developing a dataset for training an LLM to recognize and extract entities from medical texts, including diseases, medications, and symptoms.
    
    - Generating a dataset that identifies the user's dialect in a restaurant and responds in the same dialect.

### Guides

- Set up the LiteLLM Proxy
- Keep logs in a JSONL file
- Set up the Pydantic models
- Set up the prompt template
- Save the generated data to a JSONL file
- The generated data must follow this [popular format](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/alpaca_en_demo.json)
- Generate at least 50 samples.
