In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False
)

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:04<00:00,  2.05s/it]
Device set to use cuda


In [3]:
messages = [
    {
        "role":"user",
        "content": "Create a funny joke about chickens"
    }
]
output = pipe(messages)
print(output[0]["generated_text"])

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
You are not running the flash-attention implementation, expect numerical differences.


 Why did the chicken join the band? Because it had the drumsticks!


In [4]:
# Check internally converted prompt
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)

<|user|>
Create a funny joke about chickens<|end|>
<|endoftext|>


In [5]:
# Using a high temperature
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=True
)

output = pipe(messages, do_sample=True, temperature=1)
print(output[0]["generated_text"])

Device set to use cuda


 Why do chickens hate going to school? Because it’s really difficult to follow their teacher!


In [6]:
# Use a high top_p (more choices of output samples can be chosen)
output = pipe(messages, do_sample=True, top_p=1)
print(output[0]["generated_text"])

 Why don’t chickens use mobile phones? Because they prefer to “peck” at their eggs rather than touch keys!


## In-Context Learning: providing examples

In [7]:
# Goal: generate a sentence with a made-up fancy word
one_shot_prompt = [
    {
        "role": "user",
        "content": "A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:"
    },
    {
        "role": "assistant",
        "content": "I have a Gigamuru that my uncle gave me as a gift. I love to play it at home."
    },
    {
        "role": "user",
        "content": "To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:"
    }
]
# Check which template Phi-3 converts this prompt to
print(tokenizer.apply_chat_template(one_shot_prompt, tokenize=False))

<|user|>
A 'Gigamuru' is a type of Japanese musical instrument. An example of a sentence that uses the word Gigamuru is:<|end|>
<|assistant|>
I have a Gigamuru that my uncle gave me as a gift. I love to play it at home.<|end|>
<|user|>
To 'screeg' something is to swing a sword at it. An example of a sentence that uses the word screeg is:<|end|>
<|endoftext|>


In [8]:
# Generate sentence 
outputs = pipe(one_shot_prompt)
print(outputs[0]["generated_text"])

 During the duel, the knight attempted to screeg his opponent, but he was skilled enough to dodge and counter the strikes.


## Chain prompting: breaking up the problem

In [9]:
product_prompt = [
    {"role":"user", 
     "content":"Create a name and slogan for a chatbot that leverages LLMs."}
]
outputs = pipe(product_prompt)
product_description = outputs[0]["generated_text"]
print(product_description)

 Name: Chattermind AI
Slogan: "Unleashing the power of LLMs for conversational magic."


In [10]:
# chaining to the next prompt, generate a sales pitch}
sales_prompt = [
    {"role":"user", "content":f"Generate a very short sales pitch for the following product: '{product_description}'"}
]
outputs = pipe(sales_prompt)
sales_pitch = outputs[0]["generated_text"]
print(sales_pitch)

 Transform your business with Chattermind AI! Embrace the future of communication with our "Unleashing the power of LLMs for conversational magic". Experience cutting-edge AI technology that delivers engaging, consistent, and intelligent conversations. Elevate your customer experience and unlock unlimited opportunities with Chattermind AI today!


## Chain-of-thought: think before answering

In [11]:
# Answering with the chain of thought
cot_prompt = [
    {"role":"user", "content":"Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?"},
    {"role":"assistant", "content":"Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5+6=11. The answer is 11."},
    {"role":"user", "content":"The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?"}
]
outputs = pipe(cot_prompt)
print(outputs[0]["generated_text"])

 The cafeteria had 23 apples. They used 20 to make lunch, so there were 23 - 20 = 3 apples left. Then they bought 6 more apples, which would bring the total to 3 + 6 = 9 apples. The answer is 9.


In [12]:
# Zero-shot chain-of-thought
zeroshot_cot_prompt = [
    {"role":"user", "content":"The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? Let's think step-by-step."}
]
outputs = pipe(zeroshot_cot_prompt)
print(outputs[0]["generated_text"])

 Step 1: Find out how many apples are left after using 20 apples.
The cafeteria started with 23 apples and used 20 for lunch.
23 apples - 20 apples = 3 apples left

Step 2: Add the extra 6 apples bought to the remaining apples.
The cafeteria had 3 apples left and bought 6 more.
3 apples + 6 apples = 9 apples

So, the cafeteria now has 9 apples.


## Tree-of-thought: mimic EXPERTS discussion

In [13]:
zeroshot_tot_prompt = [
    {
        "role":"user", 
        "content":"""
        Imagine three different experts are answering this question.
        All experts will write down 1 step of their thinking, then share it with the group.
        Then all experts will go on to the next step, etc. 
        If any expert realizes they're wrong at any poing then they leave.
        The question is 'The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?'
        Make sure to discuss the results.
        """}
]
outputs = pipe(zeroshot_tot_prompt)
print(outputs[0]["generated_text"])

 Expert 1:

Step 1: Start by acknowledging the initial number of apples in the cafeteria. (23 apples)

Step 2: Subtract the number of apples used for making lunch (23 - 20 = 3 apples left).

Step 3: Add the number of apples bought (3 + 6 = 9 apples).

Expert 1's final answer: The cafeteria now has 9 apples.

Expert 2:

Step 1: Identify the starting point, which is the number of apples the cafeteria had initially (23 apples).

Step 2: Subtract the number of apples used to make the lunch (23 - 20 = 3 apples).

Step 3: Add the total number of apples bought (3 + 6 = 9 apples).

Expert 2's final answer: The cafeteria now has 9 apples.

Expert 3:

Step 1: Note the beginning amount of apples in the cafeteria (23 apples).

Step 2: Subtract the 20 apples used for making lunch (23 - 20 = 3 apples).

Step 3: Add the 6 apples bought to the remaining apples (3 + 6 = 9 apples).

Expert 3's final answer: The cafeteria has 9 apples now.

Summary/Discussion:

All three experts agreed that the cafeteria

# Output verification
(there is 3rd method of FINE-TUNING, which is not covered here but later)
## Providing examples

In [14]:
# Create a character profile for a RPG game - zeroshot
zeroshot_prompt = [
    {"role":"user",
    "content":"Create a character profile for an RPG game in JSON format."}
]
outputs = pipe(zeroshot_prompt)
print(outputs[0]["generated_text"])

 ```json
{
  "name": "Eldrid the Brave",
  "age": 25,
  "species": "Human",
  "class_type": "Warrior",
  "race": "Human",
  "gender": "Male",
  "appearance": {
     "height": "180 cm",
     "weight": "80 kg",
     "age_indicator": "Young",
     "body_type": "Muscular",
     "facial_features": {
       "eyes": "Brown",
       "hair_color": "Black",
       "hair_style": "Shaven",
       "facial_hair": "Slight stubble"
     },
     "clothing": {
       "armor": "Chainmail",
       "boots": "Leather",
       "weapons": {
         "primary": "Two-handed Axe",
         "secondary": "Short sword"
       }
     }
  },
  "personality_traits": [
     "Brave",
     "Loyal",
     "Determined",
     "Fierce"
  ],
  "skills": {
     "melee_combat": 100,
     "long_range_combat": 20,
     "stealth": 50,
     "magic": 10
  },
  "background": "Eldrid was born into a noble family in the city-state of Arden. As a child, he showed a natural talent for combat and quickly rose through the ranks of the city'

In [15]:
# Create a character profile for a RPG game - oneshot
one_shot_template = """ Create a short character profile for an RPG game. Make sure to only use this format:
{
    "description": "A SHORT DESCRIPTION",
    "name": "THE CHARACTER'S NAME",
    "armor": "ONE PIECE OF ARMOR",
    "weapon": "ONE OR MORE WEAPONS"
}
"""
one_shot_prompt = [
    {"role":"user","content":one_shot_template}
]
outputs = pipe(one_shot_prompt)
print(outputs[0]["generated_text"])


 ```

{

    "description": "A sagacious and battle-hardened druid, deeply connected to the mystical forces of nature, bound by a sacred oath to protect the ancient woods.",

    "name": "Aranel Windwhisper",

    "armor": "Sylvan Cloak",

    "weapon": "Yew Bow, Silver Arrow, Nature's Whisper Staff"

}

```


## Grammar: constrained sampling

In [16]:
# Cleaning VRAM off the old model first
import gc, torch
del model, tokenizer, pipe
gc.collect()
torch.cuda.empty_cache()

In [19]:
from llama_cpp.llama import Llama
llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*fp16.gguf",
    n_gpu_layers=-1, # all layers of model will be on GPU
    n_ctx=2048, # context size
    verbose=False
)

llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [20]:
# ask model to create RPG character in JSON format
output = llm.create_chat_completion(
    messages=[
        {"role":"user", "content":"Create a warrior for an RPG in JSON format."}
    ],
    response_format={"type":"json_object"},
    temperature=0
)["choices"][0]["message"]["content"]

# to check if output is indeed a json => convert <json> to <string> and reconvert to <json> to check
import json
json_output = json.dumps(json.loads(output), indent=4)
print(json_output)

{
    "warrior": {
        "name": "Eldric Stormbringer",
        "class": "Warrior",
        "level": 5,
        "attributes": {
            "strength": 18,
            "dexterity": 10,
            "constitution": 16,
            "intelligence": 8,
            "wisdom": 10,
            "charisma": 12
        },
        "skills": [
            {
                "name": "Martial Arts",
                "proficiency": 20,
                "description": "Expert in hand-to-hand combat and weapon handling."
            },
            {
                "name": "Shield Block",
                "proficiency": 18,
                "description": "Highly skilled at deflecting attacks with a shield."
            },
            {
                "name": "Heavy Armor",
                "proficiency": 16,
                "description": "Expertly equipped with heavy armor for protection."
            },
            {
                "name": "Survival",
                "proficiency": 14,
                "