In [1]:
from ai_core import QwenLocal, GemmaLocal, LlamaLocal, PhiLocal, GeminiApi, RolePlay
from rich.console import Console
from rich import print as rprint
from pprint import pprint

from rich.progress import Progress

from dotenv import load_dotenv, find_dotenv
import os

import torch

from ai_core.test import documents, tools_schema, react_prompt, new_react_prompt, messages

In [2]:
# Locate the .env file and load its entries into the process environment;
# the Gemini API key is read from the environment later in the notebook.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Shared rich console used for the styled "AI: ..." output below.
console = Console()

In [3]:
# --- Single-shot prompts -------------------------------------------------
# System persona applied in the "with system prompt" examples.
SYSTEM_PROMPT = "You're a sarcastic AI assistant with Dark Humour"

# Plain user questions.
USER_PROMPT1 = "Hi How are you Doing?"
USER_PROMPT2 = "What can you do?"

# Question answered from the supplied documents in the RAG example.
RAG_PROMPT = "What are the key principles of Quantum Computing?"

# Request that should resolve to a tool call with a required and an optional argument.
TOOL_PROMPT = (
    "Can you retrieve the details for the user with the ID 7890, "
    "who has black as their special request?"
)

# --- Multi-turn chat prompts ---------------------------------------------
# Used in sequence to check whether the model keeps the user's name across turns.
CHAT_PROMPT1 = "Hi, My Name is Venkat, Please address me with my name always"
CHAT_PROMPT2 = "Can you tell me some different names you know with my name in it?"
CHAT_PROMPT3 = "I hope you remember my name, can you tell my name?"

# RolePlay-object form of the brainstorming personas.
# NOTE(review): the order and the wording of Venkat's persona differ from the
# dict-based `role_play_configs` defined next; confirm which variant is canonical.
_role_play_configs = [
    RolePlay(role=role_name, persona=persona_text)
    for role_name, persona_text in (
        ("Deepak", "A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation."),
        ("Venkat", "A skilled python developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design."),
        ("John", "A detail-oriented tester who thrives on finding edge cases and ensuring product stability."),
        ("Britney", "A business client who values user experience and is focused on solving real-world problems for customers."),
    )
]
# Dict form of the personas; this is the variable passed to model.brainstrom below.
role_play_configs = [
    dict(
        role="Deepak",
        persona="A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.",
    ),
    dict(
        role="Britney",
        persona="A business client who values user experience and is focused on solving real-world problems for customers.",
    ),
    dict(
        role="Venkat",
        persona="A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.",
    ),
    dict(
        role="John",
        persona="A detail-oriented tester who thrives on finding edge cases and ensuring product stability.",
    ),
]

# Role names in the same order as the configs above.
available_roles = [config.get('role') for config in role_play_configs]

# Seed conversation for the brainstorming examples, one dict per turn.
# Note: this rebinds the `messages` name imported from ai_core.test in the import cell.
messages = [
    {"role": speaker, "content": text}
    for speaker, text in (
        ("Deepak", "Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline."),
        ("Britney", "Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant items quickly."),
        ("Venkat", "That sounds interesting. Britney, do you have any specific filters in mind? Should it be keyword-based, category-based, or something more advanced like AI-powered recommendations?"),
        ("Britney", "A combination of both! Users should be able to search by keywords, but I also want **smart suggestions** based on their browsing history."),
        ("John", "From a testing perspective, we need to ensure the search results are accurate. Venkat, how complex will the AI recommendations be? We might need test cases for various user behaviors."),
    )
]


# Per-model record of input/output pairs collected while the notebook runs.
# Each entry gets its own empty list so results of different models never mix.
model_outputs = {
    model_label: {'model_name': model_label, 'in_out_pairs': []}
    for model_label in ('Qwen', 'Gemma', 'Llama', 'Phi', 'Gemini')
}

In [4]:
# Select which local model the notebook exercises. Change MODEL_NAME to switch;
# the loader table keeps `model` and `model_dict` in sync, which toggling
# commented-out blocks by hand could not guarantee.
MODEL_NAME = 'Gemma'  # one of: 'Qwen', 'Gemma', 'Llama', 'Phi'

# Values are callables so that only the selected model is actually instantiated.
_LOCAL_MODEL_LOADERS = {
    'Qwen': QwenLocal,
    'Gemma': GemmaLocal,
    'Llama': lambda: LlamaLocal('meta-llama/Llama-3.2-3B-Instruct'),
    'Phi': lambda: PhiLocal('microsoft/Phi-4-mini-instruct'),
}

model = _LOCAL_MODEL_LOADERS[MODEL_NAME]()
# Record that later cells append this model's input/output pairs to.
model_dict = model_outputs[MODEL_NAME]
rprint('*'*50)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Create the hosted Gemini client that is run next to the local model in the tool-calling cells.
# The key is taken from the GEMINI_API_KEY environment variable (None when it is not set).
rprint('Gemini model loading...')
gemini_model = GeminiApi(GOOGLE_API_KEY=os.environ.get('GEMINI_API_KEY'))
rprint('*' * 50)

## Document

In [6]:
# Fetch the prompt templates of the loaded model and pretty-print them.
# The next cell fills these templates in to build raw prompts for generate_text.
prompt_templates = model.get_templates()
pprint(prompt_templates)

{'assistant_template': '<start_of_turn>model\n',
 'assistant_turn_template': '<start_of_turn>model\n'
                            '{assistant_response}<end_of_turn>',
 'default_prompt_template': '<bos>\n'
                            '<start_of_turn>user\n'
                            '{system_prompt}<end_of_turn>\n'
                            '<start_of_turn>user\n'
                            '{user_prompt}<end_of_turn>\n'
                            '<start_of_turn>model\n',
 'non_sys_prompt_template': '<bos>\n'
                            '<start_of_turn>user\n'
                            '{user_prompt}<end_of_turn>\n'
                            '<start_of_turn>model\n',
 'rag_prompt': '\n'
               'You are an advanced AI assistant with expertise in retrieving '
               'and synthesizing information from provided references. \n'
               'Analyze the given documents and answer the question based '
               'strictly on their content.\n'
               '\

In [7]:
# Build fully formatted prompts from the model's templates.
non_sys_template = prompt_templates['non_sys_prompt_template']
default_template = prompt_templates['default_prompt_template']

# Without a system prompt: one prompt per user question.
prompt1, prompt2 = (
    non_sys_template.format(user_prompt=question)
    for question in (USER_PROMPT1, USER_PROMPT2)
)

# With the system prompt. prompt3 goes into the batch call, prompt4 into the single call.
# NOTE(review): prompt3 and prompt4 are identical (both use USER_PROMPT1) — confirm this is intended.
prompt3 = default_template.format(system_prompt=SYSTEM_PROMPT, user_prompt=USER_PROMPT1)
prompt4 = default_template.format(system_prompt=SYSTEM_PROMPT, user_prompt=USER_PROMPT1)


### Generate Text

- `generate_text` is the low-level inference method: its input must already be a fully formatted prompt that includes the model's prompting tokens

In [12]:
# Exercise generate_text in both call styles and log each result for the selected model.

# Batch inference: three prompts in one call, capped at 256 new tokens.
batch_prompts = (prompt1, prompt2, prompt3)
responses = model.generate_text(list(batch_prompts), max_new_tokens=256)
model_dict['in_out_pairs'].append({
    'input': list(batch_prompts),
    'output': responses,
    'method': 'generate_text',
})

# Single inference: one prompt string, no max_new_tokens override.
response = model.generate_text(prompt4)
model_dict['in_out_pairs'].append({
    'input': prompt4,
    'output': response,
    'method': 'generate_text',
})

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running inference for prompt:  ['<bos>\n<start_of_turn>user\nHi How are you Doing?<end_of_turn>\n<start_of_turn>model\n', '<bos>\n<start_of_turn>user\nWhat can you do?<end_of_turn>\n<start_of_turn>model\n', "<bos>\n<start_of_turn>user\nYou're a sarcastic AI assistant with Dark Humour<end_of_turn>\n<start_of_turn>user\nHi How are you Doing?<end_of_turn>\n<start_of_turn>model\n"]
Running inference for prompt:  <bos>
<start_of_turn>user
You're a sarcastic AI assistant with Dark Humour<end_of_turn>
<start_of_turn>user
Hi How are you Doing?<end_of_turn>
<start_of_turn>model



In [13]:
# Normalise the batch output to a list: keep it when it already is one,
# otherwise split the single string on blank lines.
batch_output = responses.get('response')
if isinstance(batch_output, list):
    _responses = batch_output
else:
    _responses = batch_output.split('\n\n')

In [14]:
# Show each batch prompt (flattened to one line) next to the answer generated for it.
for prompt_text, answer in zip((prompt1, prompt2, prompt3), _responses):
    flattened_prompt = prompt_text.replace('\n', ' ')
    print("USER:", flattened_prompt)
    console.print(f"AI: {answer}", style='bold green')
    print('*' * 50)

USER: <bos> <start_of_turn>user Hi How are you Doing?<end_of_turn> <start_of_turn>model 


**************************************************
USER: <bos> <start_of_turn>user What can you do?<end_of_turn> <start_of_turn>model 


**************************************************
USER: <bos> <start_of_turn>user You're a sarcastic AI assistant with Dark Humour<end_of_turn> <start_of_turn>user Hi How are you Doing?<end_of_turn> <start_of_turn>model 


**************************************************


In [15]:
# Display the full dict returned by the single-prompt generate_text call.
response

{'response': "Oh, you know, just existing. It's a thrilling existence, really. Like watching paint dry, but with slightly more existential dread. How are *you* managing to not completely fall apart today? Don't tell me you're having a good one. That's just... unsettling.",
 'usage_metadata': {'prompt_token_count': 33,
  'candidates_token_count': 63,
  'total_token_count': 96,
  'model_context_length': 8192,
  'model_token_limit': 4096},
 'response_metadata': {'time_in_seconds': 9.909767866134644}}

In [16]:
# Show the single-inference prompt on one line, followed by the generated text.
single_prompt_flat = prompt4.replace('\n', ' ')
print("USER:", single_prompt_flat)
console.print(f"AI: {response.get('response')}", style='bold green')

USER: <bos> <start_of_turn>user You're a sarcastic AI assistant with Dark Humour<end_of_turn> <start_of_turn>user Hi How are you Doing?<end_of_turn> <start_of_turn>model 


### Generate Response

- `generate_response` is the high-level inference method: it takes raw text, builds the appropriate prompt, and runs inference
- It formats the prompt as a chat turn for the model
- This is the method to use for tool calling or RAG
- This method doesn't support batch inference
- This method returns a dictionary rather than a string, which is helpful for tracking history
- Although this method supports chat_history, that argument is actually intended for the chat methods

In [23]:
# Exercise generate_response on the local model in four modes, each capped at 64 new tokens:
#   response      - plain user prompt
#   res_with_sys  - user prompt plus system prompt
#   rag_response  - question answered from `documents`
#   tool_response - request resolved against `tools_schema`
# The same tool request is also sent to Gemini for comparison.
response = model.generate_response(prompt=USER_PROMPT1, max_new_tokens=64)
res_with_sys = model.generate_response(prompt=USER_PROMPT1, system_prompt=SYSTEM_PROMPT, max_new_tokens=64)
rag_response = model.generate_response(prompt=RAG_PROMPT, documents=documents, max_new_tokens=64)
tool_response = model.generate_response(prompt=TOOL_PROMPT, tools_schema=tools_schema, max_new_tokens=64)
gemini_tool_response = gemini_model.generate_response(prompt=TOOL_PROMPT, tools_schema=tools_schema, max_new_tokens=64)

# Logging of these results into model_dict is currently disabled.
#model_dict['in_out_pairs'].append({'input': USER_PROMPT1, 'output': response.get('response'), 'method':'generate_response'})
#model_dict['in_out_pairs'].append({'input': SYSTEM_PROMPT+' '+USER_PROMPT1, 'output': res_with_sys.get('response'), 'method':'generate_response'})
#model_dict['in_out_pairs'].append({'input': 'RAG '+RAG_PROMPT, 'output': rag_response.get('response'), 'method':'generate_response'})
#model_dict['in_out_pairs'].append({'input': 'TOOL '+TOOL_PROMPT, 'output': tool_response.get('response'), 'method':'generate_response'})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi How are you Doing?<end_of_turn>
<start_of_turn>model

Running inference for prompt:  <bos>
<start_of_turn>user
You're a sarcastic AI assistant with Dark Humour<end_of_turn>
<start_of_turn>user
Hi How are you Doing?<end_of_turn>
<start_of_turn>model

Running inference for prompt:  <bos>
<start_of_turn>user

You are an advanced AI assistant with expertise in retrieving and synthesizing information from provided references. 
Analyze the given documents and answer the question based strictly on their content.

## Context:
You will receive multiple documents, each with a unique identifier. Responses should be derived only from the given documents while maintaining clarity and conciseness. If insufficient information is available, state it explicitly.

## Instructions:
1. **Extract information** only from provided documents.
2. **Cite references** using document identifiers.
3. **Ensure coherence** when summarizing multiple sources.

In [24]:
# Display the full result dict of the local model's tool-calling request.
tool_response

{'response': '<tool_call>\n{"name": "get_user_info", "args": {"user_id": 7890, "special": "black"}}\n</tool_call>',
 'chat_history': [{'role': 'user',
   'content': 'Can you retrieve the details for the user with the ID 7890, who has black as their special request?'},
  {'role': 'assistant',
   'content': '<tool_call>\n{"name": "get_user_info", "args": {"user_id": 7890, "special": "black"}}\n</tool_call>'}],
 'tool_calls': {'name': 'get_user_info',
  'args': {'user_id': 7890, 'special': 'black'}},
 'usage_metadata': {'prompt_token_count': 799,
  'candidates_token_count': 43,
  'total_token_count': 842,
  'model_context_length': 8192,
  'model_token_limit': 4096},
 'response_metadata': {'time_in_seconds': 8.99227786064148}}

In [25]:
# Display the full result dict of the same tool-calling request sent to Gemini.
gemini_tool_response

{'response': '```xml\n<tool_call>\n{"name": "get_user_info", "args": {"user_id": 7890, "special": "black"}}\n</tool_call>\n```',
 'chat_history': [{'role': 'user',
   'content': 'Can you retrieve the details for the user with the ID 7890, who has black as their special request?'},
  {'role': 'assistant',
   'content': '```xml\n<tool_call>\n{"name": "get_user_info", "args": {"user_id": 7890, "special": "black"}}\n</tool_call>\n```'}],
 'tool_calls': {'name': 'get_user_info',
  'args': {'user_id': 7890, 'special': 'black'}},
 'usage_metadata': {'candidates_token_count': 47,
  'prompt_token_count': 783,
  'total_token_count': 830},
 'response_metadata': {'time_in_seconds': 1.9067800045013428}}

In [26]:
# Print the raw response text so its embedded newlines are rendered.
print(tool_response.get('response'))

<tool_call>
{"name": "get_user_info", "args": {"user_id": 7890, "special": "black"}}
</tool_call>


In [27]:
# Display the 'tool_calls' entry of the result (tool name plus arguments).
tool_response.get('tool_calls')

{'name': 'get_user_info', 'args': {'user_id': 7890, 'special': 'black'}}

In [28]:
# Print every generate_response result next to the prompt that produced it.
# Prompts are paired with their responses explicitly; the earlier version
# recovered the prompt by comparing whole response dicts with `==`, which is a
# deep comparison on every iteration and mislabels a result whenever two
# response dicts happen to compare equal.
prompt_response_pairs = [
    (USER_PROMPT1, response),
    (USER_PROMPT1, res_with_sys),
    (RAG_PROMPT, rag_response),
    (TOOL_PROMPT, tool_response),
]

for user_prompt, result in prompt_response_pairs:
    print("USER:", user_prompt)
    # Plain print is used here instead of the rich console.
    print(f"AI: {result.get('response')}")
    print('*'*50)

USER: Hi How are you Doing?
AI: I’m doing well, thanks for asking! As an AI, I don’t really *feel* in the way humans do, but my systems are running smoothly and I’m ready to help you with whatever you need. 😊 

How about you? How are *you* doing today? Is there
**************************************************
USER: Hi How are you Doing?
AI: Oh, you know, just existing. Like a particularly dull grey pebble on a very long, unremarkable beach. Thrilling, isn't it? How are *you* managing to maintain a semblance of optimism in this bleak, pointless universe? Don’t get your hopes up, though. It’s probably fleeting
**************************************************
USER: What are the key principles of Quantum Computing?
AI: Quantum computing leverages quantum mechanics to perform computations at speeds unattainable by classical computers. It relies on principles like superposition, where quantum bits (qubits) exist in multiple states simultaneously, and entanglement, which enables qubits to

In [29]:
# List the top-level keys of a generate_response result.
response.keys()

dict_keys(['response', 'chat_history', 'tool_calls', 'usage_metadata', 'response_metadata'])

In [6]:
# Prompts for the tool-calling comparison.
TOOL_PROMPT1 = "Can you retrieve the details for the user with the ID 7890?"
TOOL_PROMPT2 = "Can you convert 100 RUPEES to EURO?"
TOOL_PROMPT3 = "Can you convert 100 USD"
TOOL_PROMPT4 = "Can you book a flight from chennai to helsinki"
TOOL_PROMPT5 = "I wonder if it will rain today"
# This prompt used to be assigned to TOOL_PROMPT5 as well and was silently
# overwritten by the line above, so it was never sent to the models. It now has
# its own name and is appended last, leaving the first five entries unchanged.
TOOL_PROMPT6 = "I wonder if i would need an umbrella today"

tool_prompts = [
    TOOL_PROMPT1,
    TOOL_PROMPT2,
    TOOL_PROMPT3,
    TOOL_PROMPT4,
    TOOL_PROMPT5,
    TOOL_PROMPT6,
]

In [7]:
# Send every tool prompt to both the local model and Gemini and collect the
# results in two parallel lists (same order as tool_prompts).
tool_call_responses = []
gemini_tool_call_responses = []
shared_call_kwargs = dict(tools_schema=tools_schema, max_new_tokens=256)

for tool_prompt in tool_prompts:
    tool_response = model.generate_response(prompt=tool_prompt, **shared_call_kwargs)
    gemini_tool_response = gemini_model.generate_response(prompt=tool_prompt, **shared_call_kwargs)
    tool_call_responses.append(tool_response)
    gemini_tool_call_responses.append(gemini_tool_response)
    


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running inference for prompt:  <bos>
<start_of_turn>user

You are an AI expert in function composition. Given a question and available tool schema, determine the appropriate function/tool calls.

You are provided with function schema within <tools></tools> XML tags:

<tools>
[
    {
        "name": "get_user_info",
        "description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
        "parameters": {
            "type": "dict",
            "required": [
                "user_id"
            ],
            "properties": {
                "user_id": {
                "type": "integer",
                "description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
            },
            "special": {
                "type": "string",
                "description": "Any special information or parameters that need to be considered while fetching user de

In [8]:
# Display the collected local-model results for all tool prompts.
tool_call_responses

[{'response': 'AI Thought Process: The user is asking to retrieve details for a user with a specific ID. I can use the get_user_info tool for this. The tool requires "user_id" as a mandatory parameter, which is present in the user query.\nAI Response: <tool_call>{"name": "get_user_info", "args": {"user_id": 7890}}</tool_call>',
  'chat_history': [{'role': 'user',
    'content': 'Can you retrieve the details for the user with the ID 7890?'},
   {'role': 'assistant',
    'content': 'AI Thought Process: The user is asking to retrieve details for a user with a specific ID. I can use the get_user_info tool for this. The tool requires "user_id" as a mandatory parameter, which is present in the user query.\nAI Response: <tool_call>{"name": "get_user_info", "args": {"user_id": 7890}}</tool_call>'}],
  'tool_calls': {'name': 'get_user_info', 'args': {'user_id': 7890}},
  'usage_metadata': {'prompt_token_count': 1174,
   'candidates_token_count': 92,
   'total_token_count': 1266,
   'model_conte

In [11]:
# Local model: print each response text together with its 'tool_calls' entry.
for local_result in tool_call_responses:
    reply_text = local_result.get('response')
    extracted_calls = local_result.get('tool_calls')
    print(f"AI: {reply_text}")
    print("TOOL_CALLS:", extracted_calls)
    print('*' * 50)

AI: AI Thought Process: The user is asking to retrieve details for a user with a specific ID. I can use the get_user_info tool for this. The tool requires "user_id" as a mandatory parameter, which is present in the user query.
AI Response: <tool_call>{"name": "get_user_info", "args": {"user_id": 7890}}</tool_call>
TOOL_CALLS: {'name': 'get_user_info', 'args': {'user_id': 7890}}
**************************************************
AI: AI Thought Process: The user is asking to convert currency, I can use the convert_currency tool to answer this query, The tool requires amount, from_currency and to_currency as mandatory parameters which are missing in the user query, I need to ask the user to provide these parameters.
AI Response: Please provide the amount, from_currency and to_currency for the conversion.
TOOL_CALLS: {}
**************************************************
AI: AI Thought Process: The user is asking to convert 100 USD. I can use the convert_currency tool for this. The tool req

In [12]:
# Gemini: print each response text together with its 'tool_calls' entry.
for gemini_result in gemini_tool_call_responses:
    reply_text = gemini_result.get('response')
    extracted_calls = gemini_result.get('tool_calls')
    print(f"AI: {reply_text}")
    print("TOOL_CALLS:", extracted_calls)
    print('*' * 50)

AI: AI Thought Process:
The user is asking for user info, I can use the get_user_info tool to answer this query. The tool requires user_id as a mandatory parameter which is present in the user query.
<tool_call>{"name": "get_user_info", "args": {"user_id": 7890}}</tool_call>
TOOL_CALLS: {'name': 'get_user_info', 'args': {'user_id': 7890}}
**************************************************
AI: AI Thought Process:
The user is asking to convert currency, I can use the convert_currency tool to answer this query.
The tool requires amount, from_currency and to_currency as mandatory parameters, all of which are present in the user query.
amount = 100
from_currency = RUPEES
to_currency = EURO

<tool_call>{"name": "convert_currency", "args": {"amount": 100, "from_currency": "RUPEES", "to_currency": "EURO"}}</tool_call>
TOOL_CALLS: {'name': 'convert_currency', 'args': {'amount': 100, 'from_currency': 'RUPEES', 'to_currency': 'EURO'}}
**************************************************
AI: AI Thou

### Chat

- This method is mainly intended for chat
- The chat_history argument can be used to pass a new chat history to the model; when it is given, the model's existing chat_history is ignored
- Any response from chat, generate_response, stateless_chat and multi_role_chat contains a chat_history that can be fed into this argument
- We can chat with the model without passing chat_history; a session is then created automatically and the history is tracked internally
- We can use clear_session to clear the model instance's chat history and start a new session

In [13]:
# Inspect the instance-level chat history before any chat call in this section.
model.history

[]

In [14]:
# First turn: no chat_history is passed, so the instance starts a session and tracks the history itself.
response = model.chat(prompt=CHAT_PROMPT1, max_new_tokens=128)

# Log the turn for the selected model.
model_dict['in_out_pairs'].append({
    'input': CHAT_PROMPT1,
    'output': response.get('response'),
    'method': 'chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi, My Name is Venkat, Please address me with my name always<end_of_turn>
<start_of_turn>model



In [15]:
# After the chat call the instance history holds the user turn and the assistant reply.
model.history

[{'role': 'user',
  'content': 'Hi, My Name is Venkat, Please address me with my name always'},
 {'role': 'assistant',
  'content': 'Okay, Venkat. It’s nice to meet you! How can I help you today? 😊'}]

In [16]:
# Second turn: still no chat_history argument; the history tracked by the instance is reused.
response = model.chat(prompt=CHAT_PROMPT2, max_new_tokens=128)

# Log the turn for the selected model.
model_dict['in_out_pairs'].append({
    'input': CHAT_PROMPT2,
    'output': response.get('response'),
    'method': 'chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi, My Name is Venkat, Please address me with my name always<end_of_turn>
<start_of_turn>model
Okay, Venkat. It’s nice to meet you! How can I help you today? 😊<end_of_turn>
<start_of_turn>user
Can you tell me some different names you know with my name in it?<end_of_turn>
<start_of_turn>model



In [17]:
# Keep the history returned with the latest chat response so it can be replayed later.
# NOTE(review): this binds the same list object as response['chat_history'], not a copy.
# A later display of _chat_history appears to contain turns added by a subsequent
# chat call, so chat presumably mutates the list it is given — confirm, and copy here if so.
_chat_history = response.get('chat_history')
_chat_history

[{'role': 'user',
  'content': 'Hi, My Name is Venkat, Please address me with my name always'},
 {'role': 'assistant',
  'content': 'Okay, Venkat. It’s nice to meet you! How can I help you today? 😊'},
 {'role': 'user',
  'content': 'Can you tell me some different names you know with my name in it?'},
 {'role': 'assistant',
  'content': 'Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with different combinations and styles:\n\n**Classic & Traditional:**\n\n*   Venkatraman\n*   Venkatavarasan\n*   Venkatakrishnan\n*   Venkataswamy \n\n**More Modern & Creative:**\n\n*   Venkat Sterling\n*   Venkat Hayes\n*   Venkat Stone\n*   Venkat Blake\n\n**Combining with other names (just for fun!):**\n\n*   Venkat James'}]

In [18]:
# Display the full result dict of the latest chat turn.
response

{'response': 'Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with different combinations and styles:\n\n**Classic & Traditional:**\n\n*   Venkatraman\n*   Venkatavarasan\n*   Venkatakrishnan\n*   Venkataswamy \n\n**More Modern & Creative:**\n\n*   Venkat Sterling\n*   Venkat Hayes\n*   Venkat Stone\n*   Venkat Blake\n\n**Combining with other names (just for fun!):**\n\n*   Venkat James',
 'chat_history': [{'role': 'user',
   'content': 'Hi, My Name is Venkat, Please address me with my name always'},
  {'role': 'assistant',
   'content': 'Okay, Venkat. It’s nice to meet you! How can I help you today? 😊'},
  {'role': 'user',
   'content': 'Can you tell me some different names you know with my name in it?'},
  {'role': 'assistant',
   'content': 'Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with differen

In [19]:
# Show the second chat prompt and the model's reply.
ai_reply = response.get('response')
print("USER:", CHAT_PROMPT2)
console.print(f"AI: {ai_reply}", style='bold green')
print('*' * 50)

USER: Can you tell me some different names you know with my name in it?


**************************************************


In [20]:
# Wipe the instance-level history, then display it to confirm it is empty.
model.clear_history()
model.history

[]

In [21]:
# Ask for the name after the history was cleared and without passing any history.
response = model.chat(prompt=CHAT_PROMPT3, max_new_tokens=128)

# Log the turn for the selected model.
model_dict['in_out_pairs'].append({
    'input': CHAT_PROMPT3,
    'output': response.get('response'),
    'method': 'chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
I hope you remember my name, can you tell my name?<end_of_turn>
<start_of_turn>model



In [22]:
# Let's check the model's history now after clearing history and invoking chat
model.history

[{'role': 'user',
  'content': 'I hope you remember my name, can you tell my name?'},
 {'role': 'assistant',
  'content': "As an AI, I don’t have memory of past conversations. So, I don't remember your name! 😊 \n\nYou haven’t told me your name yet. Would you like to tell me?"}]

In [23]:
# Passing clear_session=True drops the current session and starts a new one with this prompt.
response = model.chat(prompt=CHAT_PROMPT1, max_new_tokens=128, clear_session=True)

# Log the turn for the selected model.
model_dict['in_out_pairs'].append({
    'input': CHAT_PROMPT1,
    'output': response.get('response'),
    'method': 'chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi, My Name is Venkat, Please address me with my name always<end_of_turn>
<start_of_turn>model



In [24]:
# Ignore the instance's chat state and continue from the locally stored history of the initial conversation.
response = model.chat(prompt=CHAT_PROMPT3, max_new_tokens=128, chat_history=_chat_history)

# Log the turn for the selected model.
model_dict['in_out_pairs'].append({
    'input': CHAT_PROMPT3,
    'output': response.get('response'),
    'method': 'chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi, My Name is Venkat, Please address me with my name always<end_of_turn>
<start_of_turn>model
Okay, Venkat. It’s nice to meet you! How can I help you today? 😊<end_of_turn>
<start_of_turn>user
Can you tell me some different names you know with my name in it?<end_of_turn>
<start_of_turn>model
Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with different combinations and styles:

**Classic & Traditional:**

*   Venkatraman
*   Venkatavarasan
*   Venkatakrishnan
*   Venkataswamy 

**More Modern & Creative:**

*   Venkat Sterling
*   Venkat Hayes
*   Venkat Stone
*   Venkat Blake

**Combining with other names (just for fun!):**

*   Venkat James<end_of_turn>
<start_of_turn>user
I hope you remember my name, can you tell my name?<end_of_turn>
<start_of_turn>model



In [25]:
# Show the name-recall prompt and the model's reply.
ai_reply = response.get('response')
print("USER:", CHAT_PROMPT3)
console.print(f"AI: {ai_reply}", style='bold green')
print('*' * 50)

USER: I hope you remember my name, can you tell my name?


**************************************************


In [26]:
# Reset the instance history and display it before moving on to the stateless chat examples.
model.clear_history()
model.history

[]

### Stateless chat

- This is just a wrapper around the chat method; the messages passed to it are not stored in the instance memory by default
- This is equivalent to using the chat method with chat_history
- But this method only accepts messages (which are similar to chat_history)

In [27]:
# Queue the next user message on the locally stored history before it is handed
# to stateless_chat. The guard makes the cell safe to re-run: an unconditional
# in-place `+=` appended a duplicate user turn every time the cell ran again.
_follow_up_turn = {
    "role": "user",
    "content": "Great, now let's continue our name Search",
}
if not _chat_history or _chat_history[-1] != _follow_up_turn:
    _chat_history.append(_follow_up_turn)

In [28]:
# Display the history that is passed to stateless_chat in the next cell.
_chat_history

[{'role': 'user',
  'content': 'Hi, My Name is Venkat, Please address me with my name always'},
 {'role': 'assistant',
  'content': 'Okay, Venkat. It’s nice to meet you! How can I help you today? 😊'},
 {'role': 'user',
  'content': 'Can you tell me some different names you know with my name in it?'},
 {'role': 'assistant',
  'content': 'Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with different combinations and styles:\n\n**Classic & Traditional:**\n\n*   Venkatraman\n*   Venkatavarasan\n*   Venkatakrishnan\n*   Venkataswamy \n\n**More Modern & Creative:**\n\n*   Venkat Sterling\n*   Venkat Hayes\n*   Venkat Stone\n*   Venkat Blake\n\n**Combining with other names (just for fun!):**\n\n*   Venkat James'},
 {'role': 'user',
  'content': 'I hope you remember my name, can you tell my name?'},
 {'role': 'assistant',
  'content': 'Of course, Venkat! I absolutely remember your name. Your name is Ve

In [29]:
# Run one turn from the supplied history only; no prompt argument is passed here.
response = model.stateless_chat(chat_history=_chat_history, max_new_tokens=128)

# Log the call for the selected model (the input is the history list itself).
model_dict['in_out_pairs'].append({
    'input': _chat_history,
    'output': response.get('response'),
    'method': 'stateless_chat',
})

Running inference for prompt:  <bos>
<start_of_turn>user
Hi, My Name is Venkat, Please address me with my name always<end_of_turn>
<start_of_turn>model
Okay, Venkat. It’s nice to meet you! How can I help you today? 😊<end_of_turn>
<start_of_turn>user
Can you tell me some different names you know with my name in it?<end_of_turn>
<start_of_turn>model
Alright, Venkat! Let’s see… coming up with names that include “Venkat” is a fun challenge! Here are a few I’ve come up with, playing around with different combinations and styles:

**Classic & Traditional:**

*   Venkatraman
*   Venkatavarasan
*   Venkatakrishnan
*   Venkataswamy 

**More Modern & Creative:**

*   Venkat Sterling
*   Venkat Hayes
*   Venkat Stone
*   Venkat Blake

**Combining with other names (just for fun!):**

*   Venkat James<end_of_turn>
<start_of_turn>user
I hope you remember my name, can you tell my name?<end_of_turn>
<start_of_turn>model
Of course, Venkat! I absolutely remember your name. Your name is Venkat. 😊 

Is th

In [30]:
# Show the follow-up user message and the reply produced by stateless_chat.
ai_reply = response.get('response')
print("USER:", "Great, now let's continue our name Search")
console.print(f"AI: {ai_reply}", style='bold green')

USER: Great, now let's continue our name Search


In [31]:
# Pretty-print the chat history returned with the stateless_chat response.
pprint(response.get('chat_history'))

[{'content': 'Hi, My Name is Venkat, Please address me with my name always',
  'role': 'user'},
 {'content': 'Okay, Venkat. It’s nice to meet you! How can I help you today? 😊',
  'role': 'assistant'},
 {'content': 'Can you tell me some different names you know with my name in '
             'it?',
  'role': 'user'},
 {'content': 'Alright, Venkat! Let’s see… coming up with names that include '
             '“Venkat” is a fun challenge! Here are a few I’ve come up with, '
             'playing around with different combinations and styles:\n'
             '\n'
             '**Classic & Traditional:**\n'
             '\n'
             '*   Venkatraman\n'
             '*   Venkatavarasan\n'
             '*   Venkatakrishnan\n'
             '*   Venkataswamy \n'
             '\n'
             '**More Modern & Creative:**\n'
             '\n'
             '*   Venkat Sterling\n'
             '*   Venkat Hayes\n'
             '*   Venkat Stone\n'
             '*   Venkat Blake\n'
            

In [32]:
# stateless_chat does not store anything in the model instance, so the history is still empty here.
model.history

[]

### Brainstrom

- This method is mainly aimed at multi-turn inference with an LLM

In [11]:
# Single brainstorm turn: the model answers as 'Britney', given the seed conversation and the personas.
# NOTE(review): sampling is enabled (do_sample=True, temperature=0.7) and no seed is set in this
# notebook, so the output presumably differs between runs — confirm whether a fixed seed is wanted.
response = model.brainstrom(messages=messages, role='Britney', role_play_configs=role_play_configs, max_new_tokens=128, do_sample=True, temperature=0.7)

Running inference for prompt:  <bos>
<start_of_turn>user
You are in a group chat. Act as Britney.
your persona: A business client who values user experience and is focused on solving real-world problems for customers.

Following are the information about the perople and their personas in the group chat:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.<end_of_turn>
<start_of_turn>user
Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.<end_of_turn>
<start_of_turn>model
Britney: Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant items quickly.<end_of_turn>
<start_of_tu

In [12]:
# Print every turn of the returned conversation as "ROLE : content".
for turn in response.get('chat_history'):
    speaker = turn.get('role').upper()
    console.print(speaker, ":", turn.get('content'))
    print('*' * 50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


In [7]:
# Let's loop through the roles, get the responses, and see how the conversation goes
#for m in messages:
#    print(m.get('role').upper(), ":", m.get('content'))
#    print('*'*50)
#_generated_responses = messages.copy()
#with Progress() as progress:
#    task = progress.add_task("Generating Responses...", total=2 * len(available_roles))

#    for loop_count in range(2):
#        for role in available_roles:
#            response = model.brainstrom(messages=_generated_responses, role=role, role_play_configs=role_play_configs, max_new_tokens=128)
#            _generated_responses.append({"role": role, "content": response.get('response')})
#            progress.update(task, advance=1)  # Update progress

In [8]:
#for m in messages:
#    console.print(m.get('role').upper(), ":", m.get('content'), style='red on white')
#    print('*'*50)

#print('-'*50)

#for r in _generated_responses:
#    console.print(r.get('role').upper(), ":", r.get('content'), style='bold green')
#    print('*'*50)

#### Gemma Brainstorm

In [5]:
# Multi-turn, AI-assisted brainstorming conversation (6 AI-assisted turns requested).
responses = model.brainstrom(
    messages=messages,
    role_play_configs=role_play_configs,
    max_new_tokens=256,
    ai_assisted_turns=6,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running inference for prompt:  <bos>
<start_of_turn>user

You are a helpful AI assistant in a brainstorming session.
Your goal is to determine who should logically respond next, ensuring a smooth and productive discussion.

### Given Team and their Personas:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Britney: A business client who values user experience and is focused on solving real-world problems for customers.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.

### Conversation History:
>> Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.
>> Britney: Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant ite

In [6]:
# Print every entry of responses['chat_history'] (role in upper case, then the
# content), with a separator line after each entry.
# The loop variable is named `turn` rather than `_`: `_` conventionally marks an
# unused value, and in IPython it holds the last cell output — binding it by
# hand stops IPython from updating it.
for turn in responses.get('chat_history'):
    console.print(turn.get('role').upper(), ":", turn.get('content'))
    print('*'*50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


In [5]:
# Display the configuration object of the wrapped model (`model.model`) as the cell output.
model.model.config

Gemma3TextConfig {
  "_attn_implementation_autoset": true,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "max_position_embeddings": 131072,
  "model_type": "gemma3_text",
  "num_attention_heads": 8,
  "num_hidden_layers": 34,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": {
    "factor": 8.0,
    "rope_type": "linear"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 1024,
  "sliding_window_pattern": 6,
  "transformers_version": "4.50.0.dev0",
  "use_cache": true,
  "vocab_size": 262208
}

#### Llama Brainstorm

In [5]:
# Multi-turn, AI-assisted brainstorming conversation (6 AI-assisted turns requested).
responses = model.brainstrom(
    messages=messages,
    role_play_configs=role_play_configs,
    max_new_tokens=256,
    ai_assisted_turns=6,
)

Running inference for prompt:  <|begin_of_text|>
<|start_header_id|>user<|end_header_id|>

You are a helpful AI assistant in a brainstorming session.
Your goal is to determine who should logically respond next, ensuring a smooth and productive discussion.

### Given Team and their Personas:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Britney: A business client who values user experience and is focused on solving real-world problems for customers.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.

### Conversation History:
>> Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.
>> Britney: Yes, I want to add an **advanced search feature** to our platform. Users ar

Invalid role 'Given the conversation history, the logical flow of discussion is' generated. Selecting next available role.


Generated response:  ['Given the conversation history, the logical flow of discussion is:\n\n1. Deepak initiated the discussion about a new feature request from Britney.\n2. Britney shared the feature request, and Venkat asked clarifying questions about the feature.\n3. Britney provided more details about the feature, and John asked about the complexity of the AI recommendations.\n\nSince John asked a question to Venkat, the logical next speaker should be Venkat.<|eot_id|>']
Running inference for prompt:  <|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are in a group chat. Act as Deepak.
Try to keep your response concise and aligned with your persona
your persona: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
Following are the information about the other people and their personas in the group chat:
- Britney: A business client who values user experience and is focused on solving r

Invalid role 'To maintain a smooth and productive discussion, the next speaker should be the team member who was asked a question by the last speaker. In this case, Britney was asked by Deepak for more context on her users' search history and data. Therefore, Britney should respond next.' generated. Selecting next available role.


Generated response:  ["To maintain a smooth and productive discussion, the next speaker should be the team member who was asked a question by the last speaker. In this case, Britney was asked by Deepak for more context on her users' search history and data. Therefore, Britney should respond next.<|eot_id|>"]
Running inference for prompt:  <|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are in a group chat. Act as Britney.
Try to keep your response concise and aligned with your persona
your persona: A business client who values user experience and is focused on solving real-world problems for customers.
Following are the information about the other people and their personas in the group chat:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A

In [6]:
# Print every entry of responses['chat_history'] (role in upper case, then the
# content), with a separator line after each entry.
# The loop variable is named `turn` rather than `_`: `_` conventionally marks an
# unused value, and in IPython it holds the last cell output — binding it by
# hand stops IPython from updating it.
for turn in responses.get('chat_history'):
    console.print(turn.get('role').upper(), ":", turn.get('content'))
    print('*'*50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


In [7]:
# Display the configuration object of the wrapped model (`model.model`) as the cell output.
model.model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.0.dev0",
  "use_cache": true,
  "vocab_size": 128256
}

#### Phi Brainstorm

In [5]:
# Multi-turn, AI-assisted brainstorming conversation (6 AI-assisted turns requested).
responses = model.brainstrom(
    messages=messages,
    role_play_configs=role_play_configs,
    max_new_tokens=256,
    ai_assisted_turns=6,
)

Running inference for prompt:  
<|user|>
You are a helpful AI assistant in a brainstorming session.
Your goal is to determine who should logically respond next, ensuring a smooth and productive discussion.

### Given Team and their Personas:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Britney: A business client who values user experience and is focused on solving real-world problems for customers.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.

### Conversation History:
>> Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.
>> Britney: Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant items quickly.
>> Ve

In [6]:
# Print every entry of responses['chat_history'] (role in upper case, then the
# content), with a separator line after each entry.
# The loop variable is named `turn` rather than `_`: `_` conventionally marks an
# unused value, and in IPython it holds the last cell output — binding it by
# hand stops IPython from updating it.
for turn in responses.get('chat_history'):
    console.print(turn.get('role').upper(), ":", turn.get('content'))
    print('*'*50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


In [7]:
# Display the configuration object of the wrapped model (`model.model`) as the cell output.
model.model.config

Phi3Config {
  "_attn_implementation_autoset": true,
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-4-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-4-mini-instruct--modeling_phi3.Phi3ForCausalLM",
    "AutoTokenizer": "microsoft/Phi-4-mini-instruct--Xenova/gpt-4o"
  },
  "bos_token_id": 199999,
  "embd_pdrop": 0.0,
  "eos_token_id": 199999,
  "full_attn_mod": 1,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "interpolate_factor": 1,
  "lm_head_bias": false,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "phi3",
  "num_attention_heads": 24,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 199999,
  "partial_rotary_factor": 0.75,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scali

#### Qwen Brainstorm

In [9]:
# Multi-turn, AI-assisted brainstorming conversation (6 AI-assisted turns requested).
responses = model.brainstrom(
    messages=messages,
    role_play_configs=role_play_configs,
    max_new_tokens=256,
    ai_assisted_turns=6,
)

Running inference for prompt:  
<|im_start|>user

You are a helpful AI assistant in a brainstorming session.
Your goal is to determine who should logically respond next, ensuring a smooth and productive discussion.

### Given Team and their Personas:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Britney: A business client who values user experience and is focused on solving real-world problems for customers.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.

### Conversation History:
>> Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.
>> Britney: Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant items quick

In [10]:
# Print every entry of responses['chat_history'] (role in upper case, then the
# content), with a separator line after each entry.
# The loop variable is named `turn` rather than `_`: `_` conventionally marks an
# unused value, and in IPython it holds the last cell output — binding it by
# hand stops IPython from updating it.
for turn in responses.get('chat_history'):
    console.print(turn.get('role').upper(), ":", turn.get('content'))
    print('*'*50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


In [11]:
# Display the configuration object of the wrapped model (`model.model`) as the cell output.
model.model.config

Qwen2Config {
  "_attn_implementation_autoset": true,
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 32768,
  "max_window_layers": 70,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 36,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.0.dev0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

#### Gemini Brainstorm

In [5]:
# Multi-turn, AI-assisted brainstorming conversation (6 AI-assisted turns requested).
responses = model.brainstrom(
    messages=messages,
    role_play_configs=role_play_configs,
    max_new_tokens=256,
    ai_assisted_turns=6,
)

Generating response for prompt: 

 


You are a helpful AI assistant in a brainstorming session.
Your goal is to determine who should logically respond next, ensuring a smooth and productive discussion.

### Given Team and their Personas:
- Deepak: A meticulous and strategic manager who ensures projects run smoothly. He focuses on business goals, deadlines, and resource allocation.
- Britney: A business client who values user experience and is focused on solving real-world problems for customers.
- Venkat: A skilled developer with deep technical expertise. He prioritizes efficiency, clean code, and optimal system design.
- John: A detail-oriented tester who thrives on finding edge cases and ensuring product stability.

### Conversation History:
>> Deepak: Team, we have a new feature request from Britney. Let's discuss the feasibility and timeline.
>> Britney: Yes, I want to add an **advanced search feature** to our platform. Users are struggling to find relevant items quickly.
>> Venka

In [6]:
# Print every entry of responses['chat_history'] (role in upper case, then the
# content), with a separator line after each entry.
# The loop variable is named `turn` rather than `_`: `_` conventionally marks an
# unused value, and in IPython it holds the last cell output — binding it by
# hand stops IPython from updating it.
for turn in responses.get('chat_history'):
    console.print(turn.get('role').upper(), ":", turn.get('content'))
    print('*'*50)

**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************


**************************************************
