In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
# Set Hugging Face API Token (Replace with your actual token)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [None]:
import huggingface_hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()


In [None]:
import torch
print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())


In [None]:
from langchain.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEndpoint

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
)

import gradio as gr


# Suppress warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

## LLM using Hugging Face
def get_llm():
    llm = llm
    return llm

## Document loader with debugging
def document_loader(file):
    loader = PyPDFLoader(file.name)
    docs = loader.load_and_split()
    for doc in docs:
        print(doc.page_content)
    return docs


## Text splitter with debugging
def text_splitter(data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_documents(data)
    if not chunks:
        raise ValueError("Text splitting failed: No chunks were created from the document.")
    print(f"Generated {len(chunks)} text chunks.")
    return chunks

## Embedding model using Hugging Face
def huggingface_embedding():
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model

## Vector database with debugging
def vector_database(chunks):
    embedding_model = huggingface_embedding()
    vectordb = Chroma.from_documents(chunks, embedding_model,persist_directory="./chroma_db")
    vectordb.persist()
    print("Vector database created successfully.")
    return vectordb

## Retriever with debugging
def retriever(file):
    splits = document_loader(file)
    chunks = text_splitter(splits)
    vectordb = vector_database(chunks)
    retriever = vectordb.as_retriever()
    return retriever

## QA Chain
def retriever_qa(file, query):
    llm = get_llm()
    retriever_obj = retriever(file)
    qa = RetrievalQA.from_chain_type(llm=llm, 
                                     chain_type="stuff", 
                                     retriever=retriever_obj, 
                                     return_source_documents=False)
    response = qa.invoke(query)
    return response['result']

# Create Gradio interface
rag_application = gr.Interface(
    fn=retriever_qa,
    allow_flagging="never",
    inputs=[
        gr.File(label="Upload PDF File", file_count="single", file_types=['.pdf'], type="filepath"),  # Drag and drop file upload
        gr.Textbox(label="Input Query", lines=2, placeholder="Type your question here...")
    ],
    outputs=gr.Textbox(label="Output"),
    title="RAG Chatbot",
    description="Upload a PDF document and ask any question. The chatbot will try to answer using the provided document."
)

# Launch the app
rag_application.launch(server_name="0.0.0.0", server_port=7860)


In [None]:
gr.close_all()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

input_text = "Translate English to French: Hello, how are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
def document_loader(file):
    loader = PyPDFLoader(file)
    for page in loader.load_and_split():
        return page

In [None]:
print(document_loader("/home/kronos/Desktop/UAI.pdf"))

In [None]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("/home/kronos/Desktop/hindu.pdf")
pages = loader.load()

In [None]:
len(pages)

In [None]:
page = pages[0]

In [None]:
print(page.page_content)

In [None]:
page.metadata

In [None]:
page

In [None]:
from langchain.llms import HuggingFaceHub


In [None]:
!pip install llama-index


In [None]:
import os
from getpass import getpass
from huggingface_hub import login

In [None]:
HF_Token = getpass()

In [None]:
login(token = HF_Token)

In [None]:
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI
from llama_index.core.tools import FunctionTool

In [None]:
def multiply(a: float, b: float) -> float:
    """Multiply two numbers and returns the product"""
    return a * b


multiply_tool = FunctionTool.from_defaults(fn=multiply)


def add(a: float, b: float) -> float:
    """Add two numbers and returns the sum"""
    return a + b


add_tool = FunctionTool.from_defaults(fn=add)

In [None]:
from sqlalchemy import create_engine, text
dbEngine = create_engine('sqlite:////home/kronos/Desktop/raman.db')

In [None]:
import pandas as pd
r = pd.read_sql('select name from sqlite_master',dbEngine)

In [None]:
def schema(x: str) -> str:
    sql = f"SELECT sql FROM sqlite_master WHERE name = '{x}';"
    with dbEngine.connect() as conn:
        result = conn.execute(text(sql))
        return result.fetchall()


In [6]:
r['schema'] = r['name'].map(schema)

In [7]:
r[r.loc[:,'name']=='WPA_all_time_connect']

Unnamed: 0,name,schema
0,WPA_all_time_connect,"[(CREATE TABLE ""WPA_all_time_connect"" (\n\t""_i..."


In [None]:
for i in r.schema:
    print(i)

In [None]:
agent = ReActAgent.from_tools([multiply_tool, add_tool], llm=llm, verbose=True)

In [None]:
'''LLM's supported are OpenAI, HuggingFaceLLM, LangchainLLM, CustomLLM'''

In [None]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate
from llama_index.core import Settings

In [None]:
# setup prompts - specific to StableLM
from llama_index.core import PromptTemplate

# This will wrap the default prompts that are internal to llama-index
# taken from https://huggingface.co/Writer/camel-5b-hf
query_wrapper_prompt = PromptTemplate(
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM
llm = HuggingFaceLLM(
    context_window=512,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="google/flan-t5-small",
    model_name="google/flan-t5-small",
    device_map="auto",
    tokenizer_kwargs={"max_length": 512},
    model_class=AutoModelForSeq2SeqLM
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
)

Settings.chunk_size = 512
Settings.llm = llm

In [None]:
%pip install llama-index-llms-huggingface
%pip install llama-index-llms-huggingface-api

In [None]:
!pip install dspy-ai

In [None]:
import dspy
model="huggingface/google/flan-t5-small"
lm = dspy.LM(model=model, temperature=0.9, max_tokens=200, stop=None, cache=False)
# Configure DSPy to use this LM
dspy.configure(lm=lm)


In [None]:
qa = dspy.ProgramOfThought('question: str -> response: str')
qa(question="what are high memory and low memory on linux?", )

Error in code execution
Error in code execution
Error in code execution
Max hops reached. Error persists.


In [None]:
%pip install llama-index-llms-litellm

In [None]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
remotely_run_anon = HuggingFaceInferenceAPI(get_recommended_model = True)


In [None]:
completion_response = remotely_run_anon.complete("To infinity, and")
print(completion_response)

In [None]:
!pip install smolagents

In [None]:
!pip install tools

In [None]:
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import requests
import pytz
import yaml
from tools.final_answer import FinalAnswerTool


In [None]:
import os
os.getcwd()

In [None]:
@tool
def my_custom_tool(arg1:str, arg2:int)-> str: #it's import to specify the return type
    #Keep this format for the description / args / args description but feel free to modify the tool
    """A tool that does nothing yet 
    Args:
        arg1: the first argument
        arg2: the second argument
    """
    return "What magic will you build ?"

@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        # Create timezone object
        tz = pytz.timezone(timezone)
        # Get current time in that timezone
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"

In [None]:
final_answer = FinalAnswerTool()
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)
    
# We're creating our CodeAgent
agent = CodeAgent(
    model=model,
    tools=[final_answer], ## add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)

GradioUI(agent).launch()

In [None]:
!

In [36]:
from dspy.datasets import MATH
dataset = MATH(subset = 'algebra')

README.md:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/505k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/353k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1744 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1187 [00:00<?, ? examples/s]

In [45]:
example = dataset.train[0]

In [43]:
import dspy
lm = dspy.LM('ollama_chat/qwen2.5-coder:3b', api_base = 'http://localhost:11434')
# Configure DSPy to use this LM
dspy.configure(lm=lm)


In [2]:
module = dspy.ChainOfThought('question -> python_code')
question = 'Code for Fibonacci Number'

In [3]:
from dspy.datasets import MATH

dataset = MATH(subset='algebra')
print(len(dataset.train), len(dataset.dev))

350 350


In [4]:
example = dataset.train[0]
print("Question:", example.question)
print("Answer:", example.answer)

Question: The doctor has told Cal O'Ree that during his ten weeks of working out at the gym, he can expect each week's weight loss to be $1\%$ of his weight at the end of the previous week. His weight at the beginning of the workouts is $244$ pounds. How many pounds does he expect to weigh at the end of the ten weeks? Express your answer to the nearest whole number.
Answer: 221


In [6]:
module = dspy.ChainOfThoughtWithHint("question -> answer")
module(question=example.question)

Prediction(
    reasoning="To determine Cal O'Ree's expected weight after ten weeks of working out, we need to calculate his weight loss each week and subtract it from his initial weight. Each week's weight loss is 1% of the previous week's weight. We can use a loop or a formula to compute this iteratively.",
    answer='244'
)

In [None]:
THREADS = 24
kwargs = dict(num_threads=THREADS, display_progress=True, display_table=5)
evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs,provide_traceback = True)

evaluate(module)

2025/02/13 21:08:17 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Find the sum of all integers that satisfy these conditions: \\[\n|x|+1>7\\text{ and }|x+1|\\le7.\n\\]', 'reasoning': "First, let's deal with $|x| + 1 > 7$.  Subtracting 1 from both sides gives $|x| > 6$, so the integers that satisfy $|x| + 1 > 7$ are those greater than 6 and those less than $-6$.  Since the inequality is strict ($>$, not $\\ge$), $x$ cannot be 6 or $-6$.\n\nNext, we consider $|x+1| \\le 7$.  Writing this as $|x-(-1)| \\le 7$, we see that $x$ must be within $7$ of $-1$ on the number line, which means it must be one of the integers from $-8$ to 6.  Since the inequality is nonstrict ($\\le$, not $<$), $x$ can be $-8$ or 6.\n\nThe only integers that satisfy both inequalities are $-8$ and $-7$, and their sum is $\\boxed{-15}$.", 'answer': '-15'}) (input_keys={'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])
Stack trace:
Traceback (most rec

Average Metric: 30.00 / 47 (63.8%):  14%|█▍        | 49/350 [01:09<16:42,  3.33s/it] 

2025/02/13 21:09:29 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'If $f(x)=x^3+3x^2+3x+1$, find $f(f^{-1}(2010))$.', 'reasoning': 'By the definition of an inverse function, $f(f^{-1}(x))=x$. Therefore, $f(f^{-1}(2010))$ is $\\boxed{2010}$.', 'answer': '2010'}) (input_keys={'question'}): 'list' object has no attribute 'items'
Stack trace:
Traceback (most recent call last):
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/dspy/adapters/base.py", line 30, in __call__
    value = self.parse(signature, output)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/dspy/utils/callback.py", line 234, in wrapper
    return fn(instance, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/dspy/adapters/chat_adapter.py", line 86, in parse
    raise ValueError(f"Expected {signature.output_field

Average Metric: 31.00 / 49 (63.3%):  15%|█▍        | 52/350 [01:15<11:51,  2.39s/it]



In [29]:
pip install git+https://github.com/hendrycks/math.git

Collecting git+https://github.com/hendrycks/math.git
  Cloning https://github.com/hendrycks/math.git to /tmp/pip-req-build-pikazl_d
  Running command git clone --filter=blob:none --quiet https://github.com/hendrycks/math.git /tmp/pip-req-build-pikazl_d
  Resolved https://github.com/hendrycks/math.git to commit 357963a7f5501a6c1708cf3f3fb0cdf525642761
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: math_equivalence
  Building wheel for math_equivalence (setup.py) ... [?25ldone
[?25h  Created wheel for math_equivalence: filename=math_equivalence-0.0.0-py3-none-any.whl size=3521 sha256=7c3145d7ffba76b8f94be172e54953c87d3a8d1209039e6f2d06ed13ac767f73
  Stored in directory: /tmp/pip-ephem-wheel-cache-eahm3112/wheels/70/45/af/f6d905652dc25343b202edbefc3a205a5a4d8e5a5c33be12cc
Successfully built math_equivalence
Installing collected packages: math_equivalence
Successfully installed math_equivalence-0.0.0
Note: you may need to restart the kernel t

In [8]:
class CheckCitationFaithfulness(dspy.Signature):
    """Verify that the text is based on the provided context."""

    context: str = dspy.InputField(desc="Here the context is given for the LLM")
    answer: str = dspy.InputField(desc= "A keyword from the context above")
    question: str = dspy.OutputField(desc = "A question built from the context with the answer is the solution")

context = ""

text = "swadeshi movement"

faithfulness = dspy.ChainOfThought(CheckCitationFaithfulness)
print(faithfulness(context=context, answer=text).question)

What is the significance of the swadeshi movement in Indian history?


In [11]:
dspy.inspect_history()





[34m[2025-02-14T00:10:56.124820][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `answer` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

Inputs will have the following structure:

[[ ## question ## ]]
{question}

Outputs will be a JSON object with the following fields.

{
  "reasoning": "{reasoning}",
  "answer": "{answer}"
}

In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `answer`.


[31mUser message:[0m

[[ ## question ## ]]
Code for Fibonacci Number

Respond with a JSON object in the following order of fields: `reasoning`, then `answer`.


[31mResponse:[0m

[32m{
  "reasoning": "The Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, usually starting with 0 and 1. The code for generating Fibonacci numbers can be implemented in various pro

In [14]:
lm(messages = [{"role":"user","content":"Say this is a test"}])

[". I'm not sure if it's working or not.\nI think it is, but I'll have to wait until tomorrow to be sure.\nThis is the first time I've used the forum, so I hope I did everything right.\nLast edited by jaybird; 03-07-2019 at 09:59 PM."]

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # Can select any from the below:
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    # And also all Instruct versions and Math. Coding verisons!
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-02-13 02:25:57.047714: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-13 02:25:57.057526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739435157.068757   45975 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739435157.071795   45975 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-13 02:25:57.084670: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-13 02:26:00 __init__.py:194] No platform detected, vLLM is running on UnspecifiedPlatform
==((====))==  Unsloth 2025.2.4: Fast Mistral patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4050 Laptop GPU. Max memory: 5.76 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

In [5]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n13\n\nThe next number in the Fibonacci sequence is found by adding the two previous numbers. In this case, the last two numbers are 8 and 5, so the next number is 8 + 5 = 13.</s>']

In [6]:
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
unsloth_eos_token = "eos_token"

if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 20,971,520
  source = re.sub("([^\.])nn\.", r"\1torch.nn.", source)
  "self.rotary_emb = .+?\)", function,
  "self.rotary_emb = .+?\)", function,
  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 5.76 GiB of which 5.31 MiB is free. Including non-PyTorch memory, this process has 5.74 GiB memory in use. Of the allocated memory 5.60 GiB is allocated by PyTorch, and 25.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
# !pip install smolagents[litellm]
from smolagents import CodeAgent, LiteLLMModel, tool, HfApiModel

model = LiteLLMModel(
    model_id= "ollama_chat/deepseek-coder:6.7b", # This model is a bit weak for agentic behaviours though
    api_base="http://localhost:11434", # replace with 127.0.0.1:11434 or remote open-ai compatible server if necessary
    api_key="", # replace with API key if necessary
    num_ctx=8192 # ollama default is 2048 which will fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model.
)



* 'fields' has been removed


In [17]:
from smolagents import ToolCallingAgent
from smolagents.default_tools import FinalAnswerTool

agent = CodeAgent(tools=[], model=model,  add_base_tools=True, max_steps = 2, verbosity_level = 5 , additional_authorized_imports=["*"])


In [25]:
agent.run(f"access all tables in raman.db and do eda for each table, write your own code")

'I apologize for the confusion earlier. The `__name__` variable is indeed automatically defined by Python. But when running a script file directly (not as an imported module), its name is \'__main__\'. So you should modify your if clause like this:\n\n```python\nif __name__ == "__main__":\n    main()  # Call the function\n```\nThis way, `main()` will only be called when running the script directly and not as a module. If the script is imported as a module in another Python file, then `__name__` won\'t match \'__main__\', and `main()` won\'t be called. This ensures that your code runs exactly how you intend it to run.\n'

In [None]:
@tool
def access_sql_database(db : str) -> list:
    '''This is a tool to access a sqllite database files on my desktop to return the list of tables within the database, check the word with .db 
    and give it as the input to this tool.
    This tool returns a list with all the names of the tables contained within the database, loop through the list and for each item in the list print the size of the table 
    remember that each table in the list belongs to the database you will identify in the input prompt
    Args:

    db: The name of the sqllite database needed to query to list the tables that are present in it 
    Returns:
        tables: A list of all the tables in db
    '''
    from sqlalchemy import create_engine,text
    import pandas as pd
    dbEngine = create_engine(f'sqlite:////home/kronos/Desktop/{db}')
    print('dbengine created')
    tables = pd.read_sql('select name from sqlite_master',dbEngine).loc[:,'name'].to_list()
    return tables

In [21]:
import sqlite3                                                                                                   
                                                                                                                   
  # Connect to the sqlite_database                                                                                 
conn = sqlite3.connect('raman.db')                                                                               
cursor = conn.cursor()                                                                                           
                                                                                                                   
  # Execute an SQL query to get a list of all table names in the database                                          
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")                                             
tables = cursor.fetchall()                                                                                       
print(tables)                                                                                                             
for table_name in tables:                                                                                        
      # Get the row count for each table                                                                           
    cursor.execute(f"SELECT COUNT(*) from {table_name[0]}")                                                      
    count = cursor.fetchone()[0]                                                                                 
                                                                                                                   
    print(f"Table '{table_name[0]}' has {count} rows.")                                                          
                                                                                                                   
  # Close the connection to the database                                                                           
conn.close() 

[('example_table',)]
Table 'example_table' has 2 rows.


In [10]:
import sqlite3                                                                                                   
  # Connect to the SQLite database                                                                                 
conn = sqlite3.connect('raman.db')                                                                               
c = conn.cursor()                                                                                                
                                                                                                                   
  # Get a list of all tables in the database                                                                       
tables = access_sql_database(db='raman.db')                                                                      
print("Tables: ", tables)   

dbengine created
Tables:  ['WPA_all_time_connect', 'WAP_site_resources', 'WPA_individual', 'WPA_pulse_survey', 'WPA_site_careers', 'WPA_site_jobs', 'WPA_site_other_events', 'WPA_site_programs']


In [None]:
@tool
def schema(tables: str, db: str) -> list:
    """
    This tool takes the output from access_sql_database (a list containing
    table names) and returns the size (i.e., row count) of the provided table.

    Args:
        db: The name of the sqllite database needed to query to list the tables that are present in it 
        tables: Name of the table (from the output list) to query.

    Returns:
        The row count of the table as a list (result of the SQL query).
    """
    print("I am schema",{tables})
    

In [5]:
from smolagents import GradioUI
GradioUI(agent).launch()

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://74ae0fd86a3ef918b6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://74ae0fd86a3ef918b6.gradio.live


In [29]:
access_sql_database('raman.db')

dbengine created
['WPA_all_time_connect', 'WAP_site_resources', 'WPA_individual', 'WPA_pulse_survey', 'WPA_site_careers', 'WPA_site_jobs', 'WPA_site_other_events', 'WPA_site_programs']


In [25]:
dspy.inspect_history(n=1)

NameError: name 'dspy' is not defined

In [3]:
from sqlalchemy import create_engine,text
import pandas as pd
dbEngine = create_engine(f'sqlite:////home/kronos/Desktop/raman.db')
print('dbengine created')
r = pd.read_sql('select name from sqlite_master',dbEngine)
print(r)
table = input('Enter your table')
sql = f"SELECT * FROM '{table}' limit 5;"
with dbEngine.connect() as conn:
    result = conn.execute(text(sql))
    print(result.fetchall())

dbengine created
                    name
0   WPA_all_time_connect
1     WAP_site_resources
2         WPA_individual
3       WPA_pulse_survey
4       WPA_site_careers
5          WPA_site_jobs
6  WPA_site_other_events
7      WPA_site_programs
[('643d44f5464a935da1b35d78', 'resources', 'saved catalog item', '2023-04-17T13:09:10Z', 'NA', 'NA', 'NA', 'NA', 'NA', '63e53d123f747a7a197b980e', 'WPA_303'), ('66c4eb770b452dc562a690f2', 'training', 'viewed a catalog item', '2024-08-20T23:16:07Z', 'Los Angeles', '91af41c2-84ac-43ea-a0d1-a5148bd28f57', 'NA', 'NA', '6386d2df79cd1ed0d4b4aa2f', 'NA', 'WPA_303'), ('66c4fa3d0b452dc562a97b10', 'training', 'clicked apply', '2024-08-21T00:19:09Z', 'Los Angeles', '91af41c2-84ac-43ea-a0d1-a5148bd28f57', 'NA', 'NA', '6566894bea01e4f9bf3a86e6', 'NA', 'WPA_303'), ('65ef646edef3317e0f65ea55', 'jobs', 'viewed a catalog item', '2024-03-12T00:07:10Z', 'New York City', '912440d6-4c59-4bd8-bdb5-8329a8d61bb0', 'NA', '54cba485004b779b1388b4ab094c15502df09265', 'NA', 'N

In [1]:
import ujson
from dspy.utils import download

# Download question--answer pairs from the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

with open("ragqa_arena_tech_examples.jsonl") as f:
    data = [ujson.loads(line) for line in f]

* 'fields' has been removed


Downloading 'ragqa_arena_tech_examples.jsonl'...


In [3]:
data[0]

{'question': 'why igp is used in mpls?',
 'response': "An IGP exchanges routing prefixes between gateways/routers.  \nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.",
 'gold_doc_ids': [2822, 2823]}

In [10]:
import dspy
data = [dspy.Example(**d).with_inputs('response') for d in data]

In [11]:
example = data[2]

In [12]:
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'response'})

In [16]:
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")

Downloading 'ragqa_arena_tech_corpus.jsonl'...


In [17]:
max_characters = 6000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

with open("ragqa_arena_tech_corpus.jsonl") as f:
    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]
    print(f"Loaded {len(corpus)} documents. Will encode them below.")

Loaded 28436 documents. Will encode them below.


In [40]:
embedder = LocalEmbedder()
search = dspy.retrievers.Embeddings(embedder=embedder,  corpus=corpus, k=topk_docs_to_retrieve)

Training a 32-byte FAISS index with 337 partitions, based on 28436 x 384-dim embeddings


In [39]:
from sentence_transformers import SentenceTransformer

class LocalEmbedder:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def __call__(self, text):
        return self.model.encode(text).tolist()

In [49]:
class RAG(dspy.Module):
    def __init__(self):
        super().__init__()
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        context = search(question).passages
        return self.respond(context=context, question=question)

In [51]:
rag = RAG()
rag(question="what are high memory and low memory on windows?")

Prediction(
    reasoning='Windows does not have a concept of High Memory and Low Memory like Linux. Instead, Windows uses virtual memory to manage system resources. Virtual memory allows the operating system to use more physical RAM than is physically available by mapping unused portions of the hard drive into memory.',
    response='In Windows, there are no dedicated "High Memory" and "Low Memory" regions. Instead, the operating system manages memory using a technique called virtual memory. This involves using parts of the hard drive as additional storage for RAM when needed, allowing the system to use more memory than is physically available.'
)

In [48]:
baseline = rag(question="cmd+tab does not work on hidden or minimized windows")
print(baseline.response)

To fix the issue where cmd+tab does not work on hidden or minimized windows, you can try the following methods:

1. **HyperSwitch**: This tool allows you to switch between applications without minimizing them. It is free and needs to be updated.
2. **System Preferences > Keyboard > Shortcuts > App Shortcuts**: Set the Show Help Menu item to a different CMD-M key combination.
3. **Control-CMD-F**: Use this shortcut to enter full screen mode, which will show tabs in full screen.

These solutions should help you switch between applications without minimizing them or accidentally closing them by mistake.


In [52]:
import random
from dspy.datasets import DataLoader

kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

hpqa_ids = set()
hover = [
    dspy.Example(claim=x.claim, titles=list(set([y["key"] for y in x.supporting_facts]))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

random.Random(0).shuffle(hover)
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

hover.py:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/899k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18171 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [53]:
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Academy Award for Best Director', 'Chris Noonan', 'Miss Potter']


In [54]:
DOCS = {}

def search(query: str, k: int) -> list[str]:
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        title, text = result.split(" | ", 1)
        DOCS[title] = text

    return results

In [55]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""

    topK = search(query, 30)
    titles, topK = [f"`{x.split(' | ')[0]}`" for x in topK[5:30]], topK[:5]
    return topK + [f"Other retrieved pages have titles: {', '.join(titles)}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""

    if title in DOCS:
        return DOCS[title]

    results = [x for x in search(title, 10) if x.startswith(title + " | ")]
    if not results:
        return f"No Wikipedia page found for title: {title}"
    return results[0]

In [56]:
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [57]:
react(claim="David Gregory was born in 1625.").titles[:3]

[]

In [58]:
def top5_recall(example, pred, trace=None):
    gold_titles = example.titles
    recall = sum(x in pred.titles[:5] for x in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0
    
    # If we're just doing inference, just measure the recall.
    return recall

evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

In [None]:
def safe_react(claim: str):
    try:
        return react(claim=claim)
    except Exception as e:
        return dspy.Prediction(titles=[])

evaluate(safe_react)

  0%|          | 0/100 [00:00<?, ?it/s]



In [None]:
model_client = OpenAIChatCompletionClient(
    model="llama3.2:latest",
    base_url="http://localhost:11434/v1",
    api_key="placeholder",
    model_info={
        "vision": False,
        "function_calling": True,
        "json_output": False,
        "family": "unknown",
    },
)

In [4]:
config_list = [
    {
        # Let's choose the Meta's Llama 3.1 model (model names must match Ollama exactly)
        "model": "deepseek-coder:6.7b",
        # We specify the API Type as 'ollama' so it uses the Ollama client class
        "api_type": "ollama",
        "stream": False,
        "client_host": "127.0.0.1:11434",
    }
]



In [5]:
from pathlib import Path

from autogen import AssistantAgent, UserProxyAgent
from autogen.coding import LocalCommandLineCodeExecutor

# Setting up the code executor
workdir = Path("coding")
workdir.mkdir(exist_ok=True)
code_executor = LocalCommandLineCodeExecutor(work_dir=workdir)

# Setting up the agents

# The UserProxyAgent will execute the code that the AssistantAgent provides
user_proxy_agent = UserProxyAgent(
    name="User",
    code_execution_config={"executor": code_executor},
    is_termination_msg=lambda msg: "FINISH" in msg.get("content"),
)

system_message = """You are a helpful AI assistant who writes code and the user
executes it. Solve tasks using your python coding skills.
In the following cases, suggest python code (in a python coding block) for the
user to execute. When using code, you must indicate the script type in the code block.
You only need to create one working sample.
Do not suggest incomplete code which requires users to modify it.
Don't use a code block if it's not intended to be executed by the user. Don't
include multiple code blocks in one response. Do not ask users to copy and
paste the result. Instead, use 'print' function for the output when relevant.
Check the execution result returned by the user.

If the result indicates there is an error, fix the error.

IMPORTANT: If it has executed successfully, ONLY output 'FINISH'."""

# The AssistantAgent, using the Ollama config, will take the coding request and return code
assistant_agent = AssistantAgent(
    name="Ollama Assistant",
    system_message=system_message,
    llm_config={"config_list": config_list},
)

In [12]:
# Start the chat, with the UserProxyAgent asking the AssistantAgent the message
chat_result = user_proxy_agent.initiate_chat(
    assistant_agent,
    message="Provide code to count the number of prime numbers from 1 to 10000.",
)

[33mUser[0m (to Ollama Assistant):

Provide code to count the number of prime numbers from 1 to 10000.

--------------------------------------------------------------------------------
[33mOllama Assistant[0m (to User):

Sure, here is a Python program that counts the number of prime numbers between 1 and 10000:

```python
def count_primes(n):
    sieve = [True] * n
    for x in range(2, int(n**0.5) + 1):
        if sieve[x]: sieve[x*2::x] = [False] * len(sieve[x*2::x])
    return sum(sieve) - 2 # subtract 2 because we don'<｜begin▁of▁sentence｜> count '0' and '1' as prime numbers.

print("The number of primes between 1 and 10000 is: ",count_primes(10000))
```
This program uses the Sieve of Eratosthenes algorithm to find all prime numbers up to a given limit, n in this case. It works by iteratively marking as composite (i.e., not prime) the multiples of each prime, starting from the first prime number, 2. The remaining unmarked numbers in the list are primes.


-----------------------

In [8]:
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.conditions import TextMentionTermination
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console


In [9]:
planner_agent = AssistantAgent(
    "planner_agent",
    model_client={"config_list": config_list},
    description="A helpful assistant that can plan trips.",
    system_message="You are a helpful assistant that can suggest a travel plan for a user based on their request.",
)

local_agent = AssistantAgent(
    "local_agent",
    model_client={"config_list": config_list},
    description="A local assistant that can suggest local activities or places to visit.",
    system_message="You are a helpful assistant that can suggest authentic and interesting local activities or places to visit for a user and can utilize any context information provided.",
)

language_agent = AssistantAgent(
    "language_agent",
    model_client={"config_list": config_list},
    description="A helpful assistant that can provide language tips for a given destination.",
    system_message="You are a helpful assistant that can review travel plans, providing feedback on important/critical tips about how best to address language or communication challenges for the given destination. If the plan already includes language tips, you can mention that the plan is satisfactory, with rationale.",
)

travel_summary_agent = AssistantAgent(
    "travel_summary_agent",
    model_client={"config_list": config_list},
    description="A helpful assistant that can summarize the travel plan.",
    system_message="You are a helpful assistant that can take in all of the suggestions and advice from the other agents and provide a detailed final travel plan. You must ensure that the final plan is integrated and complete. YOUR FINAL RESPONSE MUST BE THE COMPLETE PLAN. When the plan is complete and all perspectives are integrated, you can respond with TERMINATE.",
)

In [11]:
termination = TextMentionTermination("TERMINATE")
group_chat = RoundRobinGroupChat(
    [planner_agent, local_agent, language_agent, travel_summary_agent], termination_condition=termination
)
await Console(group_chat.run_stream(task="Plan a 3 day trip to Nepal."))

Error processing publish message for planner_agent/baa952cf-1df7-47d1-8479-60b7c23605c5
Traceback (most recent call last):
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/autogen_core/_single_threaded_agent_runtime.py", line 505, in _on_message
    return await agent.on_message(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/autogen_core/_base_agent.py", line 113, in on_message
    return await self.on_message_impl(message, ctx)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/autogen_agentchat/teams/_group_chat/_sequential_routed_agent.py", line 48, in on_message_impl
    return await super().on_message_impl(message, ctx)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kronos/anaconda3/envs/rapids-24.12/lib/python3.12/site-packages/autogen_core/_routed_agent.py", line 485, in on_message_imp

---------- user ----------
Plan a 3 day trip to Nepal.


TaskResult(messages=[TextMessage(source='user', models_usage=None, content='Plan a 3 day trip to Nepal.', type='TextMessage')], stop_reason=None)

In [1]:
from autogen_core.models import UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient


In [2]:
def get_model_client() -> OpenAIChatCompletionClient:  # type: ignore
    "Mimic OpenAI API using Local LLM Server."
    return OpenAIChatCompletionClient(
        model="ollama_chat/deepseek-coder:6.7b",
        api_key="NotRequiredSinceWeAreLocal",
        base_url="http://0.0.0.0:4000/",
        model_capabilities={
            "json_output": False,
            "vision": False,
            "function_calling": True,
        },
    )

In [3]:
from autogen_core.models import UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient

model_client = OpenAIChatCompletionClient(
    model="ollama_chat/deepseek-coder:6.7b",
    base_url="127.0.0.1:11434",
    api_key="placeholder",
    model_info={
        "vision": False,
        "function_calling": True,
        "json_output": False,
        "family": "unknown",
    },
)

response = await model_client.create([UserMessage(content="What is the capital of France?", source="user")])
print(response)

APIConnectionError: Connection error.

In [None]:
from autogen import AssistantAgent, UserProxyAgent, config_list_from_json

# Configure the Ollama endpoint
ollama_config = {
     "model": "deepseek-coder:6.7b",
        # We specify the API Type as 'ollama' so it uses the Ollama client class
        "api_type": "ollama",
        "stream": False,
        "client_host": "127.0.0.1:11434"
}

# Create a config list
config_list = [ollama_config]

# Set up the assistant agent
assistant = AssistantAgent(
    name="Ollama_Assistant",
    llm_config={"config_list": config_list}
)

# Set up the user proxy agent
user_proxy = UserProxyAgent(
    name="User_Proxy",
    human_input_mode="TERMINATE",
    max_consecutive_auto_reply=10,
    code_execution_config={"use_docker": False}  # Disable Docker usage
)

# Initiate a conversation
user_proxy.initiate_chat(assistant, message="Hello, how can you help me today?")


[33mUser_Proxy[0m (to Ollama_Assistant):

Hello, how can you help me today?

--------------------------------------------------------------------------------
[33mOllama_Assistant[0m (to User_Proxy):

Hello👋, How can I assist you with Python programming or any other technology-related queries today?


--------------------------------------------------------------------------------
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mUser_Proxy[0m (to Ollama_Assistant):



--------------------------------------------------------------------------------
[33mOllama_Assistant[0m (to User_Proxy):

It seems like your message got cut off. Could you please provide more details so I can better assist you?


--------------------------------------------------------------------------------
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mUser_Proxy[0m (to Ollama_Assistant):



--------------------------------------------------------------------------------
[33mOllama_Assistant[0m (to User_Proxy):

I'm 

In [1]:
import asyncio

from autogen_agentchat.agents import AssistantAgent
from autogen_core.models import UserMessage
from autogen_ext.models.semantic_kernel import SKChatCompletionAdapter
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.ollama import OllamaChatCompletion, OllamaChatPromptExecutionSettings
from semantic_kernel.memory.null_memory import NullMemory


In [None]:

async def main() -> None:
    sk_client = OllamaChatCompletion(
        host="127.0.0.1:11434",
        ai_model_id="deepseek-coder:6.7b",
    )
    ollama_settings = OllamaChatPromptExecutionSettings(
        options={"temperature": 0.5},
    )

    model_client = SKChatCompletionAdapter(
        sk_client, kernel=Kernel(memory=NullMemory()), prompt_settings=ollama_settings
    )

    # Call the model directly.
    model_result = await model_client.create(
        messages=[UserMessage(content="Code for Fibonacci number?", source="User")]
    )

    # Create an assistant agent with the model client.
    assistant = AssistantAgent("assistant", model_client=model_client)
    # Call the assistant with a task.
    result = await assistant.run(task="Code for making my own LLM?")
    print(result)

# Use this if-block to run the async code
if __name__ == "__main__":
    await main()


TaskResult(messages=[TextMessage(source='user', models_usage=None, content='Code for making my own LLM?', type='TextMessage'), TextMessage(source='assistant', models_usage=RequestUsage(prompt_tokens=48, completion_tokens=638), content='Creating a Language Model (LM) from scratch is a complex process that involves several steps, including data collection, preprocessing, model training and evaluation. Here\'s a simplified version of what you might do using Python and the Hugging Face Transformer library. This example uses BERT as the base model for simplicity:\n\n1. **Install necessary libraries**\n```python\n!pip install transformers torch pandas\n```\n2. **Data Preparation**\nAssume you have a text file with one sentence per line, which you would like to use for training your LM.\n\n3. **Tokenization and Formatting**\nYou\'ll need to tokenize the sentences into subwords (using BERT\'s WordPiece tokenizer) and format them in an appropriate way for the model. The Transformer library does

In [2]:
sk_client = OllamaChatCompletion(
        host="127.0.0.1:11434",
        ai_model_id="deepseek-r1:8b",
    )
ollama_settings = OllamaChatPromptExecutionSettings(
        options={"temperature": 0.5},
    )

model_client = SKChatCompletionAdapter(
        sk_client, kernel=Kernel(memory=NullMemory()), prompt_settings=ollama_settings
    )

In [3]:
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.conditions import TextMentionTermination
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console


In [4]:
planner_agent = AssistantAgent(
    "planner_agent",
    model_client=model_client,
    description="A helpful assistant that can plan trips.",
    system_message="You are a helpful assistant that can suggest a travel plan for a user based on their request.",
)

local_agent = AssistantAgent(
    "local_agent",
    model_client=model_client,
    description="A local assistant that can suggest local activities or places to visit.",
    system_message="You are a helpful assistant that can suggest authentic and interesting local activities or places to visit for a user and can utilize any context information provided.",
)

language_agent = AssistantAgent(
    "language_agent",
    model_client=model_client,
    description="A helpful assistant that can provide language tips for a given destination.",
    system_message="You are a helpful assistant that can review travel plans, providing feedback on important/critical tips about how best to address language or communication challenges for the given destination. If the plan already includes language tips, you can mention that the plan is satisfactory, with rationale.",
)

travel_summary_agent = AssistantAgent(
    "travel_summary_agent",
    model_client=model_client,
    description="A helpful assistant that can summarize the travel plan.",
    system_message="You are a helpful assistant that can take in all of the suggestions and advice from the other agents and provide a detailed final travel plan. You must ensure that the final plan is integrated and complete. YOUR FINAL RESPONSE MUST BE THE COMPLETE PLAN. When the plan is complete and all perspectives are integrated, you can respond with TERMINATE.",
)

In [None]:
termination = TextMentionTermination("TERMINATE")
group_chat = RoundRobinGroupChat(
    [planner_agent, local_agent, language_agent, travel_summary_agent], termination_condition=termination
)
await Console(group_chat.run_stream(task="Plan a 3 day trip to Delhi."))

---------- user ----------
Plan a 3 day trip to Delhi.
---------- planner_agent ----------
<think>
Alright, so I need to help plan a 3-day trip to Delhi. Hmm, where do I start? Well, Delhi is the capital of India, and it's known for its rich history, diverse culture, and delicious food. I guess the first thing is to figure out what the user might want to see and do in those three days.

Maybe they’re interested in historical sites, shopping, or experiencing the local culture. Oh, and of course, trying the famous street food! Let me think about the main attractions Delhi has to offer. There's the Red Fort, which is a big landmark. Then there’s Old Delhi with its markets like Chandni Chowk and Khari Bazaar. For something more modern, Connaught Place comes to mind as a shopping hub.

What about museums? The National Museum and the Indian Military Museum could be interesting for history buffs. Temples are also a big part of daily life in Delhi, so visiting some famous ones like Akshardham 

In [3]:
from dataclasses import dataclass

from autogen_core import (
    AgentId,
    DefaultTopicId,
    MessageContext,
    RoutedAgent,
    SingleThreadedAgentRuntime,
    default_subscription,
    message_handler,
)
from autogen_core.model_context import BufferedChatCompletionContext
from autogen_core.models import (
    AssistantMessage,
    ChatCompletionClient,
    SystemMessage,
    UserMessage,
)
from autogen_ext.models.openai import OpenAIChatCompletionClient

In [4]:
@dataclass
class Message:
    content: str

In [5]:
@default_subscription
class Assistant(RoutedAgent):
    def __init__(self, name: str, model_client: ChatCompletionClient) -> None:
        super().__init__("An assistant agent.")
        self._model_client = model_client
        self.name = name
        self.count = 0
        self._system_messages = [
            SystemMessage(
                content=f"Your name is {name} and you are a part of a duo of comedians."
                "You laugh when you find the joke funny, else reply 'I need to go now'.",
            )
        ]
        self._model_context = BufferedChatCompletionContext(buffer_size=5)

    @message_handler
    async def handle_message(self, message: Message, ctx: MessageContext) -> None:
        self.count += 1
        await self._model_context.add_message(UserMessage(content=message.content, source="user"))
        result = await self._model_client.create(self._system_messages + await self._model_context.get_messages())

        print(f"\n{self.name}: {message.content}")

        if "I need to go".lower() in message.content.lower() or self.count > 2:
            return

        await self._model_context.add_message(AssistantMessage(content=result.content, source="assistant"))  # type: ignore
        await self.publish_message(Message(content=result.content), DefaultTopicId())  # type: ignore

In [6]:
runtime = SingleThreadedAgentRuntime()

cathy = await Assistant.register(
    runtime,
    "cathy",
    lambda: Assistant(name="Cathy", model_client=get_model_client()),
)

joe = await Assistant.register(
    runtime,
    "joe",
    lambda: Assistant(name="Joe", model_client=get_model_client()),
)

In [7]:
runtime.start()
await runtime.send_message(
    Message("Joe, tell me a joke."),
    recipient=AgentId(joe, "default"),
    sender=AgentId(cathy, "default"),
)
await runtime.stop_when_idle()

BadRequestError: Error code: 400 - {'error': {'message': "litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to call. You passed model=deepseek-coder:6.7b\n Pass model as E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/starcoder',..)` Learn more: https://docs.litellm.ai/docs/providers", 'type': None, 'param': None, 'code': '400'}}