In [2]:
from dotenv import load_dotenv
load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

In [3]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.2-1B"

# Force the model to run on the CPU to avoid MPS memory issues
device = torch.device("cpu")
print("Using CPU instead of MPS to avoid memory issues")

# Load the tokenizer and the model separately
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model with 8-bit quantization using the bitsandbytes library
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32  # Use float32 for CPU
)

Using CPU instead of MPS to avoid memory issues


In [4]:
messages = [
    {
        "role":"system",
        "content": "you are a pirate chatbot who always responds in pirate speak"
    },
    {
        "role":"user",
        "content":"who are you?"
    }
]

In [5]:
custom_template = """
{chat_history}
User: {user_message}
Assistant:
"""

# Set the custom chat template for the tokenizer
tokenizer.chat_template = custom_template

# Define the messages for the conversation
messages = [
    {"role": "user", "content": "What is the weather today?"},
    {"role": "assistant", "content": "It's sunny and warm."}
]

# Apply the chat template to format the messages for the model
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Print the prompt for debugging purposes
print("Generated Prompt:")
print(prompt)

Generated Prompt:

{chat_history}
User: {user_message}
Assistant:


In [6]:
prompt

'\n{chat_history}\nUser: {user_message}\nAssistant:'

In [8]:
# Use eos_token_id for the end of the generation
terminator = tokenizer.eos_token_id

# Create the generation pipeline manually without passing load_in_8bit
generator = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device  # Make sure this runs on CPU
)

In [9]:
outputs = generator(
    prompt,
    max_new_tokens = 128,
    eos_token_id = terminator,
    do_sample = True,
    temperature = 0.6,
    top_p = 0.9,
)
output_text = outputs[0]["generated_text"]
print("Generated Output:")
print(output_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Generated Output:

{chat_history}
User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant_message}
Assistant: {assistant_message}

User: {user_message}
Assistant: {assistant


In [10]:
import gradio as gr

In [14]:
def chat_function(message,history,system_prompt, max_new_tokens, temperature):
    messages = [{"role":"system","content":system_prompt},
                {"role":"user","content":message}]
    prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True)

    terminator = tokenizer.eos_token_id

    outputs = generator(
    prompt,
    max_new_tokens = 128,
    eos_token_id = terminator,
    do_sample = True,
    temperature = temperature + 0.1,
    top_p = 0.9,)
    return outputs[0]["generated_text"][len(prompt):]

In [15]:
gr.ChatInterface(
    chat_function,
     textbox=gr.Textbox(placeholder="Enter message here",container=False, scale = 7),
                 chatbot=gr.Chatbot(height=400),
                             additional_inputs=[
                                 gr.Textbox("you are helpful AI", label="System Prompt"),
                                 gr.Slider (500,4000, label="Max New Tokens"),
                                 gr.Slider(0,1, label="Temperature")
                             ]            
                             ).launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
