In [1]:
# Mistral 7B Chatbot Notebook (Jupytext / .py with cells)
# %%
"""
Notebook: Create a chatbot with Mistral 7B

This file is written as a Python script with Jupyter cell markers ("# %%") so you can
- open it in Jupyter/VSCode
- or convert to a real .ipynb using jupytext

It provides three main options:
1) Run locally with `transformers` (best if you have a GPU and enough VRAM or use 4-bit quantization)
2) Use Hugging Face Inference API (fast, requires an API token)
3) Fallback simple HTTP API example (for other hosted providers)

Read the first cells before running.
"""
# %%
# INSTALL REQUIRED PACKAGES
# Run this cell once in your environment. Uncomment and run in Jupyter.
# Note: installing bitsandbytes requires a CUDA-enabled GPU and matching CUDA/toolchain.

# !pip install --upgrade pip
# !pip install transformers accelerate einops sentencepiece huggingface_hub
# !pip install bitsandbytes  # optional: for 8/4-bit quantization (CUDA required)
# !pip install torch  # ensure torch is installed (GPU build recommended)

# %%
# CONFIGURATION: choose mode and model
MODE = "transformers_local"  # options: "transformers_local", "hf_inference", "http_api"
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"  # recommended instruct variant
DEVICE = "auto"  # or 0 for first GPU

# If using Hugging Face Inference API, set HF_TOKEN environment variable or put token here (not recommended)
HF_INFERENCE_TOKEN = None  # or "hf_..."

# %%
# SIMPLE CHAT INTERFACE (shared by modes)
from typing import List, Dict

class ChatHistory:
    def __init__(self, system_prompt: str = "You are a helpful assistant."):
        self.system = system_prompt
        self.messages = []  # list of (role, content)

    def add_user(self, text: str):
        self.messages.append({"role": "user", "content": text})

    def add_assistant(self, text: str):
        self.messages.append({"role": "assistant", "content": text})

    def as_messages(self):
        # returns a list formatted for instruction-style models
        msgs = [{"role": "system", "content": self.system}] + self.messages
        return msgs

    def short_history(self, max_turns: int = 6):
        # keep last N turns (pairs)
        if len(self.messages) <= max_turns*2:
            return self.as_messages()
        # keep system + last max_turns*2 items
        return [{"role": "system", "content": self.system}] + self.messages[-max_turns*2:]

# %%
# OPTION 1: Run locally with Hugging Face `transformers` (recommended if you have GPU & CUDA)
if MODE == "transformers_local":
    try:
        import torch
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
        from transformers import BitsAndBytesConfig
        import os

        print("Using transformers local mode")

        # If you have limited VRAM, you can enable 4-bit quantization (needs bitsandbytes)
        use_4bit = True

        if use_4bit:
            # Adjust these params based on your setup
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
            print("Loading model with 4-bit quantization (requires bitsandbytes)")

            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map=DEVICE,
                quantization_config=bnb_config,
                trust_remote_code=True,
            )

        else:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map=DEVICE,
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )

        # Create a simple generate helper using model.generate
        def generate_from_messages(messages: List[Dict], max_new_tokens: int = 256, temperature: float = 0.2):
            # For instruct-style models we concatenate system + messages into a single prompt
            prompt = ""
            # Simple formatting: include roles
            for m in messages:
                role = m.get("role", "user")
                content = m.get("content", "")
                if role == "system":
                    prompt += f"<|system|>" + content + "\n"
                elif role == "user":
                    prompt += f"<|user|>" + content + "\n"
                else:
                    prompt += f"<|assistant|>" + content + "\n"

            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
            outputs = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                pad_token_id=tokenizer.eos_token_id,
            )
            decoded = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
            return decoded

        # Example interactive loop (run in a cell)
        ch = ChatHistory(system_prompt="You are a helpful assistant.")
        print("Chatbot ready (local). Use ch.add_user('...') and then call generate_from_messages(ch.as_messages())")

    except Exception as e:
        print("Error setting up local transformers mode:", e)
        print("Consider using HF Inference API or http_api mode instead.")

# %%
# OPTION 2: Hugging Face Inference API (no local GPU required)
if MODE == "hf_inference":
    import os, requests, json
    HF_TOKEN = HF_INFERENCE_TOKEN or os.environ.get("HF_TOKEN")
    if not HF_TOKEN:
        raise RuntimeError("Set HF_INFERENCE_TOKEN variable or HF_TOKEN env var to use Hugging Face Inference API")

    HF_MODEL = MODEL_ID
    HF_API_URL = f"https://api-inference.huggingface.co/models/{HF_MODEL}"

    headers = {"Authorization": f"Bearer {HF_TOKEN}"}

    def hf_generate(messages: List[Dict], max_tokens: int = 256, temperature: float = 0.2):
        # For the inference API, we pass the messages in a single string prompt.
        prompt = "".join([f"{m['role']}: {m['content']}\n" for m in messages])
        payload = {
            "inputs": prompt,
            "parameters": {"max_new_tokens": max_tokens, "temperature": temperature},
        }
        resp = requests.post(HF_API_URL, headers=headers, json=payload, timeout=120)
        resp.raise_for_status()
        data = resp.json()
        # API returns generated text in different shapes depending on model; try to extract text
        if isinstance(data, list) and "generated_text" in data[0]:
            return data[0]["generated_text"]
        elif isinstance(data, dict) and "generated_text" in data:
            return data["generated_text"]
        else:
            # fallback: stringify
            return str(data)

    ch = ChatHistory()
    print("Hugging Face Inference API helper ready. Use hf_generate(ch.as_messages())")

# %%
# OPTION 3: Generic HTTP API (example template for other hosted providers e.g. Clarifai, Replicate)
if MODE == "http_api":
    import requests, os
    API_URL = "https://api.example.com/generate"  # replace with provider
    API_KEY = os.environ.get("MY_API_KEY")

    def http_generate(messages: List[Dict], max_tokens: int = 256):
        prompt = "\n".join([f"{m['role']}: {m['content']}" for m in messages])
        resp = requests.post(API_URL, json={"prompt": prompt, "max_tokens": max_tokens}, headers={"Authorization": f"Bearer {API_KEY}"})
        return resp.json()

    print("HTTP API template ready. Replace API_URL and API_KEY with your provider settings.")

# %%
# USAGE EXAMPLE (universal)
# After you run the appropriate mode cell, run the following to chat interactively in Jupyter:

if __name__ == "__main__":
    print("Example interactive session. This block runs only if you execute this script directly.")
    # choose which generate function to call based on MODE
    if MODE == "transformers_local":
        gen = generate_from_messages
    elif MODE == "hf_inference":
        gen = hf_generate
    else:
        gen = lambda msgs, **k: "Replace with your provider generate function"

    ch = ChatHistory(system_prompt="You are a helpful assistant.")
    while True:
        u = input("You: ")
        if u.strip().lower() in ["quit", "exit"]:
            break
        ch.add_user(u)
        reply = gen(ch.short_history())
        print("Assistant:", reply)
        ch.add_assistant(reply)


Error setting up local transformers mode: No module named 'torch'
Consider using HF Inference API or http_api mode instead.
Example interactive session. This block runs only if you execute this script directly.


NameError: name 'generate_from_messages' is not defined