In [4]:
# 1. Install system build tools.
#    NOTE(review): presumably only needed if pip ends up compiling llama-cpp-python
#    from source; step 2 points pip at a pre-built wheel index — confirm whether
#    this step is still required.
!apt-get update && apt-get install -y build-essential cmake

# 2. Install llama-cpp-python, letting pip also search the project's pre-built
#    wheel index (the "cu121" index given via --extra-index-url).
!pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121

# 3. Install the rest of the Python dependencies (quietly).
!pip install -q gradio chromadb sentence-transformers huggingface_hub

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [C                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [C                                                                               Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent

In [2]:
import gradio as gr
import json
import uuid
from datetime import datetime
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import chromadb
from sentence_transformers import SentenceTransformer

print("üöÄ Booting Updated NeuroMemory System...")

# 1. LOAD MODEL
# Download the GGUF weights file from the Hugging Face Hub and get its local path.
model_name = "MaziyarPanahi/Llama-3-8B-Instruct-v0.1-GGUF"
model_file = "Llama-3-8B-Instruct-v0.1.Q4_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name, filename=model_file)

# n_ctx=4096 is below the model's training context (llama.cpp reports
# n_ctx_train = 8192 at load time), so long conversations are limited to
# 4096 tokens of prompt + completion.
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=4096,
    verbose=False
)

# 2. MEMORY ENGINE
chroma_client = chromadb.Client()
# Reuse the collection if this cell was already run in the current kernel,
# otherwise create it. get_or_create_collection replaces the previous
# try/bare-except, which treated *every* failure (including KeyboardInterrupt
# and genuine client errors) as "collection does not exist".
collection = chroma_client.get_or_create_collection(name="neuromemory_v2")

# Sentence embedding model used for both storing and querying memories.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

class MemorySystem:
    """Long-term memory on top of the module-level ``collection`` and ``embedder``.

    Memories are embedded with ``embedder.encode`` and stored in / queried from
    ``collection``; both objects are looked up as module globals at call time.
    """

    def __init__(self):
        # Initialised to 0; no method of this class updates it.
        self.turn_count = 0

    def inject_dynamic_memory(self, text):
        """Store ``text`` as a new memory under a freshly generated ID.

        Args:
            text: The fact to remember. ``None``, empty and whitespace-only
                values are rejected.

        Returns:
            A status string: a warning when ``text`` is blank, otherwise a
            confirmation that echoes the saved text.
        """
        if not text or not text.strip():
            return "‚ö†Ô∏è Please enter text to remember."

        # Random 8-hex-digit suffix so repeated injections never reuse an ID.
        unique_id = f"mem_{uuid.uuid4().hex[:8]}"
        vec = embedder.encode(text).tolist()

        collection.add(
            ids=[unique_id],
            documents=[text],
            embeddings=[vec],
            metadatas=[{"timestamp": str(datetime.now()), "type": "manual"}]
        )
        return f"‚úÖ SAVED TO LONG-TERM MEMORY: '{text}'"

    def retrieve(self, query, n_results=2):
        """Return the most relevant stored memories for ``query``.

        Args:
            query: Text to search with. Blank or ``None`` queries return ``None``.
            n_results: Maximum number of memories to return (default 2, the
                previously hard-coded value).

        Returns:
            The matching documents joined with single spaces, or ``None`` when
            the query is blank, the store is empty, or nothing matched.
        """
        if not query or not query.strip():
            return None

        # Never ask for more results than the store holds. Depending on the
        # chromadb version, over-asking either logs a warning or raises, so an
        # empty store is short-circuited and the request is clamped.
        stored = collection.count()
        wanted = min(n_results, stored)
        if wanted < 1:
            return None

        vec = embedder.encode(query).tolist()
        res = collection.query(query_embeddings=[vec], n_results=wanted)

        documents = res['documents']
        if documents and len(documents[0]) > 0:
            # Combine retrieved memories into one string
            return " ".join(documents[0])
        return None

memory_sys = MemorySystem()

# 3. CORE CHAT LOGIC
# Prefix of the footer chat() appends to a reply when a memory was recalled.
_RECALL_PREFIX = "\n\n_üß† Memory Recall: "


def _content_to_text(content):
    """Flatten one history ``content`` value to plain text, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # Typed content part such as {"type": "text", "text": "..."}.
        text = content.get("text")
        return text if isinstance(text, str) else None
    if isinstance(content, list):
        parts = [_content_to_text(part) for part in content]
        parts = [part for part in parts if part]
        return "\n".join(parts) if parts else None
    # None, file tuples, components, ... carry no text for the model.
    return None


def _strip_recall_footer(text):
    """Drop the UI-only memory-recall footer from a previous assistant reply."""
    head, sep, _ = text.rpartition(_RECALL_PREFIX)
    return head if sep else text


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content message dicts.

    Accepts both history shapes: ``(user, assistant)`` pairs and
    ``{"role": ..., "content": ...}`` dicts.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            user_part, assistant_part = entry
            turns = [("user", user_part), ("assistant", assistant_part)]

        for role, content in turns:
            if role not in ("user", "assistant"):
                continue
            text = _content_to_text(content)
            if text is None:
                continue
            if role == "assistant":
                # The footer is decoration for the UI; feeding it back would
                # make the model see (and imitate) text it never produced.
                text = _strip_recall_footer(text)
            messages.append({"role": role, "content": text})
    return messages


# 3. CORE CHAT LOGIC
def chat(message, history):
    """Answer ``message`` using retrieved long-term memory plus the chat history.

    Args:
        message: The current user message.
        history: Previous turns as supplied by ``gr.ChatInterface``; either
            ``(user, assistant)`` pairs or role/content dicts.

    Returns:
        The model's reply. When a memory was retrieved, a footer showing the
        first 50 characters of that memory is appended.
    """
    # A. Search Long-Term Memory (RAG)
    past_context = memory_sys.retrieve(message)

    # B. Build System Prompt
    system_prompt = "You are a helpful, concise AI assistant."
    if past_context:
        system_prompt += f"\n\nRelevant Context from your memory: {past_context}"

    # C. Build Message History (Short-Term Memory)
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(_history_to_messages(history))

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # D. Generate Response
    res = llm.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.7
    )

    response_text = res['choices'][0]['message']['content']

    # E. Display which memory was triggered (Optional for UI)
    if past_context:
        response_text += f"{_RECALL_PREFIX}{past_context[:50]}..._"

    return response_text

# 4. GRADIO UI
# Two-column layout: chat on the left, manual memory injection on the right.
# NOTE(review): the captured cell output shows a warning attributed to this
# `gr.Blocks(theme=...)` line — presumably about the `theme` argument; confirm
# against the installed Gradio version before changing it.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# üß† NeuroMemory V2")
    gr.Markdown("Short-term memory via history + Long-term memory via ChromaDB.")

    with gr.Row():
        with gr.Column(scale=3):
            # Despite the name, this holds the whole ChatInterface (which
            # builds its own Chatbot internally); `chat` is called per message.
            chatbot = gr.ChatInterface(fn=chat)

        with gr.Column(scale=1):
            gr.Markdown("### üõ†Ô∏è Memory Injection")
            mem_input = gr.Textbox(label="New Fact to Remember", placeholder="e.g. My name is Vaibhav")
            inject_btn = gr.Button("Inject into Brain", variant="primary")
            status_box = gr.Textbox(label="System Status", interactive=False)

            # Clicking the button stores the textbox content as a memory and
            # shows the returned status string in the read-only status box.
            inject_btn.click(
                fn=memory_sys.inject_dynamic_memory,
                inputs=mem_input,
                outputs=status_box
            )

# share=True publishes the app on a public *.gradio.live URL (see the cell
# output), so anyone with the link can chat and inject memories.
demo.launch(share=True)

üöÄ Booting Updated NeuroMemory System...


llama_context: n_ctx_per_seq (4096) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9b1062f2781bd7a817.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


