In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Workflow

Every time the runtime is stopped:

1) Install the dependencies (the `pip uninstall` / `pip install` cells below).

2) Restart the session.

3) Run every cell from `%%writefile model_loader.py` through the final `# Now run the app` cell (`!python app.py`).

In [1]:
# Remove any existing copies of these packages before the pinned reinstall in the next cell.
# NOTE(review): stdout and stderr both go to /dev/null, so a failed uninstall is silent.
!pip uninstall -y numpy torch torchao torchvision torchaudio transformers sentence-transformers peft accelerate > /dev/null 2>&1

In [2]:
# Reinstall the stack with explicit versions where given. Every command sends
# stdout and stderr to /dev/null, so a failed install produces no visible error.
!pip install numpy==1.24.4 > /dev/null 2>&1
# PyTorch wheels are pulled from the cu121 index given by --index-url.
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 > /dev/null 2>&1
# NOTE(review): transformers is installed with --upgrade and no version pin —
# confirm the newest release stays compatible with the pinned torch/peft/accelerate.
!pip install --upgrade transformers > /dev/null 2>&1
!pip install bitsandbytes==0.44.1 > /dev/null 2>&1
!pip install sentence-transformers==2.7.0 > /dev/null 2>&1
!pip install accelerate==0.33.0 > /dev/null 2>&1
# -U is a command-wide pip option: peft stays pinned, while diskcache, faiss-cpu
# and ddgs are upgraded to their latest releases.
!pip install peft==0.15.2 diskcache faiss-cpu -U ddgs > /dev/null 2>&1
!pip install rank_bm25 > /dev/null 2>&1
!pip install nltk > /dev/null 2>&1

In [None]:
import os
# os._exit ends the current process immediately, without the normal interpreter
# shutdown steps. It is used here so the next cells import the freshly installed
# packages — presumably the notebook host starts a new kernel automatically
# afterwards; confirm on your platform.
os._exit(00)  # Restart the kernel

In [None]:
from importlib import metadata

# Re-defining the list since the code environment was reset
packages_to_check = [
    "numpy",
    "torch",
    "torchvision",
    "torchaudio",
    "transformers",
    "sentence-transformers",
    "bitsandbytes",
    "accelerate",
    "peft",
    "diskcache",
    "faiss-cpu",
    "ddgs",
    "rank_bm25"
]

# importlib.metadata normalises distribution names, so "rank_bm25" and
# "rank-bm25" resolve to the same package. The previous pkg_resources lookup
# used a dash-normalised key and silently skipped underscore names.
installed_versions = {}
for pkg in packages_to_check:
    try:
        installed_versions[pkg] = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        continue

# The install cell discards pip's output, so surface anything that did not
# actually get installed instead of omitting it without a trace.
missing_packages = [pkg for pkg in packages_to_check if pkg not in installed_versions]
if missing_packages:
    print(f"Not installed: {', '.join(missing_packages)}")

installed_versions

In [2]:
%%writefile model_loader.py

import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login, snapshot_download
from sentence_transformers import SentenceTransformer


def _resolve_hf_token():
    """Return a Hugging Face token from HF_TOKEN or Kaggle Secrets, else None."""
    token = os.environ.get("HF_TOKEN")
    if token:
        return token
    try:
        # Reads the secret labelled "HF_TOKEN" attached to the notebook.
        return UserSecretsClient().get_secret("HF_TOKEN")
    except Exception as e:
        print(f"No Hugging Face token available ({e}); continuing anonymously.")
        return None


def load_model_and_tokenizer(model_name="soupstick/smollm3-qlora-ft"):
    """Load the 4-bit quantized causal LM and its tokenizer.

    Args:
        model_name: Hugging Face repo id of the model to load.

    Returns:
        Tuple ``(model, tokenizer, device)`` where ``device`` is ``"cuda"``
        when a GPU is available and ``"cpu"`` otherwise.

    Notes:
        The access token is never hard-coded: it is read from the ``HF_TOKEN``
        environment variable or, failing that, from Kaggle Secrets. Without a
        token the download proceeds anonymously (public repos only).
    """
    import shutil

    hf_token = _resolve_hf_token()
    if hf_token:
        # login(token=None) would fall back to an interactive prompt.
        login(token=hf_token)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Clear tokenizer cache
    shutil.rmtree(
        os.path.expanduser("~/.cache/huggingface/tokenizers"),
        ignore_errors=True
    )

    # Download fresh model files
    model_path = snapshot_download(
        repo_id=model_name,
        token=hf_token,
        ignore_patterns=["*.bin", "*.safetensors"]  # Skip weights, we'll load separately
    )

    # Load tokenizer first with error handling
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            token=hf_token,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"Tokenizer loading failed: {e}")
        # Fallback to GPT2 tokenizer as temporary measure
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Model config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        token=hf_token,
        trust_remote_code=True
    )

    # Only grow the embedding matrix, and compare against len(tokenizer):
    # tokenizer.vocab_size ignores added special tokens, and shrinking the
    # matrix would discard trained rows.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))

    # Optimization
    torch.set_float32_matmul_precision("high")
    if device == "cuda":
        torch.backends.cuda.enable_flash_sdp(True)
        model = torch.compile(model)

    return model, tokenizer, device

def load_embedder(model_name="all-MiniLM-L6-v2"):
    """Return a SentenceTransformer embedder.

    Args:
        model_name: Sentence-Transformers model id; defaults to the model the
            original code hard-coded.
    """
    return SentenceTransformer(model_name)

Writing model_loader.py


In [3]:
%%writefile search_engine.py

from tenacity import retry, stop_after_attempt, wait_exponential
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from ddgs import DDGS
import hashlib
import pickle
from diskcache import Cache
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from vector_db import VectorDB
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases look for the
# "punkt_tab" resource instead of "punkt", so fetch both; quiet=True keeps the
# import-time log short.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

class HybridSearchEngine:
    """Web search plus hybrid (BM25 + dense vector) re-ranking of the results.

    Typical flow: ``web_search`` -> ``index_results`` -> ``hybrid_search``.
    """

    # Cached web results expire after a day so "recent" results do not go stale.
    CACHE_TTL_SECONDS = 24 * 60 * 60

    def __init__(self, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embedder = SentenceTransformer(embedding_model_name)
        self.vector_db = VectorDB(self.embedder)
        self.bm25 = None
        self.documents = []    # snippets, in index order
        self.raw_results = []  # full result dicts, aligned 1:1 with self.documents
        self.cache = Cache('/kaggle/working/search_cache')

    def _hash_query(self, query: str) -> str:
        """Return a stable hex digest used as the cache key."""
        return hashlib.sha256(pickle.dumps(query)).hexdigest()

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case and tokenize; fall back to whitespace split if NLTK data is missing."""
        lowered = text.lower()
        try:
            return word_tokenize(lowered)
        except LookupError:
            return lowered.split()

    @staticmethod
    def _parse_date(raw):
        """Parse the leading YYYY-MM-DD of a date string; return None if absent or invalid."""
        if not raw:
            return None
        try:
            return datetime.strptime(str(raw)[:10], "%Y-%m-%d").date()
        except ValueError:
            return None

    @staticmethod
    def _min_max(scores):
        """Scale scores to [0, 1]; the epsilon avoids dividing by zero on constant input."""
        low, high = scores.min(), scores.max()
        return (scores - low) / (high - low + 1e-9)

    # The retry lives on the raw fetch so that exceptions actually reach tenacity.
    # Previously the decorated method caught everything itself and never retried.
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True)
    def _fetch_results(self, query: str, max_results: int) -> List[Dict]:
        with DDGS() as ddg:
            return list(ddg.text(query, max_results=max_results) or [])

    def web_search(self, query: str, max_results: int = 5, cutoff_days: int = 30) -> List[Dict]:
        """Search the web and return result dicts no older than ``cutoff_days``.

        Args:
            query: Search string.
            max_results: Maximum number of results requested from the backend.
            cutoff_days: Results with a parseable date older than this are dropped;
                undated results are kept.

        Returns:
            List of dicts with keys ``title``, ``url``, ``snippet``, ``date``.
            Returns ``[]`` when the search fails after retries.
        """
        # Every argument that changes the answer is part of the cache key.
        cache_key = self._hash_query(f"{query}|max={max_results}|days={cutoff_days}")
        cached = self.cache.get(cache_key)
        if cached is not None:
            return cached

        try:
            results = self._fetch_results(query, max_results)
        except Exception as e:
            print(f"🔍 Search error: {e}")
            return []

        cutoff_date = datetime.now().date() - timedelta(days=cutoff_days)
        filtered = []
        for r in results:
            result_date = self._parse_date(r.get("date"))
            if not result_date or result_date >= cutoff_date:
                filtered.append({
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "snippet": r.get("body", ""),
                    "date": result_date
                })

        # Do not cache empty results: a transient miss should not stick.
        if filtered:
            self.cache.set(cache_key, filtered, expire=self.CACHE_TTL_SECONDS)
        return filtered

    def index_results(self, results: List[Dict]):
        """Index both the snippets and full results.

        Raises:
            ValueError: if ``results`` is empty or no result has a snippet.
        """
        if not results:
            raise ValueError("📭 No documents provided for indexing.")

        # Keep only results that have a snippet, so position i in the index
        # always refers to raw_results[i].
        indexed = [r for r in results if r.get("snippet")]
        if not indexed:
            raise ValueError("📭 No valid snippets found in results.")

        snippets = [r["snippet"] for r in indexed]
        self.raw_results = indexed
        self.documents = snippets
        self.bm25 = BM25Okapi([self._tokenize(doc) for doc in snippets])
        self.vector_db.build_index(snippets)

    def hybrid_search(self, query: str, top_k: int = 5, alpha: float = 0.5) -> Optional[List[Dict]]:
        """Perform hybrid search on already indexed documents.

        Args:
            query: Search string.
            top_k: Number of results to return.
            alpha: Weight of the BM25 score; ``1 - alpha`` weights the vector score.

        Returns:
            The top result dicts, best first, or ``None`` if nothing is indexed.
        """
        if self.bm25 is None or not self.documents:
            return None

        n_docs = len(self.documents)

        # BM25 scoring (already in document order)
        bm25_scores = np.asarray(self.bm25.get_scores(self._tokenize(query)), dtype=float)

        # Vector search scoring
        vector_scores, indices = self.vector_db.search(query, top_k=n_docs)
        if vector_scores is None:
            return None

        # The vector search returns hits in rank order, so put each score back at
        # its document position before mixing it with the BM25 scores.
        vector_scores = np.asarray(vector_scores, dtype=float)
        indices = np.asarray(indices)
        valid = (indices >= 0) & (indices < n_docs)
        aligned = np.zeros(n_docs)
        if valid.any():
            aligned[:] = vector_scores[valid].min()  # documents that were not returned
            aligned[indices[valid]] = vector_scores[valid]

        # Normalize and combine scores
        combined_scores = alpha * self._min_max(bm25_scores) + (1 - alpha) * self._min_max(aligned)
        top_indices = np.argsort(combined_scores)[::-1][:top_k]

        # Return full results (not just snippets)
        return [self.raw_results[i] for i in top_indices if i < len(self.raw_results)]

Writing search_engine.py


In [4]:
%%writefile vector_db.py

import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Optional
import os
import hashlib
import pickle
from diskcache import Cache

class VectorDB:
    """In-memory inner-product index over document embeddings."""

    def __init__(self, embedder):
        self.embedder = embedder
        self.index = None
        self.embeddings = None
        self.documents = []

    def build_index(self, documents):
        """Embed ``documents`` and (re)build the index.

        Raises:
            ValueError: if ``documents`` is empty.
        """
        documents = list(documents)
        if not documents:
            raise ValueError("No documents provided for indexing.")

        embeddings = self.embedder.encode(documents, convert_to_numpy=True, normalize_embeddings=True)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Assign state only after everything succeeded, so a failed build does
        # not leave a half-updated object.
        self.documents = documents
        self.embeddings = embeddings
        self.index = index

    def search(self, query, top_k=5):
        """Return ``(scores, indices)`` of the best matches for ``query``.

        Returns ``(None, None)`` when no index has been built, which is the
        signal ``HybridSearchEngine.hybrid_search`` checks for. ``top_k`` is
        capped at the number of indexed vectors so the result never contains
        ``-1`` padding entries.
        """
        if self.index is None:
            return None, None

        top_k = min(int(top_k), self.index.ntotal)
        if top_k <= 0:
            return None, None

        query_vec = self.embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        scores, indices = self.index.search(query_vec, top_k)
        return scores[0], indices[0]

Writing vector_db.py


In [5]:
%%writefile chat_engine.py

from datetime import datetime
from typing import List
import json
import torch
from transformers import TextIteratorStreamer
from threading import Thread
import re

class ChatEngine:
    """Prompt building, streamed generation, response clean-up and logging."""

    def __init__(self, model, tokenizer, device, hybrid_searcher=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.history = []
        self.hybrid_searcher = hybrid_searcher
        self.log_file = "/kaggle/working/chat_logs.jsonl"

    def generate_response_stream(self, prompt: str):
        """Start generation in a background thread and return the token streamer.

        The prompt is truncated to 768 tokens and at most 400 new tokens are
        sampled. Iterate over the returned streamer to receive text chunks.
        """
        streamer = TextIteratorStreamer(
            self.tokenizer, 
            skip_prompt=True, 
            skip_special_tokens=True
        )
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=768
        ).to(self.device)
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=400,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            num_beams=1,
            use_cache=True
        )
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        return streamer

    @staticmethod
    def _clean_source_title(title: str) -> str:
        """Strip list numbering and Markdown link brackets from a source title."""
        title = re.sub(r'^(?:\[\d+\]|\d+\.)\s*', '', title.strip())
        title = re.sub(r'\s*Read more\b', '', title, flags=re.IGNORECASE).strip()
        # "[Title](" is what remains in front of the URL of a Markdown link.
        title = title.rstrip("(").strip()
        if title.startswith("[") and title.endswith("]"):
            title = title[1:-1].strip()
        return title.rstrip(" :-–—").strip()

    def generate_clean_response(self, query: str, context: List[str], raw_output: str) -> str:
        """Tidy raw model output into a main answer plus a numbered "## Sources" list.

        Args:
            query: Unused; kept for interface compatibility.
            context: Unused; kept for interface compatibility.
            raw_output: Text produced by the model.

        Returns:
            Cleaned Markdown. Lines of the main answer are stripped and blank
            lines removed; sources, if any, are rendered as ``N. [title](url)``.
        """
        cleaned = raw_output.strip()
        # The presence check is case-insensitive but the split is case-sensitive,
        # so an occurrence with different casing leaves the text unchanged.
        for prefix in ["You are an expert research assistant", "Context:", "Question:", "Answer:"]:
            if prefix.lower() in cleaned.lower():
                cleaned = cleaned.split(prefix, 1)[-1].strip()

        # One capture group, so a split yields [before, group-or-None, after].
        parts = re.split(r'(?i)\n\s*#+\s*Sources|\n\s*(?:\[?\d+\]?\s*)?(https?://|Could you explain quantum computers|r/quantum on Reddit|From computing to quantum mechanics)', cleaned, maxsplit=1)
        main_content = parts[0].strip()
        sources_raw = ""
        if len(parts) > 1:
            if parts[1] is None or not parts[1].strip():
                # Split happened on the "Sources" heading: the group is empty.
                sources_raw = parts[2].strip() if len(parts) > 2 else ""
            else:
                # Split happened on a URL/phrase: re-attach the captured text.
                sources_raw = parts[1] + (parts[2] if len(parts) > 2 else "") 
                sources_raw = sources_raw.strip()

        final_output_lines = []
        final_output_lines.extend([line.strip() for line in main_content.splitlines() if line.strip()])

        if sources_raw:
            sources_raw = re.sub(r'\n\s*---\s*\n\s*📚\s*Sources\s*\n*', '\n', sources_raw, flags=re.IGNORECASE).strip()
            sources_raw = re.sub(r'\[Comparison of LangChain[^\]]*\]\s*\\n\\n---\\n\\n📚\s*Sources\s*\\n\d*\.\s*', '', sources_raw, flags=re.IGNORECASE).strip()

            final_output_lines.append("\n## Sources")

            extracted_sources = []
            for line in sources_raw.splitlines():
                line = line.strip()
                if not line:
                    continue

                url_match = re.search(r'(https?://[^\s)]+)', line)
                if url_match:
                    url = url_match.group(1)
                    # Without this clean-up a model line such as
                    # "1. [Title](https://x)" became "1. [1. [Title](](https://x)".
                    title = self._clean_source_title(line.split(url, 1)[0])
                    if not title:
                        title = url
                    extracted_sources.append(f"[{title}]({url})")
                else:
                    line = re.sub(r'^(?:\[\d+\]|\d+\.)\s*', '', line).strip()
                    line = re.sub(r'\s*Read more\b', '', line, flags=re.IGNORECASE).strip()
                    line = re.sub(r'(?i)could you explain quantum computers to a high school student|r/quantum on reddit|from computing to quantum mechanics|based on context', '', line).strip()
                    if line:
                        extracted_sources.append(line)

            for i, source in enumerate(extracted_sources):
                if source.strip():
                    final_output_lines.append(f"{i+1}. {source}")

        return "\n".join(final_output_lines).strip()

    def format_prompt(self, query: str, context: List[str]) -> str:
        """Build the instruction prompt from the question and up to three context snippets."""
        context_str = "\n".join(f"- {c.strip()}" for c in context[:3]) if context else "No context provided."

        return f"""
You are an expert research assistant. Your task is to answer the user's question based on the provided context snippets.

**Instructions:**
1.  **Synthesize, do not copy:** Read all snippets and synthesize a comprehensive answer.
2.  **Use Markdown:** Structure your answer with headings, bold text, and bullet points for clarity.
3.  **Be Direct:** Start with a direct answer to the question.
4.  **Professional Tone:** Maintain a sharp, factual, and professional tone.
5.  **Conclusion:** After your main answer, provide a brief concluding paragraph under a "## Conclusion" heading.
6.  **Sources:** If external sources (like URLs or article titles from the context) are relevant to the answer, list them clearly under a "## Sources" heading using a numbered Markdown list. For each source, provide a concise description and the full URL as a Markdown link. Example: `1. [LangChain Documentation](https://www.langchain.com/docs)` Ensure each source is on a new line.

**Context Snippets:**
{context_str}

**Question:** {query}

**Answer (in Markdown format):**
"""

    def search_and_format_prompt(self, query: str):
        """Search, re-rank, and build the prompt for ``query``.

        Uses the searcher's ``web_search`` / ``index_results`` / ``hybrid_search``
        methods. Falls back to the raw snippets when re-ranking is unavailable,
        and to an empty context when no searcher is configured.

        Returns:
            Tuple ``(prompt, top_snippets)`` where ``top_snippets`` is a list of
            at most three snippet strings.
        """
        if self.hybrid_searcher is None:
            return self.format_prompt(query, []), []

        results = self.hybrid_searcher.web_search(query) or []
        snippets = [r["snippet"] for r in results if r.get("snippet")]
        top_snippets = snippets[:3]

        if snippets:
            try:
                self.hybrid_searcher.index_results(results)
                ranked = self.hybrid_searcher.hybrid_search(query, top_k=3) or []
                ranked_snippets = [r["snippet"] for r in ranked if r.get("snippet")]
                if ranked_snippets:
                    top_snippets = ranked_snippets
            except ValueError as e:
                print(f"⚠️ Search indexing error: {e}")

        prompt = self.format_prompt(query, top_snippets)
        return prompt, top_snippets

    def log_interaction(self, query: str, response: str, context: List[str]):
        """Append one JSON line (timestamp, query, response, truncated context) to the log file."""
        log = {
            "timestamp": datetime.now().isoformat(),
            "query": query,
            "response": response,
            "context": [c[:300] for c in context]
        }
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(log) + "\n")

Writing chat_engine.py


In [6]:
%%writefile app.py

from model_loader import load_model_and_tokenizer
from search_engine import HybridSearchEngine
from chat_engine import ChatEngine
import gradio as gr
import os

def initialize_system():
    """Load the model and build the search and chat engines.

    Returns:
        Tuple ``(chat_engine, searcher)``. The searcher is also handed to the
        chat engine so that ``ChatEngine.search_and_format_prompt`` has one.
    """
    print("⚡ Initializing enhanced system...")
    model, tokenizer, device = load_model_and_tokenizer()
    searcher = HybridSearchEngine()
    return ChatEngine(model, tokenizer, device, hybrid_searcher=searcher), searcher

def create_chat_fn(chat_engine, searcher):
    """Build the streaming chat callback used by the Gradio interface.

    Args:
        chat_engine: Provides ``format_prompt``, ``generate_response_stream``,
            ``generate_clean_response`` and ``log_interaction``.
        searcher: Provides ``web_search``, ``index_results`` and ``hybrid_search``.

    Returns:
        A generator function ``chat_fn(message, history)`` that yields the
        growing response while streaming and finally the cleaned response.
    """
    def chat_fn(message, history):
        # First perform web search
        search_results = searcher.web_search(message) or []
        raw_snippets = [r["snippet"] for r in search_results if r.get("snippet")]

        # Start from the raw snippets so the model still gets context when
        # re-ranking fails or returns nothing.
        context_snippets = raw_snippets

        if raw_snippets:
            try:
                # Index the results for hybrid search
                searcher.index_results(search_results)

                # Get hybrid results
                hybrid_results = searcher.hybrid_search(message) or []
                ranked_snippets = [r["snippet"] for r in hybrid_results if r.get("snippet")]
                if ranked_snippets:
                    context_snippets = ranked_snippets
            except ValueError as e:
                print(f"⚠️ Search indexing error: {e}")

        prompt = chat_engine.format_prompt(message, context_snippets)
        streamed_chunks = chat_engine.generate_response_stream(prompt)

        final_response = ""
        for chunk in streamed_chunks:
            final_response += chunk
            yield final_response

        cleaned = chat_engine.generate_clean_response(message, context_snippets, final_response)
        chat_engine.log_interaction(message, cleaned, context_snippets)
        yield cleaned

    return chat_fn

def launch_interface():
    """Initialise the engines, wrap them in a Gradio chat UI, and start serving."""
    engine, search = initialize_system()

    sample_prompts = [
        "Explain quantum computing like I'm a high school student",
        "What are Google DeepMind's most exciting current projects?",
        "Break down how RAG systems work technically"
    ]
    respond = create_chat_fn(engine, search)

    interface = gr.ChatInterface(
        fn=respond,
        title="🤖 SmolLM3-RAG Pro",
        description="AI assistant with premium-quality responses",
        examples=sample_prompts
    )
    # Listens on all interfaces, port 7860, with a public share link enabled.
    interface.launch(server_name="0.0.0.0", server_port=7860, share=True)


if __name__ == "__main__":
    launch_interface()

Writing app.py


In [7]:
# Now run the app
!python app.py

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
⚡ Initializing enhanced system...
Fetching 7 files:   0%|                                   | 0/7 [00:00<?, ?it/s]
tokenizer_config.json:   0%|                        | 0.00/50.4k [00:00<?, ?B/s][A


adapter_config.json:   0%|                            | 0.00/855 [00:00<?, ?B/s][A[A[A

adapter_config.json: 100%|█████████████████████| 855/855 [00:00<00:00, 1.56MB/s][A[A




README.md: 100%|███████████████████████████| 5.17k/5.17k [00:00<00:00, 5.32MB/s][A[A[A[A
tokenizer_config.json: 100%|███████████████| 50.4k/50.4k [00:00<00:00, 13.0MB/s]
chat_template.jinja: 100%|█████████████████| 5.60k/5.60k [00:00<00:00, 4.53MB/s]

.gitattributes: 100%|██████████████████████| 1.57k/1.57k [00:00<00:00, 14.4MB/s][A
Fetching 7 files:  14%|███▊                       | 1/7 [00:00<00:01,  5.55it/s]
special_tokens_map.json: 100%|█████████████████| 175/175 [00:00<00:00, 1.97MB/s]