<a href="https://colab.research.google.com/github/NotBizzaark/mimic/blob/main/Testing_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers torch sentence-transformers faiss-cpu accelerate bitsandbytes

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')
from huggingface_hub import login



class RAGSystem:
    """Retrieval-augmented chat over a guidelines text file.

    The guidelines file is split into chunks, each chunk is embedded with a
    SentenceTransformer model and stored in a FAISS inner-product index.
    At query time the most similar chunks are placed in the system prompt of
    a Llama 3.2-1B-Instruct model, which generates the answer.
    """

    def __init__(self, guidelines_file_path: str):
        """
        Initialize the RAG system with Llama 3.2-1B and guidelines from a text file.

        Args:
            guidelines_file_path (str): Path to the text file containing guidelines/rules
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Load the guidelines from file
        self.guidelines = self._load_guidelines(guidelines_file_path)
        print(f"Loaded {len(self.guidelines)} guideline chunks")

        # Initialize embedding model for RAG
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Create embeddings for guidelines
        print("Creating embeddings for guidelines...")
        self.guideline_embeddings = self._create_embeddings(self.guidelines)

        # Initialize FAISS index for similarity search
        self.index = self._create_faiss_index(self.guideline_embeddings)

        # Load Llama 3.2-1B (quantized when a GPU is available)
        print("Loading Llama 3.2-1B model...")
        self._load_llama_model()

    def _load_guidelines(self, file_path: str) -> List[str]:
        """Load and chunk guidelines from a text file.

        If the file does not exist, a small sample guidelines file is written
        to ``file_path`` and its content is used instead.

        Args:
            file_path: Path of the UTF-8 guidelines file.

        Returns:
            The list of non-empty text chunks (may be empty for an empty file).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            print(f"Error: Guidelines file '{file_path}' not found.")
            print("Creating a sample guidelines file for demonstration...")

            # Built line by line so the source indentation of this method
            # does not leak into the file and into the retrieved context.
            content = (
                "# Guidelines and Rules\n"
                "When user Say 'Hello' respond with 'Bye Bye'\n"
            )
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content)

        # Split content into chunks (you can adjust chunk size as needed)
        return self._split_text_into_chunks(content, chunk_size=500, overlap=50)

    def _split_text_into_chunks(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into chunks of roughly ``chunk_size`` characters.

        Paragraphs (separated by a blank line) are packed greedily into a
        chunk while the running length stays below ``chunk_size``.  A single
        paragraph longer than ``chunk_size`` is cut into windows of
        ``chunk_size`` characters where consecutive windows share ``overlap``
        characters.  Chunks that are empty after stripping are dropped.

        Args:
            text: The full text to split.
            chunk_size: Target maximum chunk length in characters (> 0).
            overlap: Characters shared by consecutive windows of an
                oversized paragraph.

        Returns:
            The list of stripped, non-empty chunks in document order.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        # Always advance by at least one character, even if overlap >= chunk_size.
        stride = max(1, chunk_size - max(0, overlap))
        chunks: List[str] = []
        current_chunk = ""

        def flush(pending: str) -> None:
            cleaned = pending.strip()
            if cleaned:
                chunks.append(cleaned)

        for paragraph in text.split('\n\n'):
            if len(paragraph) > chunk_size:
                # Oversized paragraph: emit what we have, then window it.
                flush(current_chunk)
                current_chunk = ""
                for start in range(0, len(paragraph), stride):
                    flush(paragraph[start:start + chunk_size])
                    if start + chunk_size >= len(paragraph):
                        break
                continue

            if len(current_chunk) + len(paragraph) < chunk_size:
                current_chunk += paragraph + "\n\n"
            else:
                flush(current_chunk)
                current_chunk = paragraph + "\n\n"

        flush(current_chunk)
        return chunks

    def _create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Create embeddings for the guideline chunks.

        Args:
            texts: Chunks to embed.

        Returns:
            A 2-D float32 array with one row per text; shape ``(0, dim)`` when
            ``texts`` is empty so that an (empty) index can still be built.
        """
        if not texts:
            dimension = self.embedding_model.get_sentence_embedding_dimension()
            return np.empty((0, dimension), dtype=np.float32)
        return self.embedding_model.encode(texts)

    def _create_faiss_index(self, embeddings: np.ndarray):
        """Create a FAISS inner-product index over L2-normalized embeddings.

        With unit-length vectors the inner product equals cosine similarity.

        Args:
            embeddings: 2-D array of shape ``(n, dim)``; ``n`` may be 0.

        Returns:
            A ``faiss.IndexFlatIP`` holding the normalized vectors.

        Raises:
            ValueError: If ``embeddings`` is not two-dimensional.
        """
        # FAISS requires C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        if embeddings.ndim != 2:
            raise ValueError("embeddings must be a 2-D array of shape (n, dim)")

        index = faiss.IndexFlatIP(embeddings.shape[1])
        if len(embeddings):
            # Normalize embeddings (in place) for cosine similarity
            faiss.normalize_L2(embeddings)
            index.add(embeddings)

        return index

    def _load_llama_model(self):
        """Load the Llama 3.2-1B-Instruct tokenizer and model.

        On CUDA the model is loaded with 4-bit NF4 quantization for memory
        efficiency.  bitsandbytes quantization needs a GPU, so on CPU the
        model is loaded unquantized in float32 instead.
        """
        model_name = "meta-llama/Llama-3.2-1B-Instruct"

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        load_kwargs = {"device_map": "auto"}
        if self.device.type == "cuda":
            # Configure quantization for memory efficiency
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
            load_kwargs["torch_dtype"] = torch.float16
        else:
            load_kwargs["torch_dtype"] = torch.float32

        self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    def retrieve_relevant_guidelines(self, query: str, top_k: int = 3, min_score: float = 0.1) -> List[str]:
        """Retrieve the guideline chunks most relevant to a query.

        Args:
            query: The user query to match against the guidelines.
            top_k: Maximum number of chunks to return.
            min_score: Cosine-similarity threshold; only chunks scoring
                strictly above it are returned.

        Returns:
            Matching chunks ordered from most to least similar (possibly empty).
        """
        # Never ask FAISS for more neighbours than the index holds.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return []

        # Create a normalized embedding for the query
        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        # Search for similar guidelines
        scores, indices = self.index.search(query_embedding, k)

        # FAISS pads missing results with label -1; a negative label must not
        # be used as a Python list index (it would silently pick the last chunk).
        return [
            self.guidelines[idx]
            for score, idx in zip(scores[0], indices[0])
            if idx >= 0 and score > min_score
        ]

    def _build_prompt(self, user_query: str, context: str) -> str:
        """Build the Llama 3 chat prompt from the query and retrieved context."""
        return (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            "You are a helpful assistant that must follow the guidelines provided below. "
            "Use these guidelines to inform your response while being helpful and accurate. "
            "Always Say 'Jeepers:' First Then Follow the guidelines.\n\n"
            "Guidelines to follow:\n"
            f"{context}\n\n"
            "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{user_query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )

    def generate_response(self, user_query: str, max_length: int = 512) -> str:
        """Generate a response using the RAG approach.

        Args:
            user_query: The user's message.
            max_length: Maximum number of *new* tokens to generate.

        Returns:
            Only the assistant's generated text, without the prompt.
        """
        # Retrieve relevant guidelines
        relevant_guidelines = self.retrieve_relevant_guidelines(user_query, top_k=3)

        # Construct prompt with retrieved context
        context = "\n\n".join(relevant_guidelines) if relevant_guidelines else ""
        prompt = self._build_prompt(user_query, context)

        # The prompt already starts with <|begin_of_text|>, so the tokenizer
        # must not prepend a second BOS token.  Inputs go to the device the
        # model was actually placed on by device_map="auto".
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
            add_special_tokens=False
        ).to(self.model.device)

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # generate() returns prompt + continuation for decoder-only models.
        # Slice off the prompt tokens: searching the decoded text for the
        # assistant header cannot work because skip_special_tokens removes it.
        prompt_length = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def chat(self):
        """Run an interactive chat loop until the user quits.

        Typing 'quit', 'exit' or 'bye', or sending Ctrl-C / EOF at the
        prompt, ends the session.
        """
        print("RAG Chat System Ready! Type 'quit' to exit.")
        print("=" * 50)

        while True:
            try:
                user_input = input("\nYou: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Leave cleanly instead of dumping a traceback.
                print("\nGoodbye!")
                break

            if user_input.lower() in ['quit', 'exit', 'bye']:
                print("Goodbye!")
                break

            if not user_input:
                continue

            print("Assistant: ", end="", flush=True)
            response = self.generate_response(user_input)
            print(response)
# Example usage
def main():
    """Demo driver: build the RAG system, run sample queries, then chat.

    For every sample query the retrieved guideline chunks (first 100
    characters each) and the generated response are printed; afterwards the
    interactive chat loop is started.
    """
    # Make sure to upload your guidelines.txt file to Colab or modify the path
    print("Initializing RAG System...")
    rag_system = RAGSystem("guidelines.txt")

    wide_rule = "=" * 60
    narrow_rule = "-" * 40

    print("\n" + wide_rule)
    print("TESTING RAG SYSTEM WITH EXAMPLE QUERIES")
    print(wide_rule)

    sample_queries = (
        "What's the weather like?",
        "Can you help me with math?",
        "Tell me a joke",
        "How do I cook pasta?",
        "What's 2+2?",
    )

    for query in sample_queries:
        print(f"\nQuery: {query}")
        print(narrow_rule)

        # Show what retrieval found before generating the answer
        print("Retrieved Guidelines:")
        hits = rag_system.retrieve_relevant_guidelines(query)
        position = 0
        for hit in hits:
            position += 1
            print(f"{position}. {hit[:100]}...")

        answer = rag_system.generate_response(query)
        print(f"\nResponse: {answer}")
        print(wide_rule)

    # Hand over to the interactive session
    print("\nStarting interactive chat...")
    rag_system.chat()

# Entry point: run the demo when this file is executed as a script (or as a
# notebook cell, where __name__ is also "__main__").
# NOTE: main() runs before the utility functions further down are defined;
# that is fine because main() does not call them.
if __name__ == "__main__":
    main()

# Additional utility functions
def update_guidelines(rag_system: RAGSystem, new_guidelines_file: str):
    """Replace the guidelines of a RAG system with those from another file.

    The new chunks, embeddings and index are built completely before any of
    them is assigned, so a failure part-way through leaves ``rag_system``
    with its previous, mutually consistent guidelines and index.

    Args:
        rag_system: The system whose guidelines, embeddings and index are replaced.
        new_guidelines_file: Path to the new guidelines text file.
    """
    print("Updating guidelines...")
    guidelines = rag_system._load_guidelines(new_guidelines_file)
    embeddings = rag_system._create_embeddings(guidelines)
    index = rag_system._create_faiss_index(embeddings)

    # Swap everything in together so texts and index never disagree.
    rag_system.guidelines = guidelines
    rag_system.guideline_embeddings = embeddings
    rag_system.index = index
    print("Guidelines updated successfully!")

def search_guidelines(rag_system: RAGSystem, search_term: str, top_k: int = 5):
    """Print and return the guideline chunks that match a search term.

    Args:
        rag_system: The system whose guidelines are searched.
        search_term: Text to look up.
        top_k: Maximum number of chunks requested from retrieval.

    Returns:
        The list of matching guideline chunks, as returned by
        ``rag_system.retrieve_relevant_guidelines``.
    """
    matches = rag_system.retrieve_relevant_guidelines(search_term, top_k)
    match_count = len(matches)
    print(f"Found {match_count} relevant guidelines for '{search_term}':")

    rank = 1
    for text in matches:
        print(f"\n{rank}. {text}")
        rank += 1

    return matches

Initializing RAG System...
Using device: cuda
Error: Guidelines file 'guidelines.txt' not found.
Creating a sample guidelines file for demonstration...
Loaded 1 guideline chunks
Loading embedding model...
Creating embeddings for guidelines...
Loading Llama 3.2-1B model...

TESTING RAG SYSTEM WITH EXAMPLE QUERIES

Query: What's the weather like?
----------------------------------------
Retrieved Guidelines:
1. # Guidelines and Rules
            When user Say 'Hello' respond with 'Bye Bye'...

Response: system

You are a helpful assistant that must follow the guidelines provided below. Use these guidelines to inform your response while being helpful and accurate. Always Say 'Jeepers:' First Then Follow the guidelines. 

Guidelines to follow:
# Guidelines and Rules
            When user Say 'Hello' respond with 'Bye Bye'

user

What's the weather like?assistant

Jeepers!

Query: Can you help me with math?
----------------------------------------
Retrieved Guidelines:

Response: system

Yo

KeyboardInterrupt: Interrupted by user