In [1]:
!cd drive/ && ls

MyDrive


In [None]:
from huggingface_hub import notebook_login

notebook_login("")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# !unzip dataset.zip

In [4]:
!pip install datasets llama-cpp-python faiss-cpu rouge_score bitsandbytes
!pip install -U bitsandbytes
!pip install --upgrade transformers
!pip install mpi4py
!pip install -U deepseed

[31mERROR: Could not find a version that satisfies the requirement deepseed (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for deepseed[0m[31m
[0m

In [5]:
import os
import re
from pathlib import Path
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    AutoModel,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, PeftModel
import wandb
import subprocess
import shutil
from typing import List, Optional, Dict, Any
import time
from llama_cpp import Llama
import faiss
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import gc

nltk.download("punkt_tab")
# Set this environment variable to avoid memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear cache at startup
gc.collect()
torch.cuda.empty_cache()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
# Configuration
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # or "Qwen/Qwen2.5-3B"
DATA_DIR = "./data"
OUTPUT_DIR = "./fine_tuned_model"
MAX_LENGTH = 2048
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_EPOCHS = 10
GRADIENT_ACCUMULATION_STEPS = 4

# LoRA configuration
LORA_CONFIG = LoraConfig(
    r=32,  # rank
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [7]:
class DocumentProcessor:
    def __init__(self, documents_dir):
        self.documents_dir = Path(documents_dir)
        self.documents = {}
        self.load_documents()

    def load_documents(self):
        """Load all markdown documents from the specified directory."""
        for file_path in self.documents_dir.glob("*.md"):
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                # Extract title from filename or first heading in the document
                title = file_path.stem
                self.documents[title] = content
        print(f"Loaded {len(self.documents)} documents.")

    def chunk_documents(self, chunk_size=1500, overlap=150):
        """Chunk documents into smaller pieces with overlap."""
        chunked_docs = []

        for title, content in self.documents.items():
            # Remove markdown formatting for cleaner text
            text = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
            text = re.sub(r"#+ ", "", text)
            text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)

            # Split into sentences (rough approximation)
            sentences = re.split(r"(?<=[.!?])\s+", text)

            chunks = []
            current_chunk = []
            current_length = 0

            for sentence in sentences:
                sentence_length = len(sentence.split())
                if current_length + sentence_length > chunk_size:
                    if current_chunk:
                        chunks.append(" ".join(current_chunk))

                    # Start new chunk with overlap
                    overlap_tokens = (
                        current_chunk[-overlap:]
                        if overlap < len(current_chunk)
                        else current_chunk
                    )
                    current_chunk = overlap_tokens + [sentence]
                    current_length = len(current_chunk)
                else:
                    current_chunk.append(sentence)
                    current_length += sentence_length

            if current_chunk:
                chunks.append(" ".join(current_chunk))

            for i, chunk in enumerate(chunks):
                chunked_docs.append(
                    {
                        "title": title,
                        "chunk_id": i,
                        "text": chunk.strip(),
                    }
                )

        return chunked_docs


class QAGenerator:
    def __init__(self, model_name="Qwen/Qwen1.5-7B-Chat"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16, device_map="auto"
        )

    def generate_qa_pairs(self, chunks, num_questions_per_chunk=3):
        qa_pairs = []

        for chunk in chunks:
            # Create a prompt for the model to generate questions and answers
            prompt = f"""Given the following text from an AI research paper, generate {num_questions_per_chunk} relevant question-answer pairs.
The questions should be detailed and test deep understanding of the concepts.
Make sure the answer is comprehensive and accurate based on the text.

TEXT:
{chunk["text"]}

FORMAT:
Q1: [Question 1]
A1: [Answer 1]
Q2: [Question 2]
A2: [Answer 2]
Q3: [Question 3]
A3: [Answer 3]
"""
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

            outputs = self.model.generate(
                **inputs,
                max_new_tokens=1024,  # Generate up to 1024 new tokens
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.2,
                do_sample=True,
            )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract questions and answers
            pairs = self.extract_qa_pairs(response)

            for q, a in pairs:
                qa_pairs.append(
                    {
                        "title": chunk["title"],
                        "chunk_id": chunk["chunk_id"],
                        "context": chunk["text"],
                        "question": q,
                        "answer": a,
                    }
                )

        return qa_pairs

    def extract_qa_pairs(self, text):
        # Extract Q/A pairs using regex
        pattern = r"Q\d+:\s*(.*?)\s*\nA\d+:\s*(.*?)(?=\nQ\d+:|$)"
        matches = re.findall(pattern, text, re.DOTALL)

        # Clean up the extracted pairs
        pairs = []
        for question, answer in matches:
            question = question.strip()
            answer = answer.strip()
            if question and answer:  # Ensure both question and answer are non-empty
                pairs.append((question, answer))

        return pairs


def create_synthetic_data(documents_dir="./dataset/q3_dataset", output_dir="./data"):
    # Process documents
    processor = DocumentProcessor(documents_dir)
    chunks = processor.chunk_documents()

    # Generate QA pairs
    generator = QAGenerator()
    qa_pairs = generator.generate_qa_pairs(chunks)

    # Create dataset
    dataset = Dataset.from_list(qa_pairs)

    # Create train/validation/test splits
    splits = dataset.train_test_split(test_size=0.2, seed=42)
    train_valid = splits["train"]
    test = splits["test"]

    # Further split train into train and validation
    splits = train_valid.train_test_split(
        test_size=0.25, seed=42
    )  # 0.25 * 0.8 = 0.2 of original data
    train = splits["train"]
    validation = splits["test"]

    # Save datasets
    os.makedirs(output_dir, exist_ok=True)
    train.to_json(os.path.join(output_dir, "train.json"))
    validation.to_json(os.path.join(output_dir, "validation.json"))
    test.to_json(os.path.join(output_dir, "test.json"))

    print(
        f"Dataset created with {len(train)} training, {len(validation)} validation, and {len(test)} test examples."
    )
    return train, validation, test

In [8]:
class QAFineTuner:
    def __init__(self, model_name, data_dir, output_dir):
        self.model_name = model_name
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.tokenizer = None
        self.model = None
        self.train_dataset = None
        self.validation_dataset = None

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

    def load_data(self):
        """Load and preprocess the datasets."""
        train_path = os.path.join(self.data_dir, "train.json")
        validation_path = os.path.join(self.data_dir, "validation.json")

        self.train_dataset = load_dataset("json", data_files=train_path)["train"]
        self.validation_dataset = load_dataset("json", data_files=validation_path)[
            "train"
        ]

        print(
            f"Loaded {len(self.train_dataset)} training examples and {len(self.validation_dataset)} validation examples."
        )

    def prepare_model(self):
        """Load and prepare the model with LoRA."""
        # Clear memory before model loading
        gc.collect()
        torch.cuda.empty_cache()

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        try:
            from transformers import BitsAndBytesConfig
            import bitsandbytes

            print(f"Using bitsandbytes version: {bitsandbytes.__version__}")

            # Configure quantization for memory efficiency
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )

            # Load model with quantization
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                quantization_config=bnb_config,
                use_cache=False,  # Important for training
            )

            # Apply LoRA adapter
            self.model = get_peft_model(self.model, LORA_CONFIG)

            print("Successfully loaded model with 4-bit quantization and LoRA adapters")

        except (ImportError, ModuleNotFoundError) as e:
            print(f"Warning: Could not use quantization: {e}")
            print("Falling back to CPU loading with offloading")

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                device_map="cpu",
                low_cpu_mem_usage=True,
            )

            # Apply LoRA adapter
            self.model = get_peft_model(self.model, LORA_CONFIG)

            # Move to GPU selectively if possible
            try:
                self.model.to_bettertransformer()
            except:
                print("Could not convert to BetterTransformer")

        # Print trainable parameters info
        self.model.print_trainable_parameters()

    def format_instruction(self, example):
        """Format the input as an instruction."""
        context = example["context"]
        question = example["question"]
        answer = example["answer"]

        instruction = f"""### System:
You are an AI assistant that specializes in answering questions about AI research papers.
Your responses should be comprehensive, accurate, and based on the provided context.

### Human:
I have a question about an AI research paper.

Context: {context}

Question: {question}

### Assistant:
{answer}
"""
        return instruction

    def tokenize_function(self, examples):
        """Tokenize and format the examples."""
        instructions = []

        for i in range(len(examples["context"])):
            example = {
                "context": examples["context"][i],
                "question": examples["question"][i],
                "answer": examples["answer"][i],
            }
            instructions.append(self.format_instruction(example))

        tokenized = self.tokenizer(
            instructions,
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )

        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized

    def prepare_datasets(self):
        """Prepare tokenized datasets for training."""
        tokenize_batch_size = 8

        self.train_dataset = self.train_dataset.map(
            self.tokenize_function,
            batched=True,
            batch_size=tokenize_batch_size,
            remove_columns=self.train_dataset.column_names,
        )

        self.validation_dataset = self.validation_dataset.map(
            self.tokenize_function,
            batched=True,
            batch_size=tokenize_batch_size,
            remove_columns=self.validation_dataset.column_names,
        )

        print(f"Tokenized datasets: {self.train_dataset}, {self.validation_dataset}")

    def train(self):
        """Train the model."""
        # Clear CUDA cache
        gc.collect()
        torch.cuda.empty_cache()

        # Initialize wandb for tracking
        wandb.init(project="qwen-ai-research-qa", name="qwen-2.5-3b-qlora")

        # Make sure no DeepSpeed configurations are active
        for key in list(os.environ.keys()):
            if "DEEPSPEED" in key or "DS_" in key:
                del os.environ[key]

        # Configure training arguments with NO DeepSpeed
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=16,
            learning_rate=LEARNING_RATE,
            weight_decay=0.01,
            warmup_ratio=0.1,
            logging_dir=os.path.join(self.output_dir, "logs"),
            logging_steps=50,
            eval_steps=1000,
            save_steps=1000,
            evaluation_strategy="steps",
            save_strategy="steps",
            save_total_limit=2,
            load_best_model_at_end=True,
            report_to="wandb",
            # Switch to standard FP32 precision
            bf16=False,
            fp16=False,
            # DeepSpeed settings - force disable
            deepspeed=None,
            local_rank=-1,
            ddp_backend=None,  # Don't use any distributed backend
        )

        # Create trainer with standard optimizer
        from transformers import AdamW

        optimizer = AdamW(self.model.parameters(), lr=LEARNING_RATE)

        # Create data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Create trainer with explicit optimizer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.validation_dataset,
            data_collator=data_collator,
            optimizers=(optimizer, None),  # Use our optimizer, no scheduler
        )

        # Train the model
        trainer.train()

        # Save the final model
        self.model.save_pretrained(os.path.join(self.output_dir, "final"))
        self.tokenizer.save_pretrained(os.path.join(self.output_dir, "final"))

        print("Training complete!")

In [9]:
class ModelQuantizer:
    def __init__(
        self,
        model_path="./fine_tuned_model/final",
        base_model="Qwen/Qwen2.5-3B-Instruct",
        output_dir="./quantized_model",
    ):
        self.model_path = model_path
        self.base_model = base_model
        self.output_dir = output_dir
        self.quantized_model_path = os.path.join(output_dir, "model.gguf")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

    def load_and_merge_model(self):
        """Load the LoRA model and merge with the base model."""
        print("Loading base model...")

        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # Load LoRA weights
        print("Loading and merging LoRA weights...")
        model = PeftModel.from_pretrained(base_model, self.model_path)

        # Merge LoRA weights with base model
        model = model.merge_and_unload()

        # Save merged model and tokenizer
        merged_model_path = os.path.join(self.output_dir, "merged")
        os.makedirs(merged_model_path, exist_ok=True)

        print(f"Saving merged model to {merged_model_path}...")
        model.save_pretrained(merged_model_path)

        # Save tokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.base_model)
        tokenizer.save_pretrained(merged_model_path)

        print("Model and tokenizer saved successfully.")
        return merged_model_path

    def convert_to_gguf(self, merged_model_path):
        """Convert the merged model to GGUF format with 4-bit quantization."""
        print("Converting to GGUF format with 4-bit quantization...")

        # Check for existing GGUF model
        if os.path.exists(self.quantized_model_path):
            print(f"GGUF model already exists at {self.quantized_model_path}")
            user_input = input("Do you want to rebuild it? (y/n): ").lower()
            if user_input != "y":
                print("Using existing GGUF model.")
                return self.quantized_model_path

        # Clone llama.cpp repository if needed
        if not os.path.exists("llama.cpp"):
            try:
                print("Cloning llama.cpp repository...")
                subprocess.run(
                    ["git", "clone", "https://github.com/ggerganov/llama.cpp.git"],
                    check=True,
                )
            except subprocess.CalledProcessError:
                print("Error cloning llama.cpp repository.")
                raise RuntimeError("Failed to clone llama.cpp repository")

        # Build llama.cpp with better error handling
        try:
            print("Building llama.cpp with CMake (this may take a few minutes)...")
            os.makedirs("llama.cpp/build", exist_ok=True)

            # Configure with CMake
            subprocess.run(
                ["cmake", "-S", "llama.cpp", "-B", "llama.cpp/build"], check=True
            )

            # Build with CMake
            subprocess.run(
                ["cmake", "--build", "llama.cpp/build", "--parallel"], check=True
            )

            print("llama.cpp built successfully with CMake")

            # Use convert_hf_to_gguf.py with verbose output to see what's happening
            convert_script = "llama.cpp/convert_hf_to_gguf.py"

            if not os.path.exists(convert_script):
                print(f"ERROR: {convert_script} not found!")
                print("Please verify your llama.cpp installation.")
                raise RuntimeError(f"Conversion script not found: {convert_script}")

            print("\nRunning conversion script with enhanced debugging...")

            # Try conversion with detailed error output
            try:
                result = subprocess.run(
                    [
                        "python3",
                        convert_script,
                        merged_model_path,
                        "--outfile",
                        self.quantized_model_path,
                        "--outtype",
                        "q4_0",
                        "--verbose",  # Add verbose output
                    ],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                print(result.stdout)

            except subprocess.CalledProcessError as e:
                print("\n===== Conversion Error Details =====")
                print(f"Exit code: {e.returncode}")
                print(f"STDOUT: {e.stdout}")
                print(f"STDERR: {e.stderr}")
                print("===================================\n")

                print(
                    "Trying alternate conversion approach with arch-specific parameters..."
                )
                try:
                    # Try with explicit model architecture parameters
                    result = subprocess.run(
                        [
                            "python3",
                            convert_script,
                            merged_model_path,
                            "--outfile",
                            self.quantized_model_path,
                            "--outtype",
                            "q4_0",
                            "--model-type",
                            "llama",  # Try forcing llama architecture
                            "--ctx",
                            "4096",
                        ],
                        check=True,
                        capture_output=True,
                        text=True,
                    )
                    print(result.stdout)

                except subprocess.CalledProcessError as e2:
                    print("Alternate approach also failed")
                    print(f"STDOUT: {e2.stdout}")
                    print(f"STDERR: {e2.stderr}")
                    raise RuntimeError("All conversion methods failed")

        except Exception as e:
            print(f"Error during build or conversion process: {e}")
            raise RuntimeError("Failed to convert model to GGUF format")

        print(
            f"Model successfully converted to GGUF format: {self.quantized_model_path}"
        )

        # Copy tokenizer files to output directory
        tokenizer_files = ["tokenizer_config.json", "tokenizer.json"]
        for file in tokenizer_files:
            src_path = os.path.join(merged_model_path, file)
            if os.path.exists(src_path):
                dst_path = os.path.join(self.output_dir, file)
                shutil.copy2(src_path, dst_path)

        return self.quantized_model_path

    def quantize(self):
        """Perform the complete quantization process."""
        merged_model_path = self.load_and_merge_model()
        gguf_path = self.convert_to_gguf(merged_model_path)
        return gguf_path

In [10]:
class Evaluator:
    def __init__(self, model_path="./quantized_model/model.gguf", data_dir="./data"):
        self.data_dir = data_dir
        self.inference = ModelInference(model_path=model_path, use_rag=True)
        self.inference_no_rag = ModelInference(model_path=model_path, use_rag=False)

        # Download necessary NLTK data
        try:
            nltk.data.find("punkt")
        except LookupError:
            nltk.download("punkt")

        # Initialize ROUGE scorer
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )
        self.smooth = SmoothingFunction().method1

    def load_test_data(self):
        """Load the test dataset."""
        test_path = os.path.join(self.data_dir, "test.json")
        return load_dataset("json", data_files=test_path)["train"]

    def calculate_metrics(self, reference, candidate):
        """Calculate BLEU and ROUGE scores."""
        # ROUGE scores
        rouge_scores = self.rouge_scorer.score(reference, candidate)

        # BLEU score
        reference_tokens = nltk.word_tokenize(reference.lower())
        candidate_tokens = nltk.word_tokenize(candidate.lower())
        bleu_score = sentence_bleu(
            [reference_tokens], candidate_tokens, smoothing_function=self.smooth
        )

        return {
            "bleu": bleu_score,
            "rouge1": rouge_scores["rouge1"].fmeasure,
            "rouge2": rouge_scores["rouge2"].fmeasure,
            "rougeL": rouge_scores["rougeL"].fmeasure,
        }

    def evaluate(self, sample_size=None):
        """Evaluate the model on the test set."""
        test_data = self.load_test_data()

        # Limit evaluation to sample_size if specified
        if sample_size is not None:
            test_data = test_data.select(range(min(sample_size, len(test_data))))

        results_with_rag = []
        results_without_rag = []

        print(f"Evaluating on {len(test_data)} test examples...")

        for i, example in enumerate(test_data):
            print(f"Processing example {i + 1}/{len(test_data)}...")

            question = example["question"]
            reference_answer = example["answer"]

            # Generate answers with and without RAG
            answer_with_rag = self.inference.generate_answer(question)
            answer_without_rag = self.inference_no_rag.generate_answer(question)

            # Calculate metrics
            metrics_with_rag = self.calculate_metrics(reference_answer, answer_with_rag)
            metrics_without_rag = self.calculate_metrics(
                reference_answer, answer_without_rag
            )

            # Store results
            results_with_rag.append(
                {
                    "question": question,
                    "reference": reference_answer,
                    "prediction": answer_with_rag,
                    **metrics_with_rag,
                }
            )

            results_without_rag.append(
                {
                    "question": question,
                    "reference": reference_answer,
                    "prediction": answer_without_rag,
                    **metrics_without_rag,
                }
            )

        # Calculate average metrics
        avg_metrics_with_rag = {
            "bleu": sum(r["bleu"] for r in results_with_rag) / len(results_with_rag),
            "rouge1": sum(r["rouge1"] for r in results_with_rag)
            / len(results_with_rag),
            "rouge2": sum(r["rouge2"] for r in results_with_rag)
            / len(results_with_rag),
            "rougeL": sum(r["rougeL"] for r in results_with_rag)
            / len(results_with_rag),
        }

        avg_metrics_without_rag = {
            "bleu": sum(r["bleu"] for r in results_without_rag)
            / len(results_without_rag),
            "rouge1": sum(r["rouge1"] for r in results_without_rag)
            / len(results_without_rag),
            "rouge2": sum(r["rouge2"] for r in results_without_rag)
            / len(results_without_rag),
            "rougeL": sum(r["rougeL"] for r in results_without_rag)
            / len(results_without_rag),
        }

        print("\nEvaluation Results:")
        print("\nWith RAG:")
        for metric, value in avg_metrics_with_rag.items():
            print(f"{metric}: {value:.4f}")

        print("\nWithout RAG:")
        for metric, value in avg_metrics_without_rag.items():
            print(f"{metric}: {value:.4f}")

        return {
            "with_rag": {
                "detailed_results": results_with_rag,
                "average_metrics": avg_metrics_with_rag,
            },
            "without_rag": {
                "detailed_results": results_without_rag,
                "average_metrics": avg_metrics_without_rag,
            },
        }

In [11]:
class EmbeddingModel:
    def __init__(self, model_name="BAAI/bge-small-en-v1.5"):
        self.model_name = model_name
        # Use CPU for embeddings to save GPU memory
        self.device = torch.device("cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        # Get embedding dimension from the model
        self.embedding_dim = self.model.config.hidden_size

    def get_embedding_dim(self):
        """Return the embedding dimension of the model."""
        return self.embedding_dim

    def get_embeddings(self, texts: List[str], batch_size=16) -> np.ndarray:
        """Generate embeddings for a list of texts."""
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]

            # Tokenize
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(self.device)

            # Generate embeddings
            with torch.no_grad():
                outputs = self.model(**inputs)
                batch_embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()

            embeddings.append(batch_embeddings)

        return np.vstack(embeddings)


class VectorStore:
    def __init__(self, embedding_dim=768):
        self.index = faiss.IndexFlatL2(
            embedding_dim
        )  # L2 distance for similarity search
        self.texts = []

    def add_texts(self, texts: List[str], embeddings: np.ndarray):
        """Add texts and their embeddings to the vector store."""
        # Add embeddings to index
        self.index.add(embeddings)
        # Store original texts
        self.texts.extend(texts)

    def search(self, query_embedding: np.ndarray, k: int = 5) -> List[Dict[str, Any]]:
        """Search for most similar texts given a query embedding."""
        # Reshape query embedding
        query_embedding = query_embedding.reshape(1, -1)

        # Search in the index
        distances, indices = self.index.search(query_embedding, k)

        # Build results
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.texts) and idx >= 0:
                results.append(
                    {
                        "text": self.texts[idx],
                        "score": float(distances[0][i]),
                        "id": int(idx),
                    }
                )

        return results


class RAGSystem:
    def __init__(self, data_dir="./data"):
        self.embedding_model = EmbeddingModel()
        # Use the actual embedding dimension from the model
        self.vector_store = VectorStore(
            embedding_dim=self.embedding_model.get_embedding_dim()
        )
        self.data_dir = data_dir

    def build_index(self, force_rebuild=False):
        """Build the vector index from the dataset chunks."""
        index_file = os.path.join(self.data_dir, "vector_index.faiss")
        texts_file = os.path.join(self.data_dir, "vector_texts.npy")

        # Load from disk if exists and not forced to rebuild
        if (
            os.path.exists(index_file)
            and os.path.exists(texts_file)
            and not force_rebuild
        ):
            self.vector_store.index = faiss.read_index(index_file)
            self.vector_store.texts = np.load(texts_file, allow_pickle=True).tolist()
            print(
                f"Loaded existing index with {len(self.vector_store.texts)} documents."
            )
            return

        # Load datasets
        print("Building vector index...")

        # Load train, validation, test datasets
        train_path = os.path.join(self.data_dir, "train.json")
        validation_path = os.path.join(self.data_dir, "validation.json")
        test_path = os.path.join(self.data_dir, "test.json")

        train_data = load_dataset("json", data_files=train_path)["train"]
        validation_data = load_dataset("json", data_files=validation_path)["train"]
        test_data = load_dataset("json", data_files=test_path)["train"]

        # Combine all contexts
        all_contexts = []
        seen_contexts = set()

        # Helper to add unique contexts
        def add_unique_contexts(dataset):
            for item in dataset:
                context = item["context"]
                if context not in seen_contexts:
                    all_contexts.append(context)
                    seen_contexts.add(context)

        add_unique_contexts(train_data)
        add_unique_contexts(validation_data)
        add_unique_contexts(test_data)

        print(f"Found {len(all_contexts)} unique contexts.")

        # Generate embeddings
        embeddings = self.embedding_model.get_embeddings(all_contexts)

        # Add to vector store
        self.vector_store.add_texts(all_contexts, embeddings)

        # Save to disk
        faiss.write_index(self.vector_store.index, index_file)
        np.save(texts_file, np.array(self.vector_store.texts, dtype=object))

        print(f"Built and saved index with {len(all_contexts)} documents.")

    def retrieve(self, query: str, k: int = 3) -> List[str]:
        """Retrieve relevant contexts for a query."""
        # Generate query embedding
        query_embedding = self.embedding_model.get_embeddings([query])[0]

        # Search in vector store
        results = self.vector_store.search(query_embedding, k=k)

        # Return contexts
        return [item["text"] for item in results]

In [12]:
class ModelInference:
    def __init__(
        self,
        model_path: str = "./quantized_model/model.gguf",
        use_rag: bool = True,
        context_length: int = 4096,
        num_retrieved_docs: int = 3,
    ):
        self.model_path = model_path
        self.use_rag = use_rag
        self.num_retrieved_docs = num_retrieved_docs

        # Initialize Llama model
        self.llm = Llama(
            model_path=model_path,
            n_ctx=context_length,
            n_batch=512,
            n_gpu_layers=-1,  # Use all layers on GPU if available
        )

        # Initialize RAG system if needed
        if use_rag:
            self.rag = RAGSystem()
            self.rag.build_index()

    def retrieve_context(self, query: str) -> str:
        """Retrieve relevant context using RAG with token count limiting."""
        if not self.use_rag:
            return ""

        contexts = self.rag.retrieve(query, k=self.num_retrieved_docs)

        # Calculate token budgets
        system_prompt = "You are an AI assistant that specializes in answering questions about AI research papers."
        query_prompt = f"Question: {query}"
        combined_prompt = system_prompt + query_prompt

        # Fix: Use the more reliable approach with llama_cpp
        # Reserve tokens for the system prompt, query, and generated response
        try:
            # Use the proper encoding with llama_cpp
            reserved_tokens = (
                len(self.llm.tokenize(bytes(combined_prompt, "utf-8"))) + 1024
            )
        except TypeError:
            # Fallback method if bytes conversion doesn't work
            reserved_tokens = len(combined_prompt.split()) * 2 + 1024  # Approximate

        max_context_tokens = self.llm.n_ctx() - reserved_tokens

        # Start with all contexts and trim as needed
        selected_contexts = []
        current_tokens = 0

        for context in contexts:
            try:
                context_tokens = len(self.llm.tokenize(bytes(context, "utf-8")))
            except TypeError:
                # Fallback approximation
                context_tokens = len(context.split()) * 2

            if current_tokens + context_tokens <= max_context_tokens:
                selected_contexts.append(context)
                current_tokens += context_tokens
            else:
                # Try to add a truncated version if it's the first context
                if len(selected_contexts) == 0:
                    # Estimate truncation point (rough approximation)
                    max_chars = int(max_context_tokens / context_tokens * len(context))
                    truncated = context[:max_chars]
                    selected_contexts.append(truncated)
                break

        return "\n\n".join(selected_contexts)

    def format_prompt(self, query: str, context: Optional[str] = None) -> str:
        """Format the prompt for the model."""
        system_message = "You are an AI assistant that specializes in answering questions about AI research papers. Provide comprehensive, accurate responses based on the information available to you."

        if context:
            prompt = f"""### System:
{system_message}

### Human:
I have a question about an AI research paper.

Here is some relevant context:
{context}

Question: {query}

### Assistant:
"""
        else:
            prompt = f"""### System:
{system_message}

### Human:
Question about AI research: {query}

### Assistant:
"""
        return prompt

    def generate_answer(self, query: str) -> str:
        """Generate an answer for a query."""
        # Retrieve context if using RAG
        context = self.retrieve_context(query) if self.use_rag else None

        # Format prompt
        prompt = self.format_prompt(query, context)

        # Generate response
        start_time = time.time()
        response = self.llm(
            prompt,
            max_tokens=1024,
            stop=["### Human:", "### System:"],
            temperature=0.7,
            top_p=0.95,
        )
        end_time = time.time()

        # Extract answer text
        answer = response["choices"][0]["text"].strip()

        # Log performance
        print(f"Generation time: {end_time - start_time:.2f} seconds")

        return answer

In [13]:
class ModelQuantizer:
    def __init__(
        self,
        model_path="./fine_tuned_model/final",
        base_model="Qwen/Qwen2.5-3B-Instruct",
        output_dir="./quantized_model",
    ):
        self.model_path = model_path
        self.base_model = base_model
        self.output_dir = output_dir
        self.quantized_model_path = os.path.join(output_dir, "model.gguf")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

    def load_and_merge_model(self):
        """Load the LoRA model and merge with the base model."""
        print("Loading base model...")

        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # Load LoRA weights
        print("Loading and merging LoRA weights...")
        model = PeftModel.from_pretrained(base_model, self.model_path)

        # Merge LoRA weights with base model
        model = model.merge_and_unload()

        # Save merged model and tokenizer
        merged_model_path = os.path.join(self.output_dir, "merged")
        os.makedirs(merged_model_path, exist_ok=True)

        print(f"Saving merged model to {merged_model_path}...")
        model.save_pretrained(merged_model_path)

        # Save tokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.base_model)
        tokenizer.save_pretrained(merged_model_path)

        print("Model and tokenizer saved successfully.")
        return merged_model_path

    def convert_to_gguf(self, merged_model_path):
        """Convert the merged model to GGUF format with quantization."""
        print("Converting to GGUF format with quantization...")

        # Check for existing GGUF model
        if os.path.exists(self.quantized_model_path):
            print(f"GGUF model already exists at {self.quantized_model_path}")
            user_input = input("Do you want to rebuild it? (y/n): ").lower()
            if user_input != "y":
                print("Using existing GGUF model.")
                return self.quantized_model_path

        # Clone llama.cpp repository if needed
        if not os.path.exists("llama.cpp"):
            try:
                print("Cloning llama.cpp repository...")
                subprocess.run(
                    ["git", "clone", "https://github.com/ggerganov/llama.cpp.git"],
                    check=True,
                )
            except subprocess.CalledProcessError:
                print("Error cloning llama.cpp repository.")
                raise RuntimeError("Failed to clone llama.cpp repository")

        # Build llama.cpp with better error handling
        try:
            print("Building llama.cpp with CMake (this may take a few minutes)...")
            os.makedirs("llama.cpp/build", exist_ok=True)

            # Configure with CMake
            subprocess.run(
                ["cmake", "-S", "llama.cpp", "-B", "llama.cpp/build"], check=True
            )

            # Build with CMake
            subprocess.run(
                ["cmake", "--build", "llama.cpp/build", "--parallel"], check=True
            )

            print("llama.cpp built successfully with CMake")

            # Use convert_hf_to_gguf.py with verbose output to see what's happening
            convert_script = "llama.cpp/convert_hf_to_gguf.py"

            if not os.path.exists(convert_script):
                print(f"ERROR: {convert_script} not found!")
                print("Please verify your llama.cpp installation.")
                raise RuntimeError(f"Conversion script not found: {convert_script}")

            print("\nRunning conversion script with enhanced debugging...")

            # Try conversion with detailed error output - using q8_0 instead of q4_0
            try:
                result = subprocess.run(
                    [
                        "python3",
                        convert_script,
                        merged_model_path,
                        "--outfile",
                        self.quantized_model_path,
                        "--outtype",
                        "q8_0",  # Changed from q4_0 to q8_0
                        "--verbose",
                    ],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                print(result.stdout)

            except subprocess.CalledProcessError as e:
                print("\n===== Conversion Error Details =====")
                print(f"Exit code: {e.returncode}")
                print(f"STDOUT: {e.stdout}")
                print(f"STDERR: {e.stderr}")
                print("===================================\n")

                print(
                    "Trying alternate conversion approach with arch-specific parameters..."
                )
                try:
                    # Try with explicit model architecture parameters - using q8_0
                    result = subprocess.run(
                        [
                            "python3",
                            convert_script,
                            merged_model_path,
                            "--outfile",
                            self.quantized_model_path,
                            "--outtype",
                            "q8_0",  # Changed from q4_0 to q8_0
                            "--model-name",
                            "Qwen",  # Added model name hint
                        ],
                        check=True,
                        capture_output=True,
                        text=True,
                    )
                    print(result.stdout)

                except subprocess.CalledProcessError as e2:
                    print("Alternate approach also failed")
                    print(f"STDOUT: {e2.stdout}")
                    print(f"STDERR: {e2.stderr}")
                    raise RuntimeError("All conversion methods failed")

        except Exception as e:
            print(f"Error during build or conversion process: {e}")
            raise RuntimeError("Failed to convert model to GGUF format")

        print(
            f"Model successfully converted to GGUF format: {self.quantized_model_path}"
        )

        # Copy tokenizer files to output directory
        tokenizer_files = ["tokenizer_config.json", "tokenizer.json"]
        for file in tokenizer_files:
            src_path = os.path.join(merged_model_path, file)
            if os.path.exists(src_path):
                dst_path = os.path.join(self.output_dir, file)
                shutil.copy2(src_path, dst_path)

        return self.quantized_model_path

    def quantize(self):
        """Perform the complete quantization process."""
        merged_model_path = self.load_and_merge_model()
        gguf_path = self.convert_to_gguf(merged_model_path)
        return gguf_path

In [14]:
import os


def main():
    # Generate synthetic dataset with default directories
    print("Generating synthetic dataset...")
    create_synthetic_data(documents_dir="./dataset/q3_dataset", output_dir="./data")

    # Fine-tune the model using default parameters
    print("Fine-tuning Qwen/Qwen2.5-3B-Instruct...")
    fine_tuner = QAFineTuner("Qwen/Qwen2.5-3B-Instruct", "./data", "./fine_tuned_model")
    fine_tuner.load_data()
    fine_tuner.prepare_model()
    fine_tuner.prepare_datasets()
    fine_tuner.train()

    # Quantize the model to GGUF format with default settings
    print("Quantizing the model...")
    quantizer = ModelQuantizer(
        model_path="./fine_tuned_model/final",
        base_model="Qwen/Qwen2.5-3B-Instruct",
        output_dir="./quantized_model",
    )
    quantizer.quantize()

    # Build the RAG index with default directory
    print("Building RAG index...")
    rag = RAGSystem(data_dir="./data")
    rag.build_index(force_rebuild=True)

    # Evaluate the model using default settings (using all available samples)
    print("Evaluating the model...")
    evaluator = Evaluator(
        model_path=os.path.join("./quantized_model", "model.gguf"), data_dir="./data"
    )
    evaluator.evaluate(sample_size=None)

    # Run inference using default parameters and a sample query
    print("Running inference...")
    inference = ModelInference(
        model_path=os.path.join("./quantized_model", "model.gguf"), use_rag=True
    )
    default_query = "What is the latest research in AI?"
    answer = inference.generate_answer(default_query)
    print(f"\nQuery: {default_query}\n")
    print(f"Answer:\n{answer}")


if __name__ == "__main__":
    main()

Generating synthetic dataset...
Loaded 5 documents.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset created with 15 training, 6 validation, and 6 test examples.
Fine-tuning Qwen/Qwen2.5-3B-Instruct...


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loaded 15 training examples and 6 validation examples.
Using bitsandbytes version: 0.45.3


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Successfully loaded model with 4-bit quantization and LoRA adapters
trainable params: 59,867,136 || all params: 3,145,805,824 || trainable%: 1.9031


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Tokenized datasets: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15
}), Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6
})


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[2025-03-09 07:51:02,295] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss


Training complete!
Quantizing the model...
Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading and merging LoRA weights...
Saving merged model to ./quantized_model/merged...
Model and tokenizer saved successfully.
Converting to GGUF format with quantization...
GGUF model already exists at ./quantized_model/model.gguf
Do you want to rebuild it? (y/n): y
Building llama.cpp with CMake (this may take a few minutes)...
llama.cpp built successfully with CMake

Running conversion script with enhanced debugging...

Model successfully converted to GGUF format: ./quantized_model/model.gguf
Building RAG index...
Building vector index...


Generating train split: 0 examples [00:00, ? examples/s]

Found 9 unique contexts.


llama_model_loader: loaded meta data with 27 key-value pairs and 434 tensors from ./quantized_model/model.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                       general.organization str              = Qwen
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_loader: - kv   5:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   6:                         general.size_label str              = 3B
llama_model_loader: - kv   7:                          qwen2.bl

Built and saved index with 9 documents.
Evaluating the model...


init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151

Loaded existing index with 9 documents.


init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151

Evaluating on 6 test examples...
Processing example 1/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =   61332.47 ms /  2677 tokens (   22.91 ms per token,    43.65 tokens per second)
llama_perf_context_print:        eval time =   17735.66 ms /   178 runs   (   99.64 ms per token,    10.04 tokens per second)
llama_perf_context_print:       total time =   79371.80 ms /  2855 tokens


Generation time: 79.39 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =     827.16 ms /    47 tokens (   17.60 ms per token,    56.82 tokens per second)
llama_perf_context_print:        eval time =    3377.19 ms /    43 runs   (   78.54 ms per token,    12.73 tokens per second)
llama_perf_context_print:       total time =    4270.60 ms /    90 tokens
Llama.generate: 2676 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 4.28 seconds
Processing example 2/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   20539.93 ms /   205 runs   (  100.19 ms per token,     9.98 tokens per second)
llama_perf_context_print:       total time =   20896.29 ms /   206 tokens
Llama.generate: 46 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 20.91 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    2956.41 ms /    37 runs   (   79.90 ms per token,    12.52 tokens per second)
llama_perf_context_print:       total time =    3012.81 ms /    38 tokens
Llama.generate: 2676 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 3.02 seconds
Processing example 3/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   39643.82 ms /   396 runs   (  100.11 ms per token,     9.99 tokens per second)
llama_perf_context_print:       total time =   40399.83 ms /   397 tokens
Llama.generate: 46 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 40.41 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    3773.80 ms /    47 runs   (   80.29 ms per token,    12.45 tokens per second)
llama_perf_context_print:       total time =    3846.82 ms /    48 tokens
Llama.generate: 2672 prefix-match hit, remaining 5 prompt tokens to eval


Generation time: 3.85 seconds
Processing example 4/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =     186.60 ms /     5 tokens (   37.32 ms per token,    26.79 tokens per second)
llama_perf_context_print:        eval time =   12027.23 ms /   120 runs   (  100.23 ms per token,     9.98 tokens per second)
llama_perf_context_print:       total time =   12416.56 ms /   125 tokens
Llama.generate: 42 prefix-match hit, remaining 5 prompt tokens to eval


Generation time: 12.43 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =     412.93 ms /     5 tokens (   82.59 ms per token,    12.11 tokens per second)
llama_perf_context_print:        eval time =    4896.62 ms /    62 runs   (   78.98 ms per token,    12.66 tokens per second)
llama_perf_context_print:       total time =    5405.69 ms /    67 tokens
Llama.generate: 50 prefix-match hit, remaining 1916 prompt tokens to eval


Generation time: 5.41 seconds
Processing example 5/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =   42641.20 ms /  1916 tokens (   22.26 ms per token,    44.93 tokens per second)
llama_perf_context_print:        eval time =   38200.33 ms /   400 runs   (   95.50 ms per token,    10.47 tokens per second)
llama_perf_context_print:       total time =   81608.41 ms /  2316 tokens
Llama.generate: 42 prefix-match hit, remaining 5 prompt tokens to eval


Generation time: 81.62 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =     692.47 ms /     5 tokens (  138.49 ms per token,     7.22 tokens per second)
llama_perf_context_print:        eval time =    4726.69 ms /    57 runs   (   82.92 ms per token,    12.06 tokens per second)
llama_perf_context_print:       total time =    5508.80 ms /    62 tokens
Llama.generate: 1965 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 5.51 seconds
Processing example 6/6...


llama_perf_context_print:        load time =   61333.42 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   48315.80 ms /   502 runs   (   96.25 ms per token,    10.39 tokens per second)
llama_perf_context_print:       total time =   49332.63 ms /   503 tokens
Llama.generate: 46 prefix-match hit, remaining 1 prompt tokens to eval


Generation time: 49.34 seconds


llama_perf_context_print:        load time =     827.35 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    4636.32 ms /    58 runs   (   79.94 ms per token,    12.51 tokens per second)
llama_perf_context_print:       total time =    4725.47 ms /    59 tokens
llama_model_loader: loaded meta data with 27 key-value pairs and 434 tensors from ./quantized_model/model.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                       general.organizatio

Generation time: 4.73 seconds

Evaluation Results:

With RAG:
bleu: 0.0009
rouge1: 0.0209
rouge2: 0.0000
rougeL: 0.0177

Without RAG:
bleu: 0.0018
rouge1: 0.0210
rouge2: 0.0000
rougeL: 0.0210
Running inference...


init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151

Loaded existing index with 9 documents.


llama_perf_context_print:        load time =   62722.55 ms
llama_perf_context_print: prompt eval time =   62721.94 ms /  2742 tokens (   22.87 ms per token,    43.72 tokens per second)
llama_perf_context_print:        eval time =   95608.26 ms /   933 runs   (  102.47 ms per token,     9.76 tokens per second)
llama_perf_context_print:       total time =  160595.86 ms /  3675 tokens


Generation time: 160.61 seconds

Query: What is the latest research in AI?

Answer:
The context provided does not directly address the latest research in AI. However, I can provide a summary of the key advancements mentioned in the Deepseek V3 model and the accompanying research papers:

### Key Advancements in Deepseek V3:

1. **Mixture-of-Experts (MoE) Architecture**:
   - Employed a MoE architecture where only 37 billion parameters fire for each token out of the total 671 billion.
   - This sparse activation significantly reduces compute requirements compared to dense models.

2. **FP8 Mixed Precision Training**:
   - Implemented an FP8 mixed precision training framework.
   - Reduced memory usage and accelerated training compared to higher precision formats.
   - Achieved up to 50% reduction in memory footprint compared to traditional FP16/FP32 formats.
   - Used fine-grained quantization strategies and increased accumulation precision to maintain accuracy.

3. **Load Balancing Str

In [None]:
!zip -r /content/content.zip /content

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2025.03.06/ (stored 0%)
  adding: content/.config/logs/2025.03.06/14.28.23.979271.log (deflated 92%)
  adding: content/.config/logs/2025.03.06/14.28.44.811499.log (deflated 58%)
  adding: content/.config/logs/2025.03.06/14.29.03.284363.log (deflated 56%)
  adding: content/.config/logs/2025.03.06/14.28.53.350004.log (deflated 86%)
  adding: content/.config/logs/2025.03.06/14.29.02.658299.log (deflated 57%)
  adding: content/.config/logs/2025.03.06/14.28.54.467455.log (deflated 57%)
  adding: content/.config

In [None]:
from google.colab import files

files.download("/content/content.zip")