<a href="https://colab.research.google.com/github/Smol-Kaiju/SmolGhidorah/blob/main/smolGhidorah_PsuedoMoE_4Bit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
# NOTE: lines starting with `!` are notebook shell escapes, not Python; this
# cell only runs inside Jupyter/Colab.
!pip install transformers sentence-transformers accelerate optimum
# bitsandbytes is upgraded separately, then the installed version is listed
# as a quick sanity check.
!pip install -U bitsandbytes
!pip list | grep bitsandbytes

# Mount Google Drive at /content/drive; the model save paths used by the
# quantize-and-save cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Quantize and save

In [None]:
# Mount Google Drive
# (The install cell mounts it as well; presumably repeated here so that this
# cell can be run on its own.)
from google.colab import drive
drive.mount('/content/drive')

# Import
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from sentence_transformers import SentenceTransformer
import os

# Hugging Face repo ids of the experts to quantize. Each model is stored on
# Drive under the last path component of its repo id, so the folder name is
# derived from the id instead of being spelled out twice.
models = {
    repo_id.rsplit("/", 1)[-1]: repo_id
    for repo_id in (
        "Qwen/Qwen2.5-3B-Instruct",
        "cutelemonlili/Qwen2.5-1.5B-Instruct_MATH_training_response_Qwen2.5_1.5B_only_right",
        "prithivMLmods/QwQ-LCoT-3B-Instruct",
    )
}

# Root folder on the mounted Drive; every model gets its own sub-folder here
base_drive_path = '/content/drive/MyDrive/models/'

# Download, Quantize, and Save Models
# For each entry in `models`: create the target folder on Drive, load the
# tokenizer and the quantized model from the Hub, and write both to that folder.
for model_name, model_id in models.items():
    print(f"Downloading and saving {model_name}...")
    # Create the directory for the model
    model_path = os.path.join(base_drive_path, model_name)
    os.makedirs(model_path, exist_ok=True)

    # Define the quantization configuration.
    # 8-bit loading is the active mode; the bnb_4bit_* options only matter if
    # load_in_4bit is switched on instead.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=False,  # Set to True if you want 4-bit quantization
        load_in_8bit=True,   # Set to True if you want 8-bit quantization
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Download the tokenizer and model with quantization
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    # Save the tokenizer and quantized model to the specified directory
    tokenizer.save_pretrained(model_path)
    model.save_pretrained(model_path)
    print(f"{model_name} saved to {model_path}")

    # Drop the references before the next iteration. Otherwise the previous
    # model stays alive until `model` is rebound, i.e. while the next one is
    # already being loaded, which doubles peak memory use.
    del model, tokenizer
    torch.cuda.empty_cache()

# Save all-MiniLM-L6-v2 Model
# This model is loaded through SentenceTransformer and saved as-is; no
# quantization config is applied to it.
mini_lm_model_name = "all-MiniLM-L6-v2"
mini_lm_model_path = os.path.join(base_drive_path, mini_lm_model_name)
os.makedirs(mini_lm_model_path, exist_ok=True)

# Download the all-MiniLM-L6-v2 model
mini_lm_model = SentenceTransformer(mini_lm_model_name)

# Save the model to the specified directory
mini_lm_model.save(mini_lm_model_path)
print(f"{mini_lm_model_name} saved to {mini_lm_model_path}")

# Final status line for the whole cell (the quantized experts and this model).
print("All models have been downloaded, quantized (where applicable), and saved to Google Drive.")

Run The Models (Local)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, util

# Configuration
# NOTE: the experts are referenced by Hugging Face repo id here, not by the
# Drive folders written by the quantize-and-save cell.
# Expert used when the keyword router finds no reasoning keyword in the query.
general_model_path = "Qwen/Qwen2.5-3B-Instruct"
# Expert chosen by the embedding sub-router for queries closest to the math examples.
math_reasoning_1_model_path = "cutelemonlili/Qwen2.5-1.5B-Instruct_MATH_training_response_Qwen2.5_1.5B_only_right"
# Expert chosen by the embedding sub-router for all other reasoning queries.
reasoning_2_model_path = "prithivMLmods/QwQ-LCoT-3B-Instruct"
# Sentence-embedding model that backs the sub-router.
embedding_model_name = "all-MiniLM-L6-v2"

# Keyword-Based Router
import re

reasoning_keywords = ["think", "explain", "why", "how", "infer", "deduce", "analyze", "solve", "reason", "logic", "therefore", "because", "consequence"]

def is_reasoning_query(query):
    """Return True if the query contains a word that starts with a reasoning keyword.

    Matching is case-insensitive. A keyword must begin at a word boundary, so
    inflected forms such as "thinking" or "explained" still match, while
    unrelated words that merely contain a keyword ("show" -> "how",
    "biological" -> "logic") no longer send the query to the reasoning experts.

    Args:
        query: The user's query text.

    Returns:
        bool: True when any entry of `reasoning_keywords` starts a word in the query.
    """
    query_lower = query.lower()
    # The list is read on every call so later edits to `reasoning_keywords`
    # take effect; `re` caches the compiled patterns internally.
    return any(
        re.search(r"\b" + re.escape(keyword), query_lower) is not None
        for keyword in reasoning_keywords
    )

# Embedding-Based Sub-Router
# Example reasoning queries
# Each list below is embedded and averaged into one reference vector; a
# reasoning query is routed to the expert whose reference vector it is most
# similar to. These must stay `list` objects: generate_embedding() treats a
# list as a batch and anything else as a single sentence.

# Examples that stand for the math expert (math_reasoning_1_model_path).
math_reasoning_queries_1 = [
    "Solve the equation 2x + 5 = 15.",
    "What is the derivative of x^2?",
    "Calculate the area of a circle with radius 5.",
    "Find the sum of 25 and 37.",
    "What is the probability of rolling a 6 on a fair die?",
    "Simplify the expression (x + 2)(x - 3).",
    "What is the square root of 144?",
    "Find the value of sin(30 degrees).",
    "If f(x) = 3x^2 + 2x - 5, what is f(2)?",
    "What is the slope of the line y = 2x + 3?"
]

# Examples that stand for the second reasoning expert (reasoning_2_model_path).
reasoning_queries_2 = [
    "How can we solve this problem using a logical approach?",
    "Analyze the different factors contributing to this outcome and explain their causal relationships.",
    "What is the reasoning for the observed phenomenon based on the provided evidence?",
    "Think step by step and solve this problem",
    "Think step by step and reach a conclusion",
    "Break down this task into easy steps"
]

def generate_embedding(text, model):
    """Encode one sentence or a list of sentences with `model`.

    Args:
        text: A list of sentences, or a single value that is wrapped into a
            one-element batch.
        model: Object exposing `encode(sentences, convert_to_tensor=...)`.

    Returns:
        Whatever `model.encode` returns for the batch with
        `convert_to_tensor=True`.
    """
    if isinstance(text, list):
        batch = text
    else:
        # A lone sentence still goes through the batch interface.
        batch = [text]
    return model.encode(batch, convert_to_tensor=True)

def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of the two embeddings.

    Thin wrapper around `util.pytorch_cos_sim`; the result is returned
    unchanged.
    """
    similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity

# Embedding model shared by the sub-router (loaded once at cell start)
embedding_model = SentenceTransformer(embedding_model_name)

# One reference vector per expert: embed that expert's example queries and
# average over the example axis (dim 0), keeping a leading axis of size 1.
avg_embedding_reasoning_1, avg_embedding_reasoning_2 = (
    generate_embedding(example_queries, embedding_model).mean(dim=0, keepdim=True)
    for example_queries in (math_reasoning_queries_1, reasoning_queries_2)
)

def route_to_reasoning_expert(query):
    """Pick the reasoning expert whose reference embedding is closest to `query`.

    The query is embedded with the shared embedding model and compared against
    the two averaged example embeddings.

    Returns:
        `math_reasoning_1_model_path` when the math reference scores strictly
        higher, otherwise `reasoning_2_model_path` (ties included).
    """
    query_vector = generate_embedding(query, embedding_model)
    math_score = calculate_similarity(query_vector, avg_embedding_reasoning_1)
    other_score = calculate_similarity(query_vector, avg_embedding_reasoning_2)

    return math_reasoning_1_model_path if math_score > other_score else reasoning_2_model_path

# Model Loading/Unloading (with 4-bit Quantization)
def load_expert(model_path):
    """Load a causal language model and its tokenizer with 4-bit quantization.

    Args:
        model_path: Model identifier passed to `from_pretrained` (the callers
            in this file pass Hugging Face repo ids).

    Returns:
        tuple: `(model, tokenizer)`.
    """
    # Device placement is left entirely to device_map="auto" below, so no
    # explicit device is selected here.

    # Configuration for 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # Load the model with quantization.
    # NOTE(review): trust_remote_code=True allows the model repository to run
    # its own Python code at load time. Kept for compatibility; confirm it is
    # actually required for these checkpoints and drop it if not.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    return model, tokenizer

def unload_expert(model, tokenizer):
    """Release an expert and hand cached GPU memory back to the driver.

    `del` only removes this function's own references. The caller must drop
    its references to `model` and `tokenizer` as well; while any are alive the
    weights cannot be freed.

    Args:
        model: The model returned by `load_expert`.
        tokenizer: The tokenizer returned by `load_expert`.
    """
    import gc

    del model
    del tokenizer
    # Collect reference cycles first: memory still held by unreachable-but-
    # uncollected objects cannot be returned by empty_cache().
    gc.collect()
    torch.cuda.empty_cache()

# Main Router
def route_query(query):
    """Answer `query` with the expert selected by the two-stage router.

    Queries containing a reasoning keyword go to the embedding sub-router,
    everything else goes to the general model. The chosen expert is loaded,
    used for a single sampled generation, and unloaded again, including when
    tokenisation or generation raises.

    Args:
        query: The user's query text; it is passed to the tokenizer as-is.

    Returns:
        str: The decoded first output sequence with special tokens removed.
    """
    if is_reasoning_query(query):
        model_path = route_to_reasoning_expert(query)
    else:
        model_path = general_model_path

    # Load expert
    expert, tokenizer = load_expert(model_path)
    try:
        # Generate response
        inputs = tokenizer(query, return_tensors="pt").to(expert.device)
        with torch.no_grad():
            outputs = expert.generate(**inputs, max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
        # NOTE(review): outputs[0] is decoded in full; for decoder-only models
        # the returned sequence presumably begins with the prompt tokens, so
        # the response likely echoes the query. Confirm whether that is wanted.
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    finally:
        # Unload expert. This frame's own references must go too, otherwise
        # the weights are still reachable when the CUDA cache is emptied and
        # nothing is actually released.
        inputs = outputs = None
        unload_expert(expert, tokenizer)
        del expert, tokenizer
        torch.cuda.empty_cache()

# UI
# Simple read-eval-print loop: read a query, route it, print the answer.
if __name__ == "__main__":
    while True:
        try:
            query = input("Enter your query (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop instead of showing a traceback.
            print()
            break
        if not query:
            # Nothing to send to a model; ask again.
            continue
        if query.lower() == "quit":
            break
        response = route_query(query)
        print("Response:", response)