In [1]:
%%capture
# Skip restarting message in Colab
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

!pip install unsloth vllm
!pip install --upgrade pillow

In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-09 11:55:46 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.53%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

INFO 03-09 11:56:06 cuda.py:178] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-09 11:56:06 cuda.py:226] Using XFormers backend.
INFO 03-09 11:56:06 model_runner.py:1110] Starting to load model unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit...
INFO 03-09 11:56:07 loader.py:1089] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 03-09 11:56:07 weight_utils.py:254] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

INFO 03-09 11:56:24 weight_utils.py:270] Time spent downloading weights for unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit: 16.338825 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-09 11:56:30 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 03-09 11:56:31 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-09 11:56:42 worker.py:267] Memory profiling takes 10.52 seconds
INFO 03-09 11:56:42 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.50) = 7.30GiB
INFO 03-09 11:56:42 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.05GiB; the rest of the memory reserved for KV Cache is 4.01GiB.
INFO 03-09 11:56:42 executor_base.py:111] # cuda blocks: 7300, # CPU blocks: 3640
INFO 03-09 11:56:42 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 114.06x
INFO 03-09 11:56:44 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:47<00:00,  1.76s/it]

INFO 03-09 11:57:32 model_runner.py:1562] Graph capturing finished in 48 secs, took 0.62 GiB
INFO 03-09 11:57:32 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 60.87 seconds





tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.3.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

In [8]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
You are an helful AI assitance which assit the user in answering current generative ai trends and research details
### Input:
{}
### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs       = examples["question"]
    outputs      = examples["answer"]
    texts = []
    for input, output in zip(inputs, outputs):
        text = alpaca_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [9]:
# prompt: open the dataset as a csv file  using datasets module
from datasets import load_dataset
dataset = load_dataset("csv", data_files="/content/conversations.csv", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/2016 [00:00<?, ? examples/s]

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps = 20,
        max_steps = 120,
        learning_rate = 5e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/2016 [00:00<?, ? examples/s]

In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,016 | Num Epochs = 1 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 119,734,272/1,919,856,640 (6.24% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mravindu-21[0m ([33mravindu-21-university-of-moratuwa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,4.3158
2,4.2478
3,4.2274
4,4.1306
5,3.8499
6,3.5958
7,3.3761
8,3.1046
9,2.8468
10,2.8005


In [14]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "What are the two reasoning models introduced by DeepSeek-AI, and how do their training approaches differ?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:18<00:00, 18.68s/it, est. speed input: 2.68 toks/s, output: 28.91 toks/s]


"DeepSeek-AI, a research project by Alibaba Cloud, introduces two reasoning models: the **Differentiable Reasoning Network (DRN)** and the **Continuous Reasoning Network (CRN)**. Both models are designed to learn from and make inferences in continuous space, which is particularly useful in the fields of robotics, autonomous vehicles, and other applications where understanding the spatial and temporal context is crucial.\n\n### Differentiable Reasoning Network (DRN)\n\n**DRN** is a model that learns from differentiable data, meaning the model can be trained by differentiating the loss function with respect to its parameters. This makes it particularly well-suited for training with gradient-based optimization methods, such as backpropagation.\n\n### Continuous Reasoning Network (CRN)\n\n**CRN**, on the other hand, is designed to handle continuous reasoning tasks. It is trained in a way that allows it to learn from a continuous space, which is different from the discrete grid-based method

In [15]:
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.77 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:02<00:00, 12.88it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {2048

In [None]:
# prompt: implement a rag using this trained model using an in memory database as well use sentence transformers to calculate the embeddings if needed

import torch
from sentence_transformers import SentenceTransformer, util
from unsloth import FastLanguageModel
from vllm import SamplingParams
import faiss

# Load the fine-tuned model
model_path = "model"  # Replace with the actual path to your saved model
model, tokenizer = FastLanguageModel.from_pretrained(model_path, load_in_4bit=True)

# Load sentence transformer model
sentence_model = SentenceTransformer('all-mpnet-base-v2')

# In-memory database (replace with a persistent database for production)
class InMemoryDB:
    def __init__(self):
        self.documents = []
        self.embeddings = []
        self.index = None

    def add_document(self, text):
        embedding = sentence_model.encode(text)
        self.documents.append(text)
        self.embeddings.append(embedding)

    def build_index(self):
        self.embeddings = torch.tensor(self.embeddings, dtype=torch.float32).cpu()
        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])  # Inner product index
        self.index.add(self.embeddings.numpy())

    def search(self, query, top_k=2):
        query_embedding = sentence_model.encode(query)
        query_embedding = torch.tensor(query_embedding).unsqueeze(0).cpu()
        _, indices = self.index.search(query_embedding.numpy(), top_k)
        return [self.documents[i] for i in indices[0]]


# Initialize the database
db = InMemoryDB()

# Example documents (replace with your actual data)
example_docs = [
    "DeepSeek-AI introduced two reasoning models: ReasoningChain and GraphReasoner.",
    "ReasoningChain uses a chain-of-thought prompting approach.",
    "GraphReasoner employs a graph-based knowledge representation and reasoning mechanism.",
    "The training approaches differ in their underlying data structures and reasoning techniques."
]

for doc in example_docs:
    db.add_document(doc)

db.build_index()


def rag_pipeline(query):
    # 1. Retrieval
    retrieved_documents = db.search(query)

    # 2. Context Aggregation
    context = "\n".join(retrieved_documents)

    # 3. Generation
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Answer the user's question using the provided context.
### Input:
Question: {query}
Context: {context}
### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
    output = model.generate(**inputs, sampling_params=sampling_params)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example Usage
user_query = "How do the training methods for DeepSeek-AI's reasoning models differ?"
response = rag_pipeline(user_query)
response


In [17]:
!mv /content/outputs /content/drive/MyDrive/intellihack


In [None]:
# prompt: get the size of the folder "/content/outputs"

import os
def get_folder_size(folder_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size

size_in_bytes = get_folder_size("/content/outputs")
print(f"The size of the folder /content/outputs is: {size_in_bytes} bytes")
