In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Load tokenizer (use_fast=False selects the pure-Python tokenizer implementation).
# NOTE(review): trust_remote_code=True allows code shipped in the hub repo to run
# locally; it is presumably not needed for this model -- confirm and drop it.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
    trust_remote_code=True
)

# Load the quantized model without an explicit device_map.
# Note: for a quantized model the library still switches on low_cpu_mem_usage
# and places the weights on the GPU by itself (the device is printed below),
# so no manual model.to(...) call is made here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map=None,           # <- IMPORTANT
)

print("Model loaded on:", next(model.parameters()).device)

prompt = "Explain quantum computing in simple terms."
# The tokenizer returns CPU tensors; move them to wherever the model lives so
# generate() does not depend on implicit cross-device handling.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        # The tokenizer defines no pad token; state the EOS fallback explicitly
        # instead of letting generate() pick it and warn on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(out[0], skip_special_tokens=True))

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model loaded on: cuda:0




Explain quantum computing in simple terms.

## Quantum Computing

Quantum computing is a type of computing that uses quantum phenomena to perform operations. Quantum computing is a branch of quantum physics that uses quantum phenomena to perform operations. Quantum computing is a type of computing that uses the properties of quantum physics to perform operations. Quantum computing is a type of computing that uses quantum phenomena to perform operations. Quantum computing is a type of computing that uses quantum phenomena to perform operations. Quantum computing


In [4]:
# Long, multi-paragraph prompt (opens with a request to summarise the text that
# follows) used as a large input for generation. The string is passed to the
# tokenizer verbatim, including its leading space.
prompt1 = ''' Can you Summarise this please ? In modern AI systems, especially those using transformer-based architectures, the size of the input prompt plays an important role in determining latency, memory usage, and the quality of generated responses. When engineers talk about a prompt being around one thousand tokens, they are usually referring to a block of text that is several paragraphs long—roughly four to five thousand characters in length. This amount of text may include system instructions, user queries, constraints, or even examples used in few-shot prompting. Understanding what this looks like in practice helps when designing systems, such as translation pipelines, retrieval-augmented generation frameworks, or multi-process inference architectures, where prompt size directly affects throughput and GPU utilization.

When a transformer model receives a large prompt, it must process all tokens through every layer during the prefill stage. This is expensive, especially for decoder-only models used in LLMs. The prefill phase computes attention across all input tokens, meaning that the computational complexity increases quadratically with the prompt length. For example, a 1024-token prompt requires roughly four times the compute of a 512-token prompt for the same model. Because of this, engineers working on real-time applications, such as speech-to-text translation or conversational agents, must optimize prompt size carefully to balance context and latency.

Another aspect of managing a 1024-token prompt is memory. Each token generates key and value tensors for each attention head in every layer, which are stored in the KV cache. If a model has, for example, 32 layers and 32 attention heads per layer, the KV cache for 1024 tokens can easily exceed several hundred megabytes, depending on the hidden dimension. This is why many high-performance inference frameworks, such as vLLM, FlashAttention-based servers, or custom systems using CUDA streams, focus heavily on KV cache compression, sharing, or streaming. Smaller prompt sizes significantly reduce memory footprint and allow serving more simultaneous requests on a single GPU.

Another important point is prompt engineering. Even when models support long context windows, such as 32k or 128k, not all content contributes equally to the final generation quality. Effective prompting often involves rewriting or summarizing information so that the most relevant parts appear earlier in the sequence. For example, in a retrieval-augmented generation system, retrieved passages may be chunked into smaller segments so that the LLM can focus on the highest-ranking ones rather than blindly receiving large blocks of text.

Finally, understanding prompt length matters when fine-tuning as well. During training, especially when using QLoRA or LoRA adapters, batching large prompt sequences increases GPU memory consumption. Many engineers limit training sequence length to 512 or 1024 tokens for efficiency, unless training a model explicitly meant for long-form reasoning. The trade-offs between sequence length, batch size, and memory often define the maximum throughput of the training loop.

Overall, a 1024-token prompt is long enough to include multiple instructions, several examples, and extensive user context, but short enough to be processed efficiently by most mid-sized LLMs. Understanding its structure and impact is an essential part of building scalable, low-latency AI systems.'''

In [6]:
# Tokenize the long prompt and move the resulting tensors onto the model's
# device: the tokenizer returns CPU tensors, and generate() should not have to
# rely on implicit cross-device handling.
inputs = tokenizer(prompt1, return_tensors="pt").to(model.device)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        # State the EOS-as-pad fallback explicitly so generate() does not have
        # to choose it and warn.
        pad_token_id=tokenizer.eos_token_id,
    )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [7]:
# Decode the first (only) returned sequence; the decoded text contains the
# prompt followed by the newly generated tokens.
generated_text = tokenizer.decode(out[0], skip_special_tokens=True)
print(generated_text)

 Can you Summarise this please ? In modern AI systems, especially those using transformer-based architectures, the size of the input prompt plays an important role in determining latency, memory usage, and the quality of generated responses. When engineers talk about a prompt being around one thousand tokens, they are usually referring to a block of text that is several paragraphs long—roughly four to five thousand characters in length. This amount of text may include system instructions, user queries, constraints, or even examples used in few-shot prompting. Understanding what this looks like in practice helps when designing systems, such as translation pipelines, retrieval-augmented generation frameworks, or multi-process inference architectures, where prompt size directly affects throughput and GPU utilization.

When a transformer model receives a large prompt, it must process all tokens through every layer during the prefill stage. This is expensive, especially for decoder-only mod

In [None]:
import os

# Terminate the kernel process immediately with exit status 0.
# os._exit skips normal interpreter cleanup, so every variable defined so far
# is lost and cells below need a fresh kernel start.
os._exit(0)

In [None]:
# import torch
# torch.tensor([1], device="cuda:1")
# print("CUDA OK")

In [None]:
# !rm -rf /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1
# !rm -rf /root/.cache/huggingface/hub/*Mistral-7B*

In [None]:
# !rm -rf ~/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1

In [None]:
import transformers

# Show which transformers release this kernel is running.
installed_version = transformers.__version__
print(installed_version)

In [None]:
# Sanity-check a tokenized prompt against the model's configured limits:
# sequence length versus max positions, and token ids versus vocabulary size.
prompt = "Explain quantum computing in simple terms."

inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]

print("Input IDs shape:", input_ids.shape)
print("Max position id:", input_ids.shape[-1] - 1)
print("Model max positions:", model.config.max_position_embeddings)
print("Vocab size:", model.config.vocab_size)

# Combine both range checks element-wise, reduce once, and convert to a plain
# Python bool so the report reads True/False instead of a tensor repr.
out_of_range = (input_ids >= model.config.vocab_size) | (input_ids < 0)
bad = bool(out_of_range.any())

print("Any invalid token ids?:", bad)

In [3]:
# Save the loaded model and its tokenizer into one local directory.
# NOTE(review): presumably meant for reloading later with from_pretrained(save_path);
# confirm that reloading the 4-bit weights works with the installed versions.

save_path = "./mistral_7b_4bit_local"
model.save_pretrained(save_path)
# Kept as the final bare expression: its return value is what the cell displays.
tokenizer.save_pretrained(save_path)

('./mistral_7b_4bit_local/tokenizer_config.json',
 './mistral_7b_4bit_local/special_tokens_map.json',
 './mistral_7b_4bit_local/tokenizer.model',
 './mistral_7b_4bit_local/added_tokens.json')

In [None]:
# import sys
# import torch

# small_tensor = torch.empty([1, 32, 120, 128])      # ~1.9 MB with default float32; on CPU (no device= given)
# large_tensor = torch.empty([664, 32, 120, 128])    # ~1.3 GB with default float32; on CPU (no device= given)

# print(sys.getsizeof(small_tensor))   # Output: 96 bytes
# print(sys.getsizeof(large_tensor))  

In [None]:
import cupy as cp

# Report whether GPU 0 and GPU 1 can address each other's memory directly.
# Querying a device index that does not exist raises a CUDA runtime error, so
# check the device count first and skip with a readable message instead.
gpu_count = cp.cuda.runtime.getDeviceCount()

if gpu_count < 2:
    print(f"Peer-access check skipped: needs 2 GPUs, found {gpu_count}")
else:
    # Check GPU0 → GPU1
    can_access_0_to_1 = cp.cuda.runtime.deviceCanAccessPeer(0, 1)
    print(f"GPU0 → GPU1: {can_access_0_to_1}")

    # Check GPU1 → GPU0
    can_access_1_to_0 = cp.cuda.runtime.deviceCanAccessPeer(1, 0)
    print(f"GPU1 → GPU0: {can_access_1_to_0}")

In [None]:
import cupy as cp
import torch
import numpy as np

# Demo: copy a tensor from GPU1 to GPU0 with a raw CUDA peer copy issued
# through CuPy, using the device pointers of two PyTorch tensors.

# Fail early with a readable message instead of a low-level CUDA error.
if torch.cuda.device_count() < 2:
    raise RuntimeError("memcpyPeer demo needs at least two CUDA devices")

# === Allocate a tensor on GPU1 ===
src_tensor = torch.arange(10, dtype=torch.float32, device='cuda:1')
src_ptr = src_tensor.data_ptr()
size = src_tensor.numel() * src_tensor.element_size()


# === Allocate destination on GPU0 ===
dst_tensor = torch.empty_like(src_tensor, device='cuda:0')
dst_ptr = dst_tensor.data_ptr()

# empty_like does not initialize memory, so this first print shows whatever
# happened to be in the buffer (useful only as a "before" snapshot).
print(dst_tensor)
print(f"[DEBUG] src_ptr: {src_ptr}, dst_ptr: {dst_ptr}, size: {size}")

# The copy below is issued outside PyTorch, so PyTorch cannot order it against
# its own kernels. Wait until the source data has actually been written.
torch.cuda.synchronize(1)

# === Copy using memcpyPeer ===
cp.cuda.runtime.memcpyPeer(
    dst_ptr, 0,         # destination ptr, device ID 0
    src_ptr, 1,         # source ptr, device ID 1
    size                # bytes to copy
)

# Make sure the copy has completed on both devices before reading the result.
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)

print(dst_tensor)
# === Check if copied correctly ===
copied = dst_tensor.cpu().numpy()
expected = np.arange(10, dtype=np.float32)

print("\n[RESULT]")
print("Copied Tensor: ", copied)
print("Expected:      ", expected)

# A memory copy must be bit-exact, so compare for equality rather than
# closeness within a tolerance.
assert np.array_equal(copied, expected), "❌ Memory copy failed!"
print("✅ memcpyPeer between GPU1 → GPU0 succeeded!")

In [None]:
import multiprocessing as mp
import torch
import cupy as cp
import numpy as np
import time


# Upper bound (seconds) each side waits for the other, so that a crashed peer
# cannot hang the notebook forever.
_HANDSHAKE_TIMEOUT_S = 60


def producer(q, done=None):
    """Create a source tensor on GPU1 and a destination on GPU0 and share both.

    The tensors themselves are put on the queue rather than their raw
    ``data_ptr()`` values: a device address taken in this process is not a
    valid address in another process, whereas a queued CUDA tensor is handed
    over through PyTorch's cross-process sharing and gets a valid address on
    the receiving side.

    Args:
        q: multiprocessing queue used to hand the tensors to the consumer.
        done: optional multiprocessing event that the consumer sets when it has
            finished. When given, the producer keeps its tensors alive until
            the event fires and then verifies the result. When omitted, the
            producer falls back to a fixed 2-second keep-alive.

    Raises:
        RuntimeError: if the consumer does not finish in time, or if the
            destination tensor does not hold the expected values afterwards.
    """
    print("[Producer] Creating tensors...")

    src = torch.arange(10, dtype=torch.float32, device="cuda:1")
    dst = torch.zeros_like(src, device="cuda:0")

    # Both tensors must be fully written before another process reads them.
    torch.cuda.synchronize(src.device)
    torch.cuda.synchronize(dst.device)

    q.put({"src": src, "dst": dst})

    if done is None:
        time.sleep(2)  # keep alive (best effort when no event is supplied)
        return

    # The shared memory belongs to this process, so stay alive (and keep the
    # tensors referenced) until the consumer reports that it is finished.
    if not done.wait(timeout=_HANDSHAKE_TIMEOUT_S):
        raise RuntimeError("[Producer] Timed out waiting for the consumer")

    # The consumer copied src into dst and then added 10 in place.
    expected = torch.arange(10, dtype=torch.float32) + 10
    result = dst.cpu()
    print("[Producer] dst after consumer:", result)
    if not torch.equal(result, expected):
        raise RuntimeError("[Producer] dst does not match the expected values")


def consumer(q, done=None):
    """Receive the shared tensors, peer-copy src into dst, then add 10 to dst.

    Args:
        q: multiprocessing queue on which the producer put the tensors.
        done: optional multiprocessing event, set once this process no longer
            touches the shared memory (also set when an error occurs, so the
            producer is not left waiting for the timeout).

    Raises:
        queue.Empty: if nothing arrives on the queue within the timeout.
    """
    print("[Consumer] Waiting for ptr info...")
    try:
        msg = q.get(timeout=_HANDSHAKE_TIMEOUT_S)
        src = msg["src"]
        dst = msg["dst"]
        size = src.numel() * src.element_size()

        print("[Consumer] memcpyPeer...")
        # data_ptr() is evaluated in THIS process, so the addresses are valid here.
        cp.cuda.runtime.memcpyPeer(
            dst.data_ptr(), dst.device.index,
            src.data_ptr(), src.device.index,
            size,
        )

        dst.add_(10)

        # Finish all GPU work before telling the producer it may read dst.
        torch.cuda.synchronize(src.device)
        torch.cuda.synchronize(dst.device)

        # Drop the references to the shared memory before signalling completion.
        del src, dst, msg
    finally:
        if done is not None:
            done.set()

    print("[Consumer] Done.")


# NOTE(review): with the "spawn" start method the child processes must be able
# to import producer/consumer. When this cell is run from a notebook kernel
# that may fail; in that case move the two functions into a .py module and
# import them -- confirm in the target environment.
if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)

    q = mp.Queue()
    done = mp.Event()

    p1 = mp.Process(target=producer, args=(q, done))
    p2 = mp.Process(target=consumer, args=(q, done))

    p1.start()
    p2.start()

    p1.join()
    p2.join()

    # A non-zero exit code means the worker raised; do not report success then.
    failed = [proc.name for proc in (p1, p2) if proc.exitcode != 0]
    if failed:
        raise RuntimeError(f"Worker process(es) failed: {failed}")

    print("✅ Main: Processes done.")