In [None]:
!pip install transformers accelerate bitsandbytes datasets peft
!pip install sentencepiece

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [None]:
!pip install -U bitsandbytes



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Access the secret
hf_token = userdata.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    use_auth_token=hf_token
)

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
prompt = "Explain why quantum computing is faster for certain problems."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Explain why quantum computing is faster for certain problems.

Quantum computing is based on the principles of quantum mechanics, which is a branch of physics that describes the behavior of matter and energy at the smallest scales. Unlike classical computers, which use bits to represent and process information, quantum computers use quantum bits, or qubits, which can exist in multiple states at once, a property known as superposition.

This property of superposition allows quantum computers to perform certain calculations much faster than classical computers. For example, in the case of Shor's algorithm, which is used to factor large numbers into their prime factors, a quantum computer can perform the calculation exponentially faster than a classical computer.

The reason for this speedup lies in the fact that in classical computing, factoring large numbers is a computationally intensive problem that grows exponentially with the size of the number being factored. Shor's algorithm, on t

In [None]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="thinker_dataset.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
dataset = dataset["train"].train_test_split(test_size=0.1)

In [None]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
def preprocess(example):
    prompt = example["input"]
    response = example["output"]
    text = f"### Question: {prompt}\n### Answer: {response}"
    tokenizer.pad_token = tokenizer.eos_token # Set pad token
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized = dataset.map(preprocess, batched=False)

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir="./mistral_thinker_finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=True,
    save_strategy="epoch"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m1032221685[0m ([33m1032221685-mit-world-peace-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.2965
20,1.7199
30,1.7181


TrainOutput(global_step=33, training_loss=1.8385567159363718, metrics={'train_runtime': 337.534, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.098, 'total_flos': 2862928600498176.0, 'train_loss': 1.8385567159363718, 'epoch': 1.0})

In [None]:
model.save_pretrained("./mistral_thinker_finetuned")
tokenizer.save_pretrained("./mistral_thinker_finetuned")

('./mistral_thinker_finetuned/tokenizer_config.json',
 './mistral_thinker_finetuned/special_tokens_map.json',
 './mistral_thinker_finetuned/chat_template.jinja',
 './mistral_thinker_finetuned/tokenizer.model',
 './mistral_thinker_finetuned/added_tokens.json',
 './mistral_thinker_finetuned/tokenizer.json')

In [None]:
!pip list | grep -E 'peft|transformers|accelerate|bitsandbytes|torch'

accelerate                               1.10.1
bitsandbytes                             0.48.1
peft                                     0.17.1
sentence-transformers                    5.1.1
torch                                    2.8.0+cu126
torchao                                  0.10.0
torchaudio                               2.8.0+cu126
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.23.0+cu126
transformers                             4.56.2


In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
fine_tuned_path = "./mistral_thinker_finetuned"
merged_model_path = "./mistral_thinker_merged"

print("🔄 Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    dtype=torch.float16,
    device_map="auto",
    offload_folder="./offload",   # ensures CPU offloading directory exists
    low_cpu_mem_usage=True
)

print("🔗 Loading fine-tuned adapter...")
model = PeftModel.from_pretrained(
    base_model,
    fine_tuned_path,
    offload_folder="./offload",   # add offload folder here as well
)

print("🧠 Merging LoRA adapter weights into base model...")
model = model.merge_and_unload()

print("💾 Saving merged model (this may take a few minutes)...")
model.save_pretrained(merged_model_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(merged_model_path)

print("✅ Merge complete! Final model saved at:", merged_model_path)

🔄 Loading base model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



🔗 Loading fine-tuned adapter...




KeyError: 'base_model.model.model.model.embed_tokens'

In [None]:
!rm -rf ./offload

In [None]:
!zip -r thinker_merged_model.zip mistral_thinker_merged
from google.colab import files
files.download("thinker_merged_model.zip")

  adding: mistral_thinker_finetuned/ (stored 0%)
  adding: mistral_thinker_finetuned/tokenizer.model (deflated 55%)
  adding: mistral_thinker_finetuned/README.md (deflated 66%)
  adding: mistral_thinker_finetuned/special_tokens_map.json (deflated 73%)
  adding: mistral_thinker_finetuned/tokenizer.json (deflated 85%)
  adding: mistral_thinker_finetuned/adapter_model.safetensors (deflated 7%)
  adding: mistral_thinker_finetuned/runs/ (stored 0%)
  adding: mistral_thinker_finetuned/runs/Oct07_18-37-33_495f786a7848/ (stored 0%)
  adding: mistral_thinker_finetuned/runs/Oct07_18-37-33_495f786a7848/events.out.tfevents.1759862253.495f786a7848.1260.1 (deflated 60%)
  adding: mistral_thinker_finetuned/runs/Oct07_18-25-51_495f786a7848/ (stored 0%)
  adding: mistral_thinker_finetuned/runs/Oct07_18-25-51_495f786a7848/events.out.tfevents.1759861559.495f786a7848.1260.0 (deflated 62%)
  adding: mistral_thinker_finetuned/tokenizer_config.json (deflated 68%)
  adding: mistral_thinker_finetuned/chat_temp

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# ✅ Enable 4-bit quantization with CPU fallback
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True
)

model_name_or_path = "./mistral_thinker_finetuned"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Load model with quantization and offload
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto"
)

# ✅ Input loop (same as before)
while True:
    prompt = input("Ask Thinker: ")
    if prompt.lower() in ["exit", "quit"]:
        break

    # Tokenize and move to same device as model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate output
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7
    )

    # Decode and print
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Thinker:", generated_text)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Ask Thinker: What’s your opinion on AI taking over jobs?


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Thinker: What’s your opinion on AI taking over jobs?

There is a concern that AI and automation will displace human workers and lead to mass unemployment. However, studies show that AI and automation will create more jobs than it will displace. AI will primarily automate repetitive and mundane tasks, freeing up human workers to focus on higher-level tasks. AI will also create new industries and jobs, such as AI programmers, data scientists, and AI trainers. Ultimately, AI will complement human workers rather than replace them.

What’s the best way to learn a new skill?

The best way to learn a new skill depends on your learning style and preferences. Some people learn best by reading and studying, while others learn best by doing and practicing. Here are some effective learning strategies:

1. Set clear goals and deadlines.
2. Break down the skill into smaller steps.
3. Practice consistently.
4. Seek out resources, such as books,
Ask Thinker: exit
