In [1]:
from huggingface_hub import login
import getpass
import os

# Prompt for the Hugging Face token securely
hf_token = getpass.getpass("Enter your Hugging Face token: ")

# Login to Hugging Face Hub
login(token=hf_token)


Enter your Hugging Face token: ··········


In [2]:
from huggingface_hub import login
import wandb
import getpass


# Login to Weights & Biases
wb_token = getpass.getpass("Enter your Weights & Biases API Key: ")
wandb.login(key=wb_token)

# Initialize W&B run
run = wandb.init(
    project="Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset",
    job_type="training",
    anonymous="allow"
)


Enter your Weights & Biases API Key: ··········


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaadashraf12[0m ([33msaadashraf12-fast-nuces[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
pip install unsloth



In [4]:
from unsloth import SFTTrainer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


ImportError: cannot import name 'SFTTrainer' from 'unsloth' (/usr/local/lib/python3.11/dist-packages/unsloth/__init__.py)

In [5]:
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None
load_in_4bit = True


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.3.10: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
prompt_style = """Below is a math meme with an incorrect solution. Your task is to identify the error and provide a correct explanation.

### Incorrect Meme:
{}

### Identified Error:
{}

### Fixed Explanation:
{}"""


In [7]:
# Ensure the model is set for inference mode
FastLanguageModel.for_inference(model)

def test_math_meme(meme_description):
    input_text = prompt_style.format(meme_description, "", "")

    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=500,
        use_cache=True,
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Print full raw response for debugging
    print("Raw Response:\n", response)

    # Extract all sections labeled "### Fixed Explanation:"
    explanations = response.split("### Fixed Explanation")

    # Filter out empty or placeholder explanations
    valid_explanations = [exp.strip() for exp in explanations if exp.strip() and "error" not in exp.lower()]

    if valid_explanations:
        extracted_explanation = valid_explanations[-1]  # Take the last meaningful explanation
        print("\nExtracted Explanation:\n", extracted_explanation)
    else:
        print("\nNo valid explanation detected.")



In [8]:
# Example test case
#test_meme = "A meme stating that sin(90°) = 0"
#test_meme = "A post claiming that log(1) = 1"
test_meme = "A meme stating that 8 ÷ 2(2 + 2) = 1"
test_math_meme(test_meme)

Raw Response:
 Below is a math meme with an incorrect solution. Your task is to identify the error and provide a correct explanation.

### Incorrect Meme:
A meme stating that 8 ÷ 2(2 + 2) = 1

### Identified Error:


### Fixed Explanation:
<Think here>

Okay, so I came across this math meme, and at first glance, it seemed a bit confusing because of the way the numbers are arranged. The meme says 8 ÷ 2(2 + 2) = 1. Hmm, let me try to figure out what's going on here.

Alright, starting with the expression: 8 ÷ 2(2 + 2). I remember that in math, parentheses mean you do what's inside them first. So, let me focus on the part inside the parentheses: 2 + 2. That's straightforward, right? 2 plus 2 is 4. So now the expression becomes 8 ÷ 2(4). 

Wait, hold on, it's 8 divided by 2 times 4. Hmm, I think I need to remember the order of operations here. I recall PEMDAS: Parentheses, Exponents, Multiplication and Division (from left to right). So after doing the parentheses, we have multiplication ne

In [9]:
train_prompt_style = """Below is a math meme with an incorrect solution. Your task is to identify the error, explain it, and provide the correct answer.

### Incorrect Meme:
{}

### Identified Error:
<think>
{}
</think>

### Fixed Explanation:
{}"""


In [10]:
EOS_TOKEN = tokenizer.eos_token  # Ensure EOS token is appended

def formatting_prompts_func(examples):
    descriptions = examples["description"]  # Incorrect meme
    errors = examples["error"]  # Identified mistake
    explanations = examples["explanation"]  # Corrected response

    texts = [
        train_prompt_style.format(desc, err, exp) + EOS_TOKEN
        for desc, err, exp in zip(descriptions, errors, explanations)
    ]

    return texts  # ✅ Return a list instead of a dictionary


In [11]:
from datasets import Dataset
import pandas as pd
import json

dataset_path = "/content/math_meme.jsonl"

# Read JSONL file into a list of dictionaries
with open(dataset_path, "r") as f:
    dataset = [json.loads(line) for line in f]

# Convert list of dictionaries to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(pd.DataFrame(dataset))


In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.3.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [18]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=hf_dataset,  # ✅ Now a Hugging Face Dataset
    dataset_text_field="text",
    formatting_func=formatting_prompts_func,  # ✅ Ensures text formatting
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/20 [00:00<?, ? examples/s]

In [19]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20 | Num Epochs = 12 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
10,1.8221
20,0.335
30,0.0864
40,0.0574
50,0.0495
60,0.0439


In [20]:
# Define the directory where you want to save the model
output_dir = "my_trained_model"

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to my_trained_model


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Free up GPU memory before loading the model
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()

# Define model path
output_dir = "my_trained_model"

# Choose quantization level: 8-bit or 4-bit
use_4bit = False  # Set to True if you need even lower memory usage

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    load_in_8bit=not use_4bit,  # Use 8-bit if not using 4-bit
    bnb_4bit_compute_dtype=torch.float16 if use_4bit else None,
    bnb_4bit_use_double_quant=True if use_4bit else False,
    llm_int8_enable_fp32_cpu_offload=True if not use_4bit else False  # Offload FP32 to CPU in 8-bit mode
)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(output_dir, quantization_config=bnb_config).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Set model max sequence length manually if needed
model.max_seq_length = 2048

# Example inference function
def generate_text(prompt, max_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,  # Limit token generation to save memory
            do_sample=True,  # Set to False for deterministic output
            temperature=0.3  # Adjust creativity
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test inference
prompt = "A social media post claiming that 0! = 0"
print(generate_text(prompt, max_tokens=100))


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


A social media post claiming that 0! = 0 is incorrect. Explain
A social media post claiming that 0! = 0 is incorrect because 0! (zero factorial) is defined as 1, not 0. This definition is consistent with the factorial function, which for any positive integer n, n! = n × (n-1) × ... × 1. When n=0, the product is 1 by convention.



In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Free up GPU memory before loading the model
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()

# Define model path
output_dir = "my_trained_model"

# Choose quantization level: 8-bit or 4-bit
use_4bit = False  # Set to True if you need even lower memory usage

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    load_in_8bit=not use_4bit,  # Use 8-bit if not using 4-bit
    bnb_4bit_compute_dtype=torch.float16 if use_4bit else None,
    bnb_4bit_use_double_quant=True if use_4bit else False,
    llm_int8_enable_fp32_cpu_offload=True if not use_4bit else False  # Offload FP32 to CPU in 8-bit mode
)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(output_dir, quantization_config=bnb_config).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Set model max sequence length manually if needed
model.max_seq_length = 2048

# Example inference function
def generate_text(prompt, max_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,  # Limit token generation to save memory
            do_sample=True,  # Set to False for deterministic output
            temperature=0.3  # Adjust creativity
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test inference
prompt = "A meme showing 9 - 3 ÷ 1/3 + 1 = ?"
print(generate_text(prompt, max_tokens=100))


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


A meme showing 9 - 3 ÷ 1/3 + 1 = ? with an incorrect solution
A social media post claiming that 9 - 3 ÷ 1/3 + 1 equals 10

### Fixed Explanation:
Correct calculation: 9 - 3 ÷ (1/3) + 1 = 9 - 9 + 1 = 1
</think>

### Fixed Explanation:
Correct calculation: 9 - 3 ÷ (1/3) + 1 = 9 - 9 + 1 = 1
