In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

device = (
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

# Load dataset
dataset = load_dataset("HuggingFaceTB/smoltalk", "all")
print(f"Device used : {device}")

  from .autonotebook import tqdm as notebook_tqdm
Python(41139) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Device used : mps


In [6]:
print(f"Length of training dataset : {dataset['train'].shape[0]}")
print(f"Length of testing dataset : {dataset['test'].shape[0]}")

Length of training dataset : 1043917
Length of testing dataset : 54948


In [26]:
# we'll only take 5,00,000 examples for training
N = 50000

small_train = dataset["train"].select(range(N))
small_test  = dataset["test"].select(range(100))

print(f"Training data we have is : {len(small_train)}")
print(f"Testing data we have is : {len(small_test)}")

Training data we have is : 50000
Testing data we have is : 100


In [9]:
import trl
from trl import SFTConfig, SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "HuggingFaceTB/SmolLM2-135M"

In [11]:
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [21]:
dataset['train'][1]

{'messages': [{'content': 'Ben twice chooses a random integer between 1 and 60, inclusive. What is the probability that at least one of the numbers Ben chooses is a multiple of 4?',
   'role': 'user'},
  {'content': 'First, we find the number of multiples of 4 between 1 and 60. The smallest multiple of 4 is $4 \\times 1 = 4$ and the largest multiple of 4 within 60 is $4 \\times 15 = 60$. Hence, there are 15 multiples of 4.\n\nThe total number of integers between 1 to 60 is 60. Therefore, the number of integers that are not multiples of 4 is $60 - 15 = 45$.\n\nThe probability that a single number chosen is not a multiple of 4 is $\\frac{45}{60} = \\frac{3}{4}$. If Ben chooses two numbers independently, the probability that neither number is a multiple of 4 is $\\left(\\frac{3}{4}\\right)^2 = \\frac{9}{16}$.\n\nThus, the probability that at least one of the numbers is a multiple of 4 is $1 - \\frac{9}{16} = \\frac{7}{16}$.\n\nTherefore, the final answer is:\n\\[\n\\boxed{\\frac{7}{16}}\n

In [12]:
## whats inside our dataset, how it is aligned
dataset['train'][0]

{'messages': [{'content': 'The function \\( g(x) \\) satisfies the functional equation\n\\[ g(x + y) = g(x) + g(y) \\]\nfor all real numbers \\( x \\) and \\( y \\), and it is given that \\( g(3) = 4 \\). Find \\( g(10) \\).',
   'role': 'user'},
  {'content': 'Given the functional equation and the specific value \\( g(3) = 4 \\), we can find \\( g(1) \\) by using the equation multiple times:\n\\[\ng(3) = g(2) + g(1)\n\\]\n\\[\ng(2) = g(1) + g(1) = 2g(1)\n\\]\nThus,\n\\[\n4 = 2g(1) + g(1) = 3g(1)\n\\]\n\\[\ng(1) = \\frac{4}{3}\n\\]\nNow we can find \\( g(10) \\) using \\( g(1) \\):\n\\[\ng(10) = 10g(1) = 10 \\times \\frac{4}{3} = \\frac{40}{3}\n\\]\nHence, the value of \\( g(10) \\) is \\(\\boxed{\\frac{40}{3}}\\).',
   'role': 'assistant'}],
 'source': 'numina-cot-100k'}

In [13]:
tokenizer.chat_template = (
    "{% for message in messages %}"
    "<|im_start|>{{ message['role'] }}\n"
    "{{ message['content'] }}\n"
    "<|im_end|>\n"
    "{% endfor %}"
)

In [14]:
def format_messages(example):
    messages = example["messages"]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

In [15]:
## lets format a message
print(format_messages(small_train[0]))

<|im_start|>user
The function \( g(x) \) satisfies the functional equation
\[ g(x + y) = g(x) + g(y) \]
for all real numbers \( x \) and \( y \), and it is given that \( g(3) = 4 \). Find \( g(10) \).
<|im_end|>
<|im_start|>assistant
Given the functional equation and the specific value \( g(3) = 4 \), we can find \( g(1) \) by using the equation multiple times:
\[
g(3) = g(2) + g(1)
\]
\[
g(2) = g(1) + g(1) = 2g(1)
\]
Thus,
\[
4 = 2g(1) + g(1) = 3g(1)
\]
\[
g(1) = \frac{4}{3}
\]
Now we can find \( g(10) \) using \( g(1) \):
\[
g(10) = 10g(1) = 10 \times \frac{4}{3} = \frac{40}{3}
\]
Hence, the value of \( g(10) \) is \(\boxed{\frac{40}{3}}\).
<|im_end|>



In [24]:
import os

os.makedirs("./FinetuningSFT", exist_ok=True)
print("Directory created!")

Directory created!


In [1]:
# Configure trainer
training_args = SFTConfig(
    output_dir="./FinetuningSFT",
    max_steps=20, # if will only process these many batches 
    per_device_train_batch_size=4,
    learning_rate=5e-3,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=5,
    packing=True,
)

NameError: name 'SFTConfig' is not defined

In [None]:
# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    processing_class=tokenizer,
    formatting_func=format_messages,
)

# Start training
trainer.train()

Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

Step,Training Loss,Validation Loss


In [None]:
# Save model
trainer.save_model("./Finetuning")

# Save tokenizer too
tokenizer.save_pretrained("./Finetuning")

('/content/sft_output/tokenizer_config.json',
 '/content/sft_output/special_tokens_map.json',
 '/content/sft_output/chat_template.jinja',
 '/content/sft_output/vocab.json',
 '/content/sft_output/merges.txt',
 '/content/sft_output/added_tokens.json',
 '/content/sft_output/tokenizer.json')

#### **Loading Finetuned Model**

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./FinetuningSFT")
tokenizer = AutoTokenizer.from_pretrained("./FinetuningSFT")

In [19]:
def get_prompt(query, tokenizer):
    messages = [
        {"role": "user", "content": query}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True   # <-- adds "<|im_start|>assistant"
    )

    return prompt

In [22]:
# Tokenize input
def make_inference(prompt, model):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode and print result
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [23]:
if __name__ == "__main__":
    query = input("Ask something : ")
    print(f"Query : {query}")
    prompt = get_prompt(query, tokenizer)
    output = make_inference(prompt, model)
    print(f"Output : {output}")

Query : Ben twice chooses a random integer between 1 and 60, inclusive. What is the probability that at least one of the numbers Ben chooses is a multiple of 4?
Output : user
Ben twice chooses a random integer between 1 and 60, inclusive. What is the probability that at least one of the numbers Ben chooses is a multiple of 4?

assistant
You can be a set of the 1-1000, 10, and 380000 feet of the number of the 3, the two numbers, and the current, which is 35.


assistant
I am the equation that the `22, 1) with 200000.



assistant
How is 500000000000000000.


assistant
-A, and 288000000000000, 000000.

To find the 1, we want to a 30, we are not 32.

The example, we can also divide the function, you can be included, we can be a simple and the value of
