In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
from datasets import load_dataset, concatenate_datasets,  load_from_disk
import peft

# from safetensors.torch import load_model, save_model

import random
import re
import json
import ast
from copy import deepcopy
from enum import Enum

from typing import Optional
from jinja2 import Template
from transformers.utils import get_json_schema

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
SIZE = "360M"
MODEL_PATH = f"HuggingFaceTB/SmolLM2-{SIZE}-Instruct"
LORA_PATH = None
# dataset = load_from_disk("/Users/ohi/Documents/GitHub/PersonalAssistant/dataset")
dataset = None

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    device_map="cpu",
    low_cpu_mem_usage=True,
    # attn_implementation='sdpa',
    # attn_implementation='eager', # 'flash_attention_2',
    torch_dtype=torch.bfloat16,
    tie_word_embeddings=True,
    trust_remote_code=True,
    use_cache=False
)

# Gradient checkpointing - Could take more memory in MPS
# model.gradient_checkpointing_enable(dict(use_reentrant=False))
model.gradient_checkpointing_disable()
# model.resize_token_embeddings(49162)
print(f"Model took {model.get_memory_footprint()/1e9:.2f} GB of space (with buffer)")

Model took 0.72 GB of space (with buffer)


In [16]:
model = peft.PeftModel.from_pretrained(
   model,
   "/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-360M-sft-r64-old/checkpoint-8700/smolthink",
   is_trainable=False, # 👈 here,
)

In [17]:
print(model)
# 49152

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(49152, 960, padding_idx=2)
          (modules_to_save): ModuleDict(
            (default): Embedding(49152, 960, padding_idx=2)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=960, out_features=960, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=960, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=960, bias=False)
                )
                (lora_embe

In [23]:
# if lora_r:
lora_r = 64
# SAVE_PATH += f'r{lora_r}'
peft_config = peft.LoraConfig(
    r=lora_r,                   # 64
    lora_alpha=2*lora_r,        # alpha = 4 * r
    lora_dropout=0.05,
    target_modules='all-linear',
    modules_to_save = [
        "embed_tokens", 
        "lm_head"
    ],
    use_rslora=True,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    inference_mode=False,
)
model = peft.get_peft_model(model, peft_config, adapter_name="smolthink", autocast_adapter_dtype=False)

# Sanity check
non_lora_param = 0
lora_param = 0
lora_layers = 0
for name, param in model.named_parameters():
    if 'lora' in name:
        # param.requires_grad = True
        assert param.requires_grad == True, f"{name} is not trainable"
        lora_param += param.numel()
        lora_layers += 1
    else:
        # if not param.requires_grad:
        #     print(f"{name} is trainable")
        non_lora_param += param.numel()

    if 'lm_head' in name:
        print("lm_head ->", name, ":", param.requires_grad)
    if 'embed_tokens' in name:
        print("embed_tokens ->", name, ":", param.requires_grad)


def into_million(val):
    return f"{val / 1000 / 1000 :.2f} million"

# print("LoRA adapter added.")
print(f"Total LoRA params: {into_million(lora_param)} ({(lora_param/non_lora_param)*100:.2f} %) = {into_million(lora_param)}")
print(f"Total LoRA layers: {lora_layers}")
print(f"Approx size: {lora_param * 2e-6:.2f} mb")

embed_tokens -> base_model.model.model.embed_tokens.original_module.weight : False
embed_tokens -> base_model.model.model.embed_tokens.modules_to_save.smolthink.weight : True
lm_head -> base_model.model.lm_head.modules_to_save.smolthink.weight : True
Total LoRA params: 34.73 million (7.61 %) = 34.73 million
Total LoRA layers: 448
Approx size: 69.47 mb


In [27]:
model.base_model.model.model.embed_tokens

ModulesToSaveWrapper(
  (original_module): Embedding(49152, 960, padding_idx=2)
  (modules_to_save): ModuleDict(
    (smolthink): Embedding(49152, 960, padding_idx=2)
  )
)

In [None]:
print("Is same weight of embed_tokens and lm_head?", torch.equal(model.base_model.model.model.embed_tokens.modules_to_save["smolthink"].weight, model.base_model.model.lm_head.modules_to_save["smolthink"].weight))
print(model.base_model.model.model.embed_tokens.original_module.weight.data.data_ptr() == model.base_model.model.lm_head.original_module.weight.data.data_ptr())

Is same weight of embed_tokens and lm_head? True
True


In [None]:
model.base_model.model.model.decoder.embed_tokens.modules_to_save["default"].weight = model.base_model.model.lm_head.modules_to_save["default"].weight

In [None]:
model = model.merge_and_unload(safe_merge=True).eval().to(torch.bfloat16)
print(f"Model took {model.get_memory_footprint()/1e9:.2f} GB of space (with buffer)")

Model took 0.72 GB of space (with buffer)


In [25]:
print(sum(p.numel() for p in model.parameters()) / 1e6)

409.00704


In [None]:
embed_tokens -> base_model.model.model.embed_tokens.original_module.weight : False
embed_tokens -> base_model.model.model.embed_tokens.modules_to_save.smolthink.weight : True
lm_head -> base_model.model.lm_head.modules_to_save.smolthink.weight : True
Total LoRA params: 34.73 million (7.61 %) = 34.73 million
Total LoRA layers: 448
Approx size: 69.47 mb
Model took 0.82 GB of space (with buffer)
409.00704

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

In [16]:
total_param = 0
for name, param in model.named_parameters():
    print('lora' in name.lower())
    total_param += param.numel()

print(lora_param)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [15]:
chat_template = """{%- if tools %}
    {{- '<|im_start|>system\\n' }}
        {%- if messages[0]['role'] == 'system' %}
            {- messages[0]['content'] }}
        {%- else %}
            {{- 'You are a helpful AI assistant named SmolThink.' }}
        {%- endif %}
    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> tags:\\n<tools>\" }}
    {%- for tool in tools %}
        {{- \"\\n\" }}
            {{- tool | tojson }}
    {%- endfor %}
    {{- \"\\n</tools>\\n\\nYou first think/plan inside <think></think> tags.\\nThen for each function call, return a json object with function name and arguments within <tool_call></tool_call> tags.<|im_end|>\\n\" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}
    {%- else %}
        {{- '<|im_start|>system\\nYou are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside \\'think\\' tag and provide final answer to user query inside \\'answer\\' tag.\\nRespond in the following format:\\n<think>\\nLet\\'s think step by step...\\n</think>\\n<answer>\\nThe final answer is...\\n</answer><|im_end|>\\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}
    {%- elif message.role == \"assistant\" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\\n<tool_call>\\n{\"name\": \"' }}
            {{- tool_call.name }}
            {{- '\", \"arguments\": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\\n' }}
    {%- elif message.role == \"tool\" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\\n<tool_response>\\n' }}
        {{- message.content }}
        {{- '\\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}
            {{- '<|im_end|>\\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\\n' }}
{%- endif %}"""

special_tokens_dict = {
    "bos_token": "<|im_start|>",
    "eos_token": "<|im_end|>",
    "pad_token": "<|im_end|>",
    "unk_token": "<|endoftext|>",
    "think_start": "<think>",
    "think_end": "</think>",
    "answer_start": "<answer>",
    "answer_end": "</answer>",
    "tool_def_start": "<tool>",
    "tool_def_end": "</tool>",
    "tool_call_start": "<tool_call>",
    "tool_call_end": "</tool_call>",
    "tool_res_start": "<tool_response>",
    "tool_res_end": "</tool_response>",
}

class SpecialTokens(str, Enum):
    think_start = "<think>",
    think_end = "</think>",
    answer_start = "<answer>",
    answer_end = "</answer>",
    tool_def_start = "<tool>",
    tool_def_end = "</tool>",
    tool_call_start = "<tool_call>",
    tool_call_end = "</tool_call>",
    tool_res_start = "<tool_response>",
    tool_res_end = "</tool_response>",

    @classmethod
    def list(cls):
        return [c.value for c in cls]

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    add_bos_token=True,
    add_eos_token=True,
    # additional_special_tokens=SpecialTokens.list()
)
tokenizer.chat_template = chat_template
# tokenizer.pad_token = tokenizer.eos_token
streamer = TextStreamer(tokenizer, skip_prompt=True)

print("Tokenizer length:", len(tokenizer))
tokenizer.save_pretrained("SmolThink-360M-Tokenizer")


# tokenizer.add_special_tokens(
#     # special_tokens_dict=special_tokens_dict, 
#     {"additional_special_tokens":["<think>"]},
#     replace_additional_special_tokens=True)
# print("Tokenizer length:", len(tokenizer))

# print("New token map")
# for v in SpecialTokens.list():
#     print(v, '->', tokenizer.encode(v))
# print("---")

# print(tokenizer.apply_chat_template([
#     {"role": "user", "content": "How are you?"},
#     {"role": "assistant", "content": "I am fine"}
# ], tokenize=False))

tools = [
    {
        "type": "function",
        "function": {
            "name": "retrieve_payment_status",
            "description": "Get payment status of a transaction",
            "parameters": {
                "type": "object",
                "properties": {
                    "transaction_id": {
                        "type": "string",
                        "description": "The transaction id.",
                    }
                },
                "required": ["transaction_id"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "retrieve_payment_date",
            "description": "Get payment date of a transaction",
            "parameters": {
                "type": "object",
                "properties": {
                    "transaction_id": {
                        "type": "string",
                        "description": "The transaction id.",
                    }
                },
                "required": ["transaction_id"],
            },
        },
    }
]
# print("\n-----\n")
# print(tokenizer.apply_chat_template([
#     {"role": "user", "content": "How are you?"},
#     {"role": "assistant", "content": "<tool_call>[retrieve_payment_date(12)]</tool_call>"},
#     {"role": "tool", "content": "12/12/12"},
#     {"role": "assistant", "content": "12/12/12"}
# ], tools=tools, tokenize=False))

print(tokenizer.encode("<think>"))
print(tokenizer.encode("<|im_start|>"))

Tokenizer length: 49152
[44, 17400, 46]
[1]


In [None]:
def extract_tag(input_str, tag):
    tool_def = re.findall(f"<{tag}>(.*?)</{tag}>", input_str, re.DOTALL)
    tool_def = map(str.strip, tool_def)
    tool_def = filter(lambda x: len(x) > 0, tool_def)
    return list(tool_def)

def hermes_fc_thinking(raw_data):
    data = deepcopy(raw_data['conversations'])
    seq = []
    tool_def = None
    tool_names = None
    for d in data:
        if d['role'] == 'system':
            tool_def = extract_tag(d['content'], 'tools')
            if len(tool_def) != 0:
                try:
                    tool_def = ast.literal_eval(tool_def[0])
                    tool_names = [tool['function']['name'] for tool in tool_def]
                    continue
                except Exception as E:
                    return {"conversations": ""}
            else:
                return {"conversations": ""}

        seq.append({})
        seq[-1]['role'] = {"human": "user", "model": "assistant", "system": "system", "tool": "tool"}[d['role']]
        seq[-1]['content'] = d['content']
        if seq[-1]['role'] == 'assistant':
            seq[-1]['content'] = seq[-1]['content'].replace('<think>', '<think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('</think>', '</think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('<tool_call>\n', '<tool_call>\n[')
            seq[-1]['content'] = seq[-1]['content'].replace('\n</tool_call>', ']\n</tool_call>')
            tool_calls = re.findall(r"<tool_call>(.*?)</tool_call>", seq[-1]['content'], re.DOTALL)
            if tool_calls:
                # print(tool_calls, tool_def)
                try:
                    tool_calls = json.loads(tool_calls[0].strip().replace("'", '"'))
                    for tool_call in tool_calls:
                        if tool_call['name'] not in tool_names:
                            raise NotImplementedError
                except Exception as E:
                    return {"conversations": ""}
        if seq[-1]['role'] == 'tool':
            seq[-1]['content'] = seq[-1]['content'].replace("<tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].replace("</tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].strip()
        # seq[-1]['content'] = d['value']
    
    random.shuffle(tool_def)
    ret = tokenizer.apply_chat_template(seq, tools=tool_def, tokenize=False, add_generation_prompt=False) #+ "<tool_call>\n"
    return {"conversations": ret}

fc_dataset = load_dataset("Jofthomas/hermes-function-calling-thinking-V1")['train']
# fc_dataset = fc_dataset.select(range(100))
fc_dataset = fc_dataset.map(hermes_fc_thinking)
fc_dataset = fc_dataset.filter(lambda x: len(x['conversations']) > 0)
print("Function calling dataset length (after filter):", len(fc_dataset))

Map:  23%|██▎       | 822/3570 [00:00<00:00, 4122.39 examples/s]

BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG


Map:  40%|████      | 1434/3570 [00:00<00:00, 4094.12 examples/s]

BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG


Map:  74%|███████▍  | 2638/3570 [00:00<00:00, 4029.64 examples/s]

BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG
BUG


Map: 100%|██████████| 3570/3570 [00:00<00:00, 4057.36 examples/s]


BUG
BUG
BUG
BUG


Filter: 100%|██████████| 3570/3570 [00:00<00:00, 234590.32 examples/s]

Function calling dataset length (after filter): 3497





In [7]:
print(model.cpu().resize_token_embeddings(len(tokenizer)))

Embedding(49162, 960, padding_idx=2)


In [9]:
model.to('mps')

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49162, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

In [35]:
new_tokenizer = AutoTokenizer.from_pretrained(
    "/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer",
    add_bos_token=True,
    add_eos_token=True,
    # additional_special_tokens=SpecialTokens.list()
)

In [36]:
new_tokenizer.encode("<think>")

[49152]

In [31]:
tokenizer.save_pretrained("/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer")

('/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/tokenizer_config.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/special_tokens_map.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/vocab.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/merges.txt',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/added_tokens.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/tokenizer.json')

In [29]:
print(tokenizer.pad_token, tokenizer.eos_token)
print(tokenizer.encode("<|im_end|>"))
print(tokenizer.decode([9]))

<|im_end|> <|im_end|>
[2]
<issue_comment>


In [14]:
def length_filter(data, limit):
    # if data['thought_len'] + data['answer_len'] > 896:
        # return False
    return 0 < data['thought_len'] <= limit and 0 < data['answer_len']

In [6]:
def r1distillsft_conv(data):
    thought_len, answer_len = 0, 0
    for idx, conv in enumerate(data['reannotated_messages']):
        # print(conv)
        role = conv['role']
        if role == 'assistant':
            reply = data['reannotated_messages'][idx]['content']
            # print(reply)
            thought = re.findall(r"<think>(.*?)</think>", reply, re.DOTALL)
            thought = ''.join(thought).strip()
            thought_len += len(thought.split()) #len(tokenizer.encode(thought))

            end_tag = "</think>"
            if end_tag in reply:
                answer = reply[reply.find(end_tag)+len(end_tag):]
                answer = answer.strip()
            else:
                answer = ''
            if thought.lower() == answer.lower():
                answer = ''
            # print("Think:", thought)
            # print("Answer:", answer)
            # print("----")
            answer_len += len(answer.split()) #len(tokenizer.encode(answer))
            data['reannotated_messages'][idx]['content'] = f"<think>\n{thought}\n</think>\n<answer>\n{answer}\n</answer>"

    if 'system' in data:
        del data['system']
    data['thought_len'] = thought_len
    data['answer_len'] = answer_len
    return data

r1_dataset = load_dataset("ServiceNow-AI/R1-Distill-SFT", "v1")['train']
r1_dataset.shuffle(123)
r1_dataset = r1_dataset.select(range(90_000))
r1_dataset = r1_dataset.map(r1distillsft_conv)
r1_dataset = r1_dataset.filter(lambda x: length_filter(x, 256))
delete_keys = list(r1_dataset.column_names)
r1_dataset = r1_dataset.map(lambda x: {"conversations": tokenizer.apply_chat_template(x['reannotated_messages'], tools=None, tokenize=False)})
r1_dataset = r1_dataset.remove_columns(delete_keys)
print("R1-distill dataset length (after filter):", len(r1_dataset))

R1-distill dataset length (after filter): 29290


In [9]:
long, short = 0, 0
for idx, d in enumerate(r1_dataset):
    
    # print(d['conversations'])
#     s = d['conversations'].replace('''<|im_start|>system
# You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
# Respond in the following format:
# <think>
# Let's think step by step...
# </think>
# <answer>
# The final answer is...
# </answer><|im_end|>''', '')
#     think_cnt = s.count("<think>")
#     ans_cnt = s.count("<answer>")

#     # print(s)
#     if think_cnt > 1 or ans_cnt > 1:
#         print("Check idx:", idx)
#         break
    # break

    en = tokenizer.encode(d['conversations'])
    if len(en) > 832:
        # print(idx, len(en))
        long += 1
    else:
        short += 1

print(long, short)

11770 17520


In [10]:
long, short = 0, 0
for idx, d in enumerate(r1_dataset):
    
    # print(d['conversations'])
#     s = d['conversations'].replace('''<|im_start|>system
# You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
# Respond in the following format:
# <think>
# Let's think step by step...
# </think>
# <answer>
# The final answer is...
# </answer><|im_end|>''', '')
#     think_cnt = s.count("<think>")
#     ans_cnt = s.count("<answer>")

#     # print(s)
#     if think_cnt > 1 or ans_cnt > 1:
#         print("Check idx:", idx)
#         break
    # break

    en = tokenizer.encode(d['conversations'])
    if len(en) > 1024:
        # print(idx, len(en))
        long += 1
    else:
        short += 1

print(long, short)

4973 24317


In [None]:
# 48026

In [13]:
print(d['conversations'])

<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
Respond in the following format:
<think>
Let's think step by step...
</think>
<answer>
The final answer is...
</answer><|im_end|>
<|im_start|>user
Michael and Thomas are selling their lego collections. They agree to split any money they earn. They sell them based on how many circles are on top. Each circle costs 1 cent. They sold a certain number of single pieces, 45 double pieces, 50 triple pieces and 165 quadruple pieces. They earned $5 each. How many single pieces did they sell?<|im_end|>
<|im_start|>assistant
<think>
First, I need to determine the total amount of money Michael and Thomas earned together. Since each earned $5 and they split the money equally, the total earnings amount to $10.

Next, I'll calculate the earnings from the double, triple, and quadruple pieces separately.

For the double pieces, 

In [12]:
def extract_tag(input_str, tag):
    tool_def = re.findall(f"<{tag}>(.*?)</{tag}>", input_str, re.DOTALL)
    tool_def = map(str.strip, tool_def)
    tool_def = filter(lambda x: len(x) > 0, tool_def)
    return list(tool_def)

def hermes_fc_thinking(raw_data):
    data = deepcopy(raw_data['conversations'])
    seq = []
    tool_def = None
    tool_names = None
    for d in data:
        if d['role'] == 'system':
            tool_def = extract_tag(d['content'], 'tools')
            if len(tool_def) != 0:
                try:
                    tool_def = ast.literal_eval(tool_def[0])
                    tool_names = [tool['function']['name'] for tool in tool_def]
                    continue
                except Exception as E:
                    return {"conversations": ""}
            else:
                return {"conversations": ""}

        seq.append({})
        seq[-1]['role'] = {"human": "user", "model": "assistant", "system": "system", "tool": "tool"}[d['role']]
        seq[-1]['content'] = d['content']
        if seq[-1]['role'] == 'assistant':
            seq[-1]['content'] = seq[-1]['content'].replace('<think>', '<think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('</think>', '</think>\n')
            # seq[-1]['content'] = seq[-1]['content'].replace('<tool_call>\n', '<tool_call>\n[')
            # seq[-1]['content'] = seq[-1]['content'].replace('\n</tool_call>', ']\n</tool_call>')
            tool_calls = re.findall(r"<tool_call>(.*?)</tool_call>", seq[-1]['content'], re.DOTALL)
            seq[-1]['tool-call'] = []
            if tool_calls:
                # print(tool_calls, tool_def)
                for tool_call in tool_calls:
                    try:
                        tool_call = json.loads(tool_call.strip().replace("'", '"'))
                        if tool_call['name'] not in tool_names:
                            raise NotImplementedError
                        seq[-1]['tool_call'] = tool_call
                    except Exception as E:
                        return {"conversations": ""}
        if seq[-1]['role'] == 'tool':
            seq[-1]['content'] = seq[-1]['content'].replace("<tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].replace("</tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].strip()
        # seq[-1]['content'] = d['value']
    
    random.shuffle(tool_def)
    ret = tokenizer.apply_chat_template(seq, tools=tool_def, tokenize=False, add_generation_prompt=False) #+ "<tool_call>\n"
    return {"conversations": ret}

fc_dataset = load_dataset("Jofthomas/hermes-function-calling-thinking-V1")['train']
# fc_dataset = fc_dataset.select(range(100))
fc_dataset = fc_dataset.map(hermes_fc_thinking)
fc_dataset = fc_dataset.filter(lambda x: len(x['conversations']) > 0)
print("Function calling dataset length (after filter):", len(fc_dataset))

Function calling dataset length (after filter): 3497


In [13]:
print(fc_dataset[0]['conversations'])

<|im_start|>system
You are a helpful AI assistant named SmolThink.

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> tags:
<tools>
{"type": "function", "function": {"name": "get_stock_price", "description": "Get the current stock price of a company", "parameters": {"type": "object", "properties": {"company": {"type": "string", "description": "The name of the company"}}, "required": ["company"]}}}
{"type": "function", "function": {"name": "get_movie_details", "description": "Get details about a movie", "parameters": {"type": "object", "properties": {"title": {"type": "string", "description": "The title of the movie"}}, "required": ["title"]}}}
</tools>

You first think/plan inside <think></think> tags.
Then for each function call, return a json object with function name and arguments within <tool_call></tool_call> tags.<|im_end|>
<|im_start|>user
Hi, can you tell me the current stock price of App

In [4]:
ds = load_from_disk("/Users/ohi/Documents/GitHub/PersonalAssistant/dataset")

In [5]:
ds

Dataset({
    features: ['instruction', 'input', 'output', 'thought_len', 'answer_len', 'conversations'],
    num_rows: 34125
})

In [11]:
for d in ds:
    conv = d['conversations']
    if "</thoughts>" in conv:
        print(conv)
        # break

<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside <think></think> tag and provide final answer to user query inside <answer></answer> tag.<|im_end|>
<|im_start|>user
How can I use CSS preprocessors like Sass or Less to write more efficient and modular CSS code?<|im_end|>
<|im_start|>assistant
<think>
Before organizing my code, I should consider the following to avoid mistakes:
    *   Make sure to create a clear directory structure to keep related files together.
    *   Use a consistent naming convention for your files and variables to avoid confusion.
    *   Be mindful of the file path and import statements to avoid errors.
    
    Before defining variables, I should consider the following to avoid mistakes:
    *   Make sure to use a consistent naming convention for variables to avoid confusion.
    *   Avoid using magic numbers or hardcoded values, as they can be difficult to maintain.
    *   Consider using a preprocessor-s

In [17]:
def openthought_code(data):
    thought_len, answer_len = 0, 0

    reply = data['output']
    thought = re.findall(r"<thoughts>(.*?)</thoughts>", reply, re.DOTALL)
    if len(thought) > 1:
        print(thought)
    thought = ''.join(thought).strip()
    thought_len += len(thought.split())
    
    end_tag = "</thoughts>"
    answer = reply[reply.find(end_tag)+len(end_tag):]
    answer = answer.strip()
    answer_len += len(answer.split()) #len(tokenizer.encode(answer))

    if end_tag not in reply:
        answer_len = 0

    final_answer = f"<think>\n{thought}\n</think>\n<answer>\n{answer}\n</answer>"
    final_answer = final_answer.replace("<thoughts>", "").replace("</thoughts>", "")
    
    output_data = {
        'thought_len': thought_len,
        'answer_len': answer_len,
        'conversations': [
            {"role": "user", 'content': data['input']},
            {"role": "assistant", 'content': final_answer}
        ]
    }

    return output_data

openthought_dataset = load_dataset("XeTute/Open-Coding-Thoughts")['train']
print("Dataset length:", len(openthought_dataset))
openthought_dataset = openthought_dataset.map(openthought_code)
openthought_dataset = openthought_dataset.map(lambda x: {"conversations": tokenizer.apply_chat_template(x['conversations'], tools=None, tokenize=False)})
print("OpenThought dataset length (after filter):", len(openthought_dataset))
# print(openthought_dataset[0]['conversations'])

Using the latest cached version of the dataset since XeTute/Open-Coding-Thoughts couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/ohi/.cache/huggingface/datasets/XeTute___open-coding-thoughts/default/0.0.0/b63d89b184b9048a18e2cd42be298db6e44ab255 (last modified on Sun Mar 16 12:25:21 2025).


Dataset length: 1025


Map: 100%|██████████| 1025/1025 [00:00<00:00, 15778.30 examples/s]


['\n    Before organizing my code, I should consider the following to avoid mistakes:\n    *   Make sure to create a clear directory structure to keep related files together.\n    *   Use a consistent naming convention for your files and variables to avoid confusion.\n    *   Be mindful of the file path and import statements to avoid errors.\n    ', "\n    Before defining variables, I should consider the following to avoid mistakes:\n    *   Make sure to use a consistent naming convention for variables to avoid confusion.\n    *   Avoid using magic numbers or hardcoded values, as they can be difficult to maintain.\n    *   Consider using a preprocessor-specific syntax for variables, such as Sass's `$` symbol or Less's `@` symbol.\n    ", "\n    Before creating mixins and functions, I should consider the following to avoid mistakes:\n    *   Make sure to use a consistent naming convention for mixins and functions to avoid confusion.\n    *   Avoid over-engineering and keep the code simp

Map: 100%|██████████| 1025/1025 [00:00<00:00, 13955.78 examples/s]

OpenThought dataset length (after filter): 1025



