In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
from datasets import load_dataset, concatenate_datasets,  load_from_disk
import peft

# from safetensors.torch import load_model, save_model

import random
import re
import json
import ast
from copy import deepcopy
from enum import Enum

from typing import Optional
from jinja2 import Template
from transformers.utils import get_json_schema

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SIZE = "360M"
MODEL_PATH = f"HuggingFaceTB/SmolLM2-{SIZE}-Instruct"
LORA_PATH = None
# dataset = load_from_disk("/Users/ohi/Documents/GitHub/PersonalAssistant/dataset")
dataset = None

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    device_map="cpu",
    low_cpu_mem_usage=True,
    # attn_implementation='sdpa',
    # attn_implementation='eager', # 'flash_attention_2',
    torch_dtype=torch.bfloat16,
    tie_word_embeddings=True,
    trust_remote_code=True,
    use_cache=False
)

# Gradient checkpointing - Could take more memory in MPS
# model.gradient_checkpointing_enable(dict(use_reentrant=False))
model.gradient_checkpointing_disable()
# model.resize_token_embeddings(49162)
print(f"Model took {model.get_memory_footprint()/1e9:.2f} GB of space (with buffer)")

Model took 0.72 GB of space (with buffer)


In [16]:
model = peft.PeftModel.from_pretrained(
   model,
   "/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-360M-sft-r64-old/checkpoint-8700/smolthink",
   is_trainable=False, # 👈 here,
)

In [17]:
print(model)
# 49152

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(49152, 960, padding_idx=2)
          (modules_to_save): ModuleDict(
            (default): Embedding(49152, 960, padding_idx=2)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=960, out_features=960, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=960, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=960, bias=False)
                )
                (lora_embe

In [23]:
# if lora_r:
lora_r = 64
# SAVE_PATH += f'r{lora_r}'
peft_config = peft.LoraConfig(
    r=lora_r,                   # 64
    lora_alpha=2*lora_r,        # alpha = 4 * r
    lora_dropout=0.05,
    target_modules='all-linear',
    modules_to_save = [
        "embed_tokens", 
        "lm_head"
    ],
    use_rslora=True,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    inference_mode=False,
)
model = peft.get_peft_model(model, peft_config, adapter_name="smolthink", autocast_adapter_dtype=False)

# Sanity check
non_lora_param = 0
lora_param = 0
lora_layers = 0
for name, param in model.named_parameters():
    if 'lora' in name:
        # param.requires_grad = True
        assert param.requires_grad == True, f"{name} is not trainable"
        lora_param += param.numel()
        lora_layers += 1
    else:
        # if not param.requires_grad:
        #     print(f"{name} is trainable")
        non_lora_param += param.numel()

    if 'lm_head' in name:
        print("lm_head ->", name, ":", param.requires_grad)
    if 'embed_tokens' in name:
        print("embed_tokens ->", name, ":", param.requires_grad)


def into_million(val):
    return f"{val / 1000 / 1000 :.2f} million"

# print("LoRA adapter added.")
print(f"Total LoRA params: {into_million(lora_param)} ({(lora_param/non_lora_param)*100:.2f} %) = {into_million(lora_param)}")
print(f"Total LoRA layers: {lora_layers}")
print(f"Approx size: {lora_param * 2e-6:.2f} mb")

embed_tokens -> base_model.model.model.embed_tokens.original_module.weight : False
embed_tokens -> base_model.model.model.embed_tokens.modules_to_save.smolthink.weight : True
lm_head -> base_model.model.lm_head.modules_to_save.smolthink.weight : True
Total LoRA params: 34.73 million (7.61 %) = 34.73 million
Total LoRA layers: 448
Approx size: 69.47 mb


In [27]:
model.base_model.model.model.embed_tokens

ModulesToSaveWrapper(
  (original_module): Embedding(49152, 960, padding_idx=2)
  (modules_to_save): ModuleDict(
    (smolthink): Embedding(49152, 960, padding_idx=2)
  )
)

In [None]:
print("Is same weight of embed_tokens and lm_head?", torch.equal(model.base_model.model.model.embed_tokens.modules_to_save["smolthink"].weight, model.base_model.model.lm_head.modules_to_save["smolthink"].weight))
print(model.base_model.model.model.embed_tokens.original_module.weight.data.data_ptr() == model.base_model.model.lm_head.original_module.weight.data.data_ptr())

Is same weight of embed_tokens and lm_head? True
True


In [None]:
model.base_model.model.model.decoder.embed_tokens.modules_to_save["default"].weight = model.base_model.model.lm_head.modules_to_save["default"].weight

In [None]:
model = model.merge_and_unload(safe_merge=True).eval().to(torch.bfloat16)
print(f"Model took {model.get_memory_footprint()/1e9:.2f} GB of space (with buffer)")

Model took 0.72 GB of space (with buffer)


In [25]:
print(sum(p.numel() for p in model.parameters()) / 1e6)

409.00704


In [None]:
embed_tokens -> base_model.model.model.embed_tokens.original_module.weight : False
embed_tokens -> base_model.model.model.embed_tokens.modules_to_save.smolthink.weight : True
lm_head -> base_model.model.lm_head.modules_to_save.smolthink.weight : True
Total LoRA params: 34.73 million (7.61 %) = 34.73 million
Total LoRA layers: 448
Approx size: 69.47 mb
Model took 0.82 GB of space (with buffer)
409.00704

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

In [16]:
total_param = 0
for name, param in model.named_parameters():
    print('lora' in name.lower())
    total_param += param.numel()

print(lora_param)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [5]:
chat_template = """{%- if tools %}
    {{- '<|im_start|>system\\n' }}
        {%- if messages[0]['role'] == 'system' %}
            {- messages[0]['content'] }}
        {%- else %}
            {{- 'You are a helpful AI assistant named SmolThink.' }}
        {%- endif %}
    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> tags:\\n<tools>\" }}
    {%- for tool in tools %}
        {{- \"\\n\" }}
            {{- tool | tojson }}
    {%- endfor %}
    {{- \"\\n</tools>\\n\\nYou first think/plan inside <think></think> tags.\\nThen for each function call, return a json object with function name and arguments within <tool_call></tool_call> tags.<|im_end|>\\n\" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}
    {%- else %}
        {{- '<|im_start|>system\\nYou are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside \\'think\\' tag and provide final answer to user query inside \\'answer\\' tag.\\nRespond in the following format:\\n<think>\\nLet\\'s think step by step...\\n</think>\\n<answer>\\nThe final answer is...\\n</answer><|im_end|>\\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}
    {%- elif message.role == \"assistant\" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\\n<tool_call>\\n{\"name\": \"' }}
            {{- tool_call.name }}
            {{- '\", \"arguments\": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\\n' }}
    {%- elif message.role == \"tool\" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\\n<tool_response>\\n' }}
        {{- message.content }}
        {{- '\\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}
            {{- '<|im_end|>\\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\\n' }}
{%- endif %}"""

special_tokens_dict = {
    "bos_token": "<|im_start|>",
    "eos_token": "<|im_end|>",
    "pad_token": "<|im_end|>",
    "unk_token": "<|endoftext|>",
    "think_start": "<think>",
    "think_end": "</think>",
    "answer_start": "<answer>",
    "answer_end": "</answer>",
    "tool_def_start": "<tool>",
    "tool_def_end": "</tool>",
    "tool_call_start": "<tool_call>",
    "tool_call_end": "</tool_call>",
    "tool_res_start": "<tool_response>",
    "tool_res_end": "</tool_response>",
}

class SpecialTokens(str, Enum):
    think_start = "<think>",
    think_end = "</think>",
    answer_start = "<answer>",
    answer_end = "</answer>",
    tool_def_start = "<tool>",
    tool_def_end = "</tool>",
    tool_call_start = "<tool_call>",
    tool_call_end = "</tool_call>",
    tool_res_start = "<tool_response>",
    tool_res_end = "</tool_response>",

    @classmethod
    def list(cls):
        return [c.value for c in cls]

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    add_bos_token=True,
    add_eos_token=True,
    # additional_special_tokens=SpecialTokens.list()
)
tokenizer.chat_template = chat_template
# tokenizer.pad_token = tokenizer.eos_token
streamer = TextStreamer(tokenizer, skip_prompt=True)

print("Tokenizer length:", len(tokenizer))
tokenizer.save_pretrained("SmolThink-360M-Tokenizer")


# tokenizer.add_special_tokens(
#     # special_tokens_dict=special_tokens_dict, 
#     {"additional_special_tokens":["<think>"]},
#     replace_additional_special_tokens=True)
# print("Tokenizer length:", len(tokenizer))

# print("New token map")
# for v in SpecialTokens.list():
#     print(v, '->', tokenizer.encode(v))
# print("---")

# print(tokenizer.apply_chat_template([
#     {"role": "user", "content": "How are you?"},
#     {"role": "assistant", "content": "I am fine"}
# ], tokenize=False))

tools = [
    {
        "type": "function",
        "function": {
            "name": "retrieve_payment_status",
            "description": "Get payment status of a transaction",
            "parameters": {
                "type": "object",
                "properties": {
                    "transaction_id": {
                        "type": "string",
                        "description": "The transaction id.",
                    }
                },
                "required": ["transaction_id"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "retrieve_payment_date",
            "description": "Get payment date of a transaction",
            "parameters": {
                "type": "object",
                "properties": {
                    "transaction_id": {
                        "type": "string",
                        "description": "The transaction id.",
                    }
                },
                "required": ["transaction_id"],
            },
        },
    }
]
# print("\n-----\n")
# print(tokenizer.apply_chat_template([
#     {"role": "user", "content": "How are you?"},
#     {"role": "assistant", "content": "<tool_call>[retrieve_payment_date(12)]</tool_call>"},
#     {"role": "tool", "content": "12/12/12"},
#     {"role": "assistant", "content": "12/12/12"}
# ], tools=tools, tokenize=False))

print(tokenizer.encode("<think>"))
print(tokenizer.encode("<|im_start|>"))

Tokenizer length: 49152
[44, 17400, 46]
[1]


In [7]:
def extract_tag(input_str, tag):
    tool_def = re.findall(f"<{tag}>(.*?)</{tag}>", input_str, re.DOTALL)
    tool_def = map(str.strip, tool_def)
    tool_def = filter(lambda x: len(x) > 0, tool_def)
    return list(tool_def)

def hermes_fc_thinking(raw_data):
    data = deepcopy(raw_data['conversations'])
    seq = []
    tool_def = None
    tool_names = None
    for d in data:
        if d['role'] == 'system':
            tool_def = extract_tag(d['content'], 'tools')
            if len(tool_def) != 0:
                try:
                    tool_def = ast.literal_eval(tool_def[0])
                    tool_names = [tool['function']['name'] for tool in tool_def]
                    continue
                except Exception as E:
                    return {"conversations": ""}
            else:
                return {"conversations": ""}

        seq.append({})
        seq[-1]['role'] = {"human": "user", "model": "assistant", "system": "system", "tool": "tool"}[d['role']]
        seq[-1]['content'] = d['content']
        if seq[-1]['role'] == 'assistant':
            seq[-1]['content'] = seq[-1]['content'].replace('<think>', '<think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('</think>', '</think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('<tool_call>\n', '<tool_call>\n[')
            seq[-1]['content'] = seq[-1]['content'].replace('\n</tool_call>', ']\n</tool_call>')
            tool_calls = re.findall(r"<tool_call>(.*?)</tool_call>", seq[-1]['content'], re.DOTALL)
            if tool_calls:
                # print(tool_calls, tool_def)
                try:
                    tool_calls = json.loads(tool_calls[0].strip().replace("'", '"'))
                    for tool_call in tool_calls:
                        if tool_call['name'] not in tool_names:
                            raise NotImplementedError
                except Exception as E:
                    return {"conversations": ""}
        if seq[-1]['role'] == 'tool':
            seq[-1]['content'] = seq[-1]['content'].replace("<tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].replace("</tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].strip()
        # seq[-1]['content'] = d['value']
    
    random.shuffle(tool_def)
    print(seq)

    raise ValueError
    ret = tokenizer.apply_chat_template(seq, tools=tool_def, tokenize=False, add_generation_prompt=False) #+ "<tool_call>\n"
    return {"conversations": ret}

fc_dataset = load_dataset("Jofthomas/hermes-function-calling-thinking-V1")['train']
# fc_dataset = fc_dataset.select(range(100))
fc_dataset = fc_dataset.map(hermes_fc_thinking)
fc_dataset = fc_dataset.filter(lambda x: len(x['conversations']) > 0)
print("Function calling dataset length (after filter):", len(fc_dataset))

Map:   0%|          | 0/3570 [00:00<?, ? examples/s]

[{'role': 'user', 'content': 'Hi, can you tell me the current stock price of Apple?'}, {'role': 'assistant', 'content': "<think>\nAlright, the user asked for the current stock price of Apple. Looking at the tools available, there's a function called get_stock_price which exactly fits this request. I'll need to call this function with the parameter company set to 'Apple' to retrieve the information the user is asking for. This seems like the most straightforward and appropriate action to take.\n</think>\n<tool_call>\n[{'name': 'get_stock_price', 'arguments': {'company': 'Apple'}}]\n</tool_call>"}, {'role': 'tool', 'content': "{'stock_price': '$150.75'}"}, {'role': 'assistant', 'content': 'The current stock price of Apple is $150.75.'}, {'role': 'user', 'content': "That's great. Can you also tell me the stock price for Microsoft?"}, {'role': 'assistant', 'content': "<think>\nAlright, let me try to figure out how to handle this user's request. The user first asked about Apple's stock pric




ValueError: 

In [7]:
print(model.cpu().resize_token_embeddings(len(tokenizer)))

Embedding(49162, 960, padding_idx=2)


In [9]:
model.to('mps')

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49162, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

In [35]:
new_tokenizer = AutoTokenizer.from_pretrained(
    "/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer",
    add_bos_token=True,
    add_eos_token=True,
    # additional_special_tokens=SpecialTokens.list()
)

In [36]:
new_tokenizer.encode("<think>")

[49152]

In [31]:
tokenizer.save_pretrained("/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer")

('/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/tokenizer_config.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/special_tokens_map.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/vocab.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/merges.txt',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/added_tokens.json',
 '/Users/ohi/Documents/GitHub/PersonalAssistant/SmolThink-Tokenizer/tokenizer.json')

In [29]:
print(tokenizer.pad_token, tokenizer.eos_token)
print(tokenizer.encode("<|im_end|>"))
print(tokenizer.decode([9]))

<|im_end|> <|im_end|>
[2]
<issue_comment>


In [5]:
def length_filter(data, limit):
    # if data['thought_len'] + data['answer_len'] > 896:
        # return False
    return 0 < data['thought_len'] <= limit and 0 < data['answer_len']

In [8]:
def r1distillsft_conv(data):
    thought_len, answer_len = 0, 0
    for idx, conv in enumerate(data['reannotated_messages']):
        # print(conv)
        role = conv['role']
        if role == 'assistant':
            reply = data['reannotated_messages'][idx]['content']
            # print(reply)
            thought = re.findall(r"<think>(.*?)</think>", reply, re.DOTALL)
            thought = ''.join(thought).strip()
            thought_len += len(thought.split()) #len(tokenizer.encode(thought))

            end_tag = "</think>"
            if end_tag in reply:
                answer = reply[reply.find(end_tag)+len(end_tag):]
                answer = answer.strip()
            else:
                answer = ''
            if thought.lower() == answer.lower():
                answer = ''
            # print("Think:", thought)
            # print("Answer:", answer)
            # print("----")
            answer_len += len(answer.split()) #len(tokenizer.encode(answer))
            data['reannotated_messages'][idx]['content'] = f"<think>\n{thought}\n</think>\n<answer>\n{answer}\n</answer>"

    if 'system' in data:
        del data['system']
    data['thought_len'] = thought_len
    data['answer_len'] = answer_len
    return data

r1_dataset = load_dataset("ServiceNow-AI/R1-Distill-SFT", "v1")['train']
r1_dataset.shuffle(123)
r1_dataset = r1_dataset.select(range(90_000))
r1_dataset = r1_dataset.map(r1distillsft_conv)
r1_dataset = r1_dataset.filter(lambda x: length_filter(x, 256))
delete_keys = list(r1_dataset.column_names)
r1_dataset = r1_dataset.map(lambda x: {"conversations": tokenizer.apply_chat_template(x['reannotated_messages'], tools=None, tokenize=False)})
r1_dataset = r1_dataset.remove_columns(delete_keys)
print("R1-distill dataset length (after filter):", len(r1_dataset))

R1-distill dataset length (after filter): 29290


In [13]:
from tqdm import tqdm

class DatasetGen_v1(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.cache = None
        self.cache_idx = -1
        self.cache_len = 0
        self.indices = []
        self._get_len()

    def _get_len(self):
        print("Computing dataset length")
        for idx in tqdm(range(len(self.dataset))):
            self.gen(idx)
            for i in range(self.cache_len):
                self.indices.append((idx, i))
        print("Total length of data:", len(self.indices))
    

    def __len__(self):
        return len(self.indices)
    
    def gen(self, idx):
        self.cache = self.dataset[idx]['conversations'].rstrip()
        self.cache = self.tokenizer(
            self.cache,
            max_length=CONTEXT_LEN,
            truncation=True,
            return_overflowing_tokens=True, # Return the overflowing tokens
            stride=CONTEXT_LEN // 8,
            padding='max_length'
        )
        self.cache_idx = idx
        self.cache_len = len(self.cache['input_ids'])
    
    def __getitem__(self, idx):
        p, q = self.indices[idx]
        if self.cache_idx != p:
            self.gen(p)
        
        input_ids = self.cache['input_ids'][q]
        attention_mask = self.cache['attention_mask'][q]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }


CONTEXT_LEN = 832 # 1024
train_ds = DatasetGen_v1(
    dataset=r1_dataset.select(range(10)), 
    tokenizer=tokenizer
)

Computing dataset length


100%|██████████| 10/10 [00:00<00:00, 666.19it/s]

Total length of data: 10





In [17]:
for (t, m) in zip(train_ds[0]['input_ids'], train_ds[0]['attention_mask']):
    if m == 1:
        print(tokenizer.decode(t), end='')

<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
Respond in the following format:
<think>
Let's think step by step...
</think>
<answer>
The final answer is...
</answer><|im_end|>
<|im_start|>user
There were 27 boys and 35 girls on the playground at recess. There were _____ children on the playground at recess.<|im_end|>
<|im_start|>assistant
<think>
First, I need to determine the total number of children on the playground by adding the number of boys and girls.

There are 27 boys and 35 girls.

Adding these together: 27 boys + 35 girls = 62 children.

Therefore, the total number of children on the playground is 62.
</think>
<answer>
To find the total number of children on the playground, we simply add the number of boys and girls together.

\[
\text{Total children} = \text{Number of boys} + \text{Number of girls}
\]

Plugging in the given values:

\[
\text{To

In [9]:
long, short = 0, 0
for idx, d in enumerate(r1_dataset):
    
    # print(d['conversations'])
#     s = d['conversations'].replace('''<|im_start|>system
# You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
# Respond in the following format:
# <think>
# Let's think step by step...
# </think>
# <answer>
# The final answer is...
# </answer><|im_end|>''', '')
#     think_cnt = s.count("<think>")
#     ans_cnt = s.count("<answer>")

#     # print(s)
#     if think_cnt > 1 or ans_cnt > 1:
#         print("Check idx:", idx)
#         break
    # break

    en = tokenizer.encode(d['conversations'])
    if len(en) > 832:
        # print(idx, len(en))
        long += 1
    else:
        short += 1

print(long, short)

11770 17520


In [10]:
long, short = 0, 0
for idx, d in enumerate(r1_dataset):
    
    # print(d['conversations'])
#     s = d['conversations'].replace('''<|im_start|>system
# You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
# Respond in the following format:
# <think>
# Let's think step by step...
# </think>
# <answer>
# The final answer is...
# </answer><|im_end|>''', '')
#     think_cnt = s.count("<think>")
#     ans_cnt = s.count("<answer>")

#     # print(s)
#     if think_cnt > 1 or ans_cnt > 1:
#         print("Check idx:", idx)
#         break
    # break

    en = tokenizer.encode(d['conversations'])
    if len(en) > 1024:
        # print(idx, len(en))
        long += 1
    else:
        short += 1

print(long, short)

4973 24317


In [None]:
# 48026

In [13]:
print(d['conversations'])

<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
Respond in the following format:
<think>
Let's think step by step...
</think>
<answer>
The final answer is...
</answer><|im_end|>
<|im_start|>user
Michael and Thomas are selling their lego collections. They agree to split any money they earn. They sell them based on how many circles are on top. Each circle costs 1 cent. They sold a certain number of single pieces, 45 double pieces, 50 triple pieces and 165 quadruple pieces. They earned $5 each. How many single pieces did they sell?<|im_end|>
<|im_start|>assistant
<think>
First, I need to determine the total amount of money Michael and Thomas earned together. Since each earned $5 and they split the money equally, the total earnings amount to $10.

Next, I'll calculate the earnings from the double, triple, and quadruple pieces separately.

For the double pieces, 

In [12]:
def extract_tag(input_str, tag):
    tool_def = re.findall(f"<{tag}>(.*?)</{tag}>", input_str, re.DOTALL)
    tool_def = map(str.strip, tool_def)
    tool_def = filter(lambda x: len(x) > 0, tool_def)
    return list(tool_def)

def hermes_fc_thinking(raw_data):
    data = deepcopy(raw_data['conversations'])
    seq = []
    tool_def = None
    tool_names = None
    for d in data:
        if d['role'] == 'system':
            tool_def = extract_tag(d['content'], 'tools')
            if len(tool_def) != 0:
                try:
                    tool_def = ast.literal_eval(tool_def[0])
                    tool_names = [tool['function']['name'] for tool in tool_def]
                    continue
                except Exception as E:
                    return {"conversations": ""}
            else:
                return {"conversations": ""}

        seq.append({})
        seq[-1]['role'] = {"human": "user", "model": "assistant", "system": "system", "tool": "tool"}[d['role']]
        seq[-1]['content'] = d['content']
        if seq[-1]['role'] == 'assistant':
            seq[-1]['content'] = seq[-1]['content'].replace('<think>', '<think>\n')
            seq[-1]['content'] = seq[-1]['content'].replace('</think>', '</think>\n')
            # seq[-1]['content'] = seq[-1]['content'].replace('<tool_call>\n', '<tool_call>\n[')
            # seq[-1]['content'] = seq[-1]['content'].replace('\n</tool_call>', ']\n</tool_call>')
            tool_calls = re.findall(r"<tool_call>(.*?)</tool_call>", seq[-1]['content'], re.DOTALL)
            seq[-1]['tool-call'] = []
            if tool_calls:
                # print(tool_calls, tool_def)
                for tool_call in tool_calls:
                    try:
                        tool_call = json.loads(tool_call.strip().replace("'", '"'))
                        if tool_call['name'] not in tool_names:
                            raise NotImplementedError
                        seq[-1]['tool_call'] = tool_call
                    except Exception as E:
                        return {"conversations": ""}
        if seq[-1]['role'] == 'tool':
            seq[-1]['content'] = seq[-1]['content'].replace("<tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].replace("</tool_response>", "")
            seq[-1]['content'] = seq[-1]['content'].strip()
        # seq[-1]['content'] = d['value']
    
    random.shuffle(tool_def)
    ret = tokenizer.apply_chat_template(seq, tools=tool_def, tokenize=False, add_generation_prompt=False) #+ "<tool_call>\n"
    return {"conversations": ret}

fc_dataset = load_dataset("Jofthomas/hermes-function-calling-thinking-V1")['train']
# fc_dataset = fc_dataset.select(range(100))
fc_dataset = fc_dataset.map(hermes_fc_thinking)
fc_dataset = fc_dataset.filter(lambda x: len(x['conversations']) > 0)
print("Function calling dataset length (after filter):", len(fc_dataset))

Function calling dataset length (after filter): 3497


In [13]:
print(fc_dataset[0]['conversations'])

<|im_start|>system
You are a helpful AI assistant named SmolThink.

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> tags:
<tools>
{"type": "function", "function": {"name": "get_stock_price", "description": "Get the current stock price of a company", "parameters": {"type": "object", "properties": {"company": {"type": "string", "description": "The name of the company"}}, "required": ["company"]}}}
{"type": "function", "function": {"name": "get_movie_details", "description": "Get details about a movie", "parameters": {"type": "object", "properties": {"title": {"type": "string", "description": "The title of the movie"}}, "required": ["title"]}}}
</tools>

You first think/plan inside <think></think> tags.
Then for each function call, return a json object with function name and arguments within <tool_call></tool_call> tags.<|im_end|>
<|im_start|>user
Hi, can you tell me the current stock price of App

In [4]:
ds = load_from_disk("/Users/ohi/Documents/GitHub/PersonalAssistant/datasets/merged_dataset")

In [5]:
ds

Dataset({
    features: ['instruction', 'input', 'output', 'thought_len', 'answer_len', 'conversations'],
    num_rows: 34125
})

In [9]:
for d in ds:
    conv = d['conversations']
    if conv.count("</answer>") > 2:
        print(conv)
        break

In [17]:
def openthought_code(data):
    thought_len, answer_len = 0, 0

    reply = data['output']
    thought = re.findall(r"<thoughts>(.*?)</thoughts>", reply, re.DOTALL)
    if len(thought) > 1:
        print(thought)
    thought = ''.join(thought).strip()
    thought_len += len(thought.split())
    
    end_tag = "</thoughts>"
    answer = reply[reply.find(end_tag)+len(end_tag):]
    answer = answer.strip()
    answer_len += len(answer.split()) #len(tokenizer.encode(answer))

    if end_tag not in reply:
        answer_len = 0

    final_answer = f"<think>\n{thought}\n</think>\n<answer>\n{answer}\n</answer>"
    final_answer = final_answer.replace("<thoughts>", "").replace("</thoughts>", "")
    
    output_data = {
        'thought_len': thought_len,
        'answer_len': answer_len,
        'conversations': [
            {"role": "user", 'content': data['input']},
            {"role": "assistant", 'content': final_answer}
        ]
    }

    return output_data

openthought_dataset = load_dataset("XeTute/Open-Coding-Thoughts")['train']
print("Dataset length:", len(openthought_dataset))
openthought_dataset = openthought_dataset.map(openthought_code)
openthought_dataset = openthought_dataset.map(lambda x: {"conversations": tokenizer.apply_chat_template(x['conversations'], tools=None, tokenize=False)})
print("OpenThought dataset length (after filter):", len(openthought_dataset))
# print(openthought_dataset[0]['conversations'])

Using the latest cached version of the dataset since XeTute/Open-Coding-Thoughts couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/ohi/.cache/huggingface/datasets/XeTute___open-coding-thoughts/default/0.0.0/b63d89b184b9048a18e2cd42be298db6e44ab255 (last modified on Sun Mar 16 12:25:21 2025).


Dataset length: 1025


Map: 100%|██████████| 1025/1025 [00:00<00:00, 15778.30 examples/s]


['\n    Before organizing my code, I should consider the following to avoid mistakes:\n    *   Make sure to create a clear directory structure to keep related files together.\n    *   Use a consistent naming convention for your files and variables to avoid confusion.\n    *   Be mindful of the file path and import statements to avoid errors.\n    ', "\n    Before defining variables, I should consider the following to avoid mistakes:\n    *   Make sure to use a consistent naming convention for variables to avoid confusion.\n    *   Avoid using magic numbers or hardcoded values, as they can be difficult to maintain.\n    *   Consider using a preprocessor-specific syntax for variables, such as Sass's `$` symbol or Less's `@` symbol.\n    ", "\n    Before creating mixins and functions, I should consider the following to avoid mistakes:\n    *   Make sure to use a consistent naming convention for mixins and functions to avoid confusion.\n    *   Avoid over-engineering and keep the code simp

Map: 100%|██████████| 1025/1025 [00:00<00:00, 13955.78 examples/s]

OpenThought dataset length (after filter): 1025





In [13]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("open-r1/codeforces-cots", "solutions_py_decontaminated")['train']

def take(d):
    return len(d['generation'].split()) <= 2000

ds = ds.filter(take)
print(len(ds))

Filter: 100%|██████████| 8133/8133 [00:08<00:00, 984.77 examples/s] 

769





In [14]:
print(ds[0]['generation'])

<think>
Okay, I need to solve this problem where I have to convert a given integer n into its binary representation without leading zeros. Let's think about how to approach this.

The problem says that n can be up to 1e6. Wait, 1e6 isn't that big. But even so, I need an efficient way to convert it to binary. But what's the standard way to do this in Python?

Hmm, Python has built-in functions for this. Like bin(n), which returns a string starting with '0b', so I can just slice that off. For example, bin(5) gives '0b101', so removing the first two characters gives '101'. That's exactly what the examples show. So for the input 5, output is 101. So using bin() seems straightforward.

But wait, the problem says to output without leading zeros. But bin() automatically doesn't have leading zeros except for the case when n is 0, but since n is a positive integer (as per the problem statement), n can't be zero. So using bin(n)[2:] should give the correct result.

So the code would be something

User: What is 16*8?
Assistant: 16 * 8 = 128


In [22]:
print(processor.chat_template)

<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>
{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}


In [None]:
<|im_start|>{% for message in messages %}
  {{message['role'] | capitalize}}
  {% if message['content'][0]['type'] == 'image' %}{{':'}}
  {% else %}{{': '}}
  {% endif %}
  {% for line in message['content'] %}
    {% if line['type'] == 'text' %}{{line['text']}}
    {% elif line['type'] == 'image' %}{{ '<image>' }}
    {% endif %}
  {% endfor %}<end_of_utterance>
{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}

In [None]:
chunks = ["Muhammad Yunus[a] (born 28 June 1940) is a Bangladeshi economist, businessman, and politician who has been serving as Chief Adviser of the Interim Government of Bangladesh since 8 August 2024.[1] Yunus was awarded the Nobel Peace Prize in 2006 for founding the Grameen Bank and pioneering the concepts of microcredit and microfinance.[2] Yunus has received several other national and international honors, including the United States Presidential Medal of Freedom in 2009 and the Congressional Gold Medal in 2010.[3]"]

In [1]:
from smolagents import DuckDuckGoSearchTool

search_tool = DuckDuckGoSearchTool()
print(search_tool("Who's the current president of Russia?"))

  from .autonotebook import tqdm as notebook_tqdm


## Search Results

[List of presidents of Russia - Wikipedia](https://en.wikipedia.org/wiki/List_of_presidents_of_Russia)
The office of the president of Russia is the highest authority in the Russian Federation.The holder is the federation's head of state and has formal presidency over the State Council as well as being the commander in chief of the Russian Armed Forces.The office was introduced in 1918 after the February Revolution with the current office emerging after a referendum of 1991. [1]

[Vladimir Putin - Wikipedia](https://en.wikipedia.org/wiki/Vladimir_Putin)
Putin, FIFA president Gianni Infantino and French president Emmanuel Macron at the 2018 FIFA World Cup Final in Russia as French forward Kylian Mbappé receives the best young player award In 2007, Putin led a successful effort on behalf of Sochi for the 2014 Winter Olympics and the 2014 Winter Paralympics , [ 469 ] the first Winter Olympic Games to ever be hosted by Russia.

[President of Russia - Wikipedia](https://en

In [None]:
from datasets import load_dataset

def process(data):
    for idx, message in enumerate(data['messages']):
        if message['role'] != 'assistant': continue
        content = message['content']
        tag = "</think>"
        pos = content.find(tag)
        answer = content[pos+len(tag):].strip()
        data['messages'][idx]['content'] = content[:pos].strip() + f"\n</think>\n<answer>\n{answer}\n</answer>"
    return data

# Login using e.g. `huggingface-cli login` to access this dataset
codeforces_cot = load_dataset("open-r1/codeforces-cots", "solutions_py_decontaminated")['train']#.select(range(500))
codeforces_cot = codeforces_cot.filter(lambda x: len(str(x['messages'])) < 8000)
delete_keys = list(codeforces_cot.column_names)
codeforces_cot = codeforces_cot.map(process)
codeforces_cot = codeforces_cot.map(lambda x: {"conversations": tokenizer.apply_chat_template(x['messages'], tools=None, tokenize=False)})
codeforces_cot = codeforces_cot.remove_columns(delete_keys)
print("Codeforces CoT dataset length:", len(codeforces_cot))

Map: 100%|██████████| 251/251 [00:00<00:00, 3643.22 examples/s]
Map: 100%|██████████| 251/251 [00:00<00:00, 4417.61 examples/s]

General reason dataset length: 251





In [34]:
print(codeforces_cot[0]['conversations'])

<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside 'think' tag and provide final answer to user query inside 'answer' tag.
Respond in the following format:
<think>
Let's think step by step...
</think>
<answer>
The final answer is...
</answer><|im_end|>
<|im_start|>user
You will be given a competitive programming problem.
Analyze the maximum input constraints and identify the optimal algorithmic approach and data structures needed to process the largest possible test cases within the time and memory limits, then explain why your chosen implementation strategy is the most efficient solution. Please reason step by step about your solution approach, then provide a complete implementation in Python 3 that is thoroughly optimized for both speed and memory usage.

Your solution must read input from standard input (input()), write output to standard output (print()).
Do not include any debug prints or additional output.

Put your final sol

In [8]:
pip uninstall smolagents

Found existing installation: smolagents 1.12.0
Uninstalling smolagents-1.12.0:
  Would remove:
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/bin/smolagent
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/bin/webagent
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/lib/python3.12/site-packages/smolagents-1.12.0.dist-info/*
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/lib/python3.12/site-packages/smolagents/*
Proceed (Y/n)? ^C
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Step 1: Install BeautifulSoup and requests
# Open your terminal or command prompt and run the following commands:
# pip install beautifulsoup4
# pip install requests

# Step 2: Import BeautifulSoup and requests
from bs4 import BeautifulSoup
import requests

# Step 3: Load the HTML content
url = 'https://www.geeksforgeeks.org/best-ci-cd-tools/'
response = requests.get(url)
html_content = response.text

# Step 4: Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Step 5: Extract text from HTML
# Example: Extract the text from a specific element
# element = soup.find('div',) #class_='example')
text_content = soup.get_text()

text_content = '\n'.join(filter(lambda x: len(x.strip()) > 0, text_content.split('\n\n')))

# Step 6: Print the extracted text
print(text_content)


15 Best CI/CD Tools That You Should Know - GeeksforGeeks

Skip to content
CoursesDSA to DevelopmentMachine Learning & Data ScienceGenerative AI & ChatGPTBecome AWS CertifiedDSA CoursesData Structure & Algorithm(C++/JAVA)Data Structure & Algorithm(Python)Data Structure & Algorithm(JavaScript)Programming LanguagesCPPJavaPythonJavaScriptCAll CoursesTutorialsPythonPython TutorialPython ProgramsPython QuizPython ProjectsPython Interview QuestionsPython Data StructuresJavaJava TutorialJava CollectionsJava 8 TutorialJava ProgramsJava QuizJava ProjectsJava Interview QuestionsAdvanced JavaProgramming LanguagesJavaScriptC++R TutorialSQLPHPC#CScalaPerlGo LanguageKotlinInterview CornerSystem Design TutorialCompany PreparationTop TopicsPractice Company QuestionsInterview ExperiencesExperienced InterviewsInternship InterviewsCompetitive ProgrammingMultiple Choice QuizzesAptitude for PlacementsComputer Science SubjectsOperating SystemDBMSComputer NetworksEngineering MathematicsComputer Organization 

In [25]:
import requests
from bs4 import BeautifulSoup

def extract_main_text(url):
    # Fetch the page content
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Parse with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove unwanted tags: scripts, styles, navs, headers, footers, etc.
    for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'noscript', 'iframe']):
        tag.decompose()

    # Remove ad/promo/banner-related classes or IDs
    ad_classes = ['advertisement', 'ad', 'adsbygoogle', 'promo', 'banner', 'cookie-banner', 'subscribe']
    for class_name in ad_classes:
        for tag in soup.select(f'.{class_name}, #{class_name}'):
            tag.decompose()

    # Remove all anchor tags including their text
    for a_tag in soup.find_all('a'):
        a_tag.decompose()

    # Extract visible text from the body or the entire soup if body missing
    body = soup.body if soup.body else soup
    text = body.get_text(separator='\n', strip=True)

    # Clean up excessive empty lines and whitespace
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    lines = list(filter(lambda x: len(x) > 32, lines))
    clean_text = '\n'.join(lines)

    return clean_text

# Example usage
url = "https://www.geeksforgeeks.org/best-ci-cd-tools/"
print(extract_main_text(url))


15 Best CI/CD Tools That You Should Know
(CI/CD) is modern-day practices to continuously improve the software by adding new feature and for that we have to change the existing files and deploy them. Doing it manually every time may consume a lot of developer's time, so to make the process more simple and reliable developers use
that are much more efficient and make the whole process automated. As a software Engineer or beginner in development,
you should know the best CI/CD tools
As you go deep into software development, you will ultimately need these tools to save time and release a more reliable and stable version of your software application with proper automated testing and deploying processes. But before knowing the tools let’s first under what exactly is CI and CD.
CI and CD stand for continuous Integration and are continuous Deployment/Delivery which is
used in the process of building efficient software
. These are the approaches that are used to integrate the code changes into 

In [32]:
import requests
from bs4 import BeautifulSoup

def extract_markdown_text(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove scripts, styles, navs, headers, footers, and typical ad elements
    for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'noscript', 'iframe']):
        tag.decompose()

    ad_classes = ['advertisement', 'ad', 'adsbygoogle', 'promo', 'banner', 'cookie-banner', 'subscribe']
    for class_name in ad_classes:
        for tag in soup.select(f'.{class_name}, #{class_name}'):
            tag.decompose()

    # Remove all links and their content
    for a_tag in soup.find_all('a'):
        a_tag.decompose()

    # Markdown content building
    markdown_lines = []

    # Process headings and paragraphs
    for element in soup.body.descendants if soup.body else soup.descendants:
        if element.name:
            if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(element.name[1])
                markdown_lines.append(f"{'#' * level} {element.get_text(strip=True)}\n")
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    markdown_lines.append(f"- {li.get_text(strip=True)}")
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    markdown_lines.append(f"{text}\n")

    # Final cleanup: remove empty lines
    markdown_lines = list(filter(lambda x: len(x) > 2, markdown_lines))
    markdown_content = '\n'.join([line for line in markdown_lines if line.strip()])
    
    return markdown_content

# Example usage
url = "https://www.geeksforgeeks.org/best-ci-cd-tools/"
print(extract_markdown_text(url))


# 15 Best CI/CD Tools That You Should Know

Continuous IntegrationandContinuous Delivery(CI/CD) is modern-day practices to continuously improve the software by adding new feature and for that we have to change the existing files and deploy them. Doing it manually every time may consume a lot of developer's time, so to make the process more simple and reliable developers useCI/CD toolsthat are much more efficient and make the whole process automated. As a software Engineer or beginner in development,you should know the best CI/CD tools.

As you go deep into software development, you will ultimately need these tools to save time and release a more reliable and stable version of your software application with proper automated testing and deploying processes. But before knowing the tools let’s first under what exactly is CI and CD.

## What is CI/CD?

CI and CD stand for continuous Integration and are continuous Deployment/Delivery which isused in the process of building efficient software

In [3]:
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained(
  "/Users/ohi/Documents/GitHub/PersonalAssistant/weights/SmolThink-360M-sft/checkpoint-45807",
#   revision="v2.0.1" # tag name, or branch name, or commit hash
)

tokenizer = AutoTokenizer.from_pretrained("/Users/ohi/Documents/GitHub/PersonalAssistant/weights/SmolThink-360M-sft/checkpoint-45807")

In [4]:
model.push_to_hub("SmolThink")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67e5c3bf-08a9130203aef20f66ccf6d7;92755505-03bd-4d73-a65e-0a459a938b89)

Invalid username or password.

In [6]:
!transformers-cli login

usage: transformers-cli <command> [<args>]
Transformers CLI tool: error: argument {chat,convert,download,env,run,serve,add-new-model-like,add-fast-image-processor}: invalid choice: 'login' (choose from 'chat', 'convert', 'download', 'env', 'run', 'serve', 'add-new-model-like', 'add-fast-image-processor')


In [None]:
[
    {"role": "user", "content": "Hi, can you tell me the current stock price of Apple?"}, 
    {"role": "assistant", "content": "<think>\nAlright, the user asked for the current stock price of Apple. Looking at the tools available, there's a function called get_stock_price which exactly fits this request. I'll need to call this function with the parameter company set to 'Apple' to retrieve the information the user is asking for. This seems like the most straightforward and appropriate action to take.\n</think>\n<tool_call>\n[{'name': 'get_stock_price', 'arguments': {'company': 'Apple'}}]\n</tool_call>"}, 
    {"role": "tool", "content": "{'stock_price': '$150.75'}"}, 
    {"role": "assistant", "content": "The current stock price of Apple is $150.75."}, 
    {'role': 'user', 'content': "That's great. Can you also tell me the stock price for Microsoft?"}, 
    {'role': 'assistant', 'content': "<think>\nAlright, let me try to figure out how to handle this user's request. The user first asked about Apple's stock price, and I responded by calling the get_stock_price function with 'Apple' as the argument. The tool provided the price, and I passed that information back to the user.\n\nNow, the user is asking for the stock price of Microsoft. I need to determine the appropriate action here. Looking at the tools available, there's still the get_stock_price function, which is designed to fetch current stock prices. Since the user is requesting another company's stock price, it makes sense to use the same function again but with the company name changed to Microsoft. \n\nI shouldn't assume any other tools are needed because the available functions are specifically tailored for stock prices and movie details. In this case, the relevant function is get_stock_price. I should ensure that the function is called with the correct parameter, 'Microsoft' in this case, to retrieve the accurate information the user is seeking. This approach is efficient and aligns directly with the user's request without making unnecessary assumptions or using unrelated tools.\n</think>\n<tool_call>\n[{'name': 'get_stock_price', 'arguments': {'company': 'Microsoft'}}]\n</tool_call>"}, {'role': 'tool', 'content': "{'stock_price': '$210.22'}"}, {'role': 'assistant', 'content': 'The current stock price of Microsoft is $210.22.'}, {'role': 'user', 'content': 'Thank you for the information.'}, {'role': 'assistant', 'content': "You're welcome! If you have any other questions, feel free to ask."}]


In [None]:
[
  {
    "role": "user",
    "content": "You are a helpful assistant."
  },
  {
    "role": "user",
    "content": "Hello, how are you?"
  },
  {
    "role": "assistant",
    "content": "I'm doing great. How can I help you today?"
  },
  {
    "role": "user",
    "content": "I'd like to show off how chat templating works!"
  }
]

In [None]:
{{- if .Messages }}
{{- if .Tools }}<|im_start|>system
You are a helpful AI assistant named SmolThink.

# Tools
 
You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> tags:
<tools>
{{ .Tools }}
</tools>

You first think/plan inside <think></think> tags.
Then for each function call, return a json object with function name and arguments within <tool_call></tool_call> tags.<|im_end|>
{{ else if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{- else if not .Tools }}
<|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside <think></think> tag and provide final answer to user query inside <answer></answer> tag.<|im_end|>
{{ end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
{{- if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
{{ else if eq .Role "assistant" }}<|im_start|>assistant
{{ if .Content }}{{ .Content }}
{{- else if .ToolCalls }}<tool_call>[
{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}
]</tool_call>
{{- end }}{{ if not $last }}<|im_end|>
{{ end }}
{{- else if eq .Role "tool" }}<|im_start|>user
<tool_response>
{{ .Content }}
</tool_response><|im_end|>
{{ end }}
{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
{{ end }}
{{- end }} 
{{- else }}
{{- if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = 'mps'

tokenizer = AutoTokenizer.from_pretrained(
    "quwsarohi/SmolThink"
)

model = AutoModelForCausalLM.from_pretrained(
    "quwsarohi/SmolThink",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    use_cache=False,
    tie_word_embeddings=True,
).to(device)

messages = [{"role": "user", "content": "What is 12+9?"}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

<|endoftext|><|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside <think></think> tag and provide final answer to user query inside <answer></answer> tag.<|im_end|>
<|im_start|>user
What is 12+9?.<|im_end|>

<|endoftext|><|im_start|>system
You are a helpful AI assistant named SmolThink. First plan/reason/code/validate inside <think></think> tag and provide final answer to user query inside <answer></answer> tag.<|im_end|>
<|im_start|>user
What is 12+9?.<|im_end|>
<|im_start|>assistant
<think>
</think>
<answer>
12 + 9 = 21
</answer><|im_end|>


In [9]:
import os, sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, StoppingCriteria
from datasets import load_dataset
import peft
from duckduckgo_search import DDGS

import random
import re
import json
import ast
from copy import deepcopy
from pathlib import Path

from typing import Optional, List
from jinja2 import Template
from transformers.utils import get_json_schema

from bs4 import BeautifulSoup
import requests


def url_content(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove scripts, styles, navs, headers, footers, and typical ad elements
    for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'noscript', 'iframe']):
        tag.decompose()

    ad_classes = ['advertisement', 'ad', 'adsbygoogle', 'promo', 'banner', 'cookie-banner', 'subscribe']
    for class_name in ad_classes:
        for tag in soup.select(f'.{class_name}, #{class_name}'):
            tag.decompose()

    # Remove all links and their content
    for a_tag in soup.find_all('a'):
        a_tag.decompose()

    # Markdown content building
    markdown_lines = []

    # Process headings and paragraphs
    for element in soup.body.descendants if soup.body else soup.descendants:
        if element.name:
            if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(element.name[1])
                markdown_lines.append(f"{'#' * level} {element.get_text(strip=True)}\n")
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    markdown_lines.append(f"- {li.get_text(strip=True)}")
            elif element.name == 'p':
                text = element.get_text(strip=True)
                if text:
                    markdown_lines.append(f"{text}\n")

    # Final cleanup: remove empty lines
    markdown_lines = list(filter(lambda x: len(x) > 2, markdown_lines))
    markdown_content = '\n'.join([line for line in markdown_lines if line.strip()])    
    return markdown_content


def search_tool(search_str, full_content=True, max_results=1):
    rets = None
    with DDGS() as ddg:
        rets = list(ddg.text(keywords=search_str, region="wt-wt", max_results=max_results))

    str_rets = ''
    if full_content:
        str_rets = "Use following information to answer user_question:\n\n"
        for i, r in enumerate(rets):
            r = url_content(rets[i]['href'])[:1024*3]
            str_rets += f"# Source {i+1}:\n" + "-"*10 + f"\n\n{r}\n\n\n"
    else:
        str_rets = "Use following information to answer user_question:\n\n"
        for i, r in enumerate(rets):
            str_rets += f"# Source {i+1}:\n" + "-"*10 + f"\n\n{r}\n\n\n"
        
    return str_rets

In [15]:
print(search_tool("perform math operation: 32 + 64 = ?", max_results=2))

Use following information to answer user_question:

# Source 1:
----------

# Math Calculator

Step 1:

Enter the expression you want to evaluate.

The Math Calculator will evaluate your problem down to a final solution. You can also add, subtraction, multiply, and divide and complete any arithmetic you need.

Step 2:

Click theblue arrowto submit and see your result!

Please ensure that your password is at least 8 characters and contains each of the following:

- a number
- a letter
- a special character: @$#!%*?&


# Source 2:
----------

# Basic Operations Calculator

## Number Line

- Show More
basic-operations-calculator

en

Please add a message.

Message received. Thanks for the feedback.






In [None]:
import requests
def search_ddg(query):
    url = "https://api.duckduckgo.com/"
    params = {"q": query, "format": "json"}
    response = requests.get(url, params=params)
    return response.json()
results = search_ddg("Python programming")
print(results["RelatedTopics"])


def fetch_instant_answer(query):
    response = search_ddg(query)
    return response.get("AbstractText", "No instant answer available.")



[{'FirstURL': 'https://duckduckgo.com/c/Python_(programming_language)', 'Icon': {'Height': '', 'URL': '', 'Width': ''}, 'Result': '<a href="https://duckduckgo.com/c/Python_(programming_language)">Python (programming language) Category</a>', 'Text': 'Python (programming language) Category'}, {'FirstURL': 'https://duckduckgo.com/Python_syntax_and_semantics', 'Icon': {'Height': '', 'URL': '', 'Width': ''}, 'Result': '<a href="https://duckduckgo.com/Python_syntax_and_semantics">Python syntax and semantics</a> - The syntax of the Python programming language is the set of rules that defines how a Python program will be written and interpreted. The Python language has many similarities to Perl, C, and Java. However, there are some definite differences between the languages.', 'Text': 'Python syntax and semantics - The syntax of the Python programming language is the set of rules that defines how a Python program will be written and interpreted. The Python language has many similarities to Per

In [8]:
print(fetch_instant_answer("How to commit in github"))




In [21]:

%pip uninstall llama-index llama-index-readers-web

Found existing installation: llama-index 0.12.27
Uninstalling llama-index-0.12.27:
  Would remove:
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/bin/llamaindex-cli
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/lib/python3.12/site-packages/llama_index-0.12.27.dist-info/*
    /Users/ohi/Documents/GitHub/PersonalAssistant/.venv/lib/python3.12/site-packages/llama_index/_bundle/*
Proceed (Y/n)? ^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
import urllib.request
from io import BytesIO
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

def docling_cleanup(input_str):
    # <!-- image --> tag cleanup
    input_str = input_str.replace('<!-- image -->', '')
    # Lines with empty spaces
    lines = filter(lambda x: not x.isspace() and len(x) > 3, input_str.split('\n'))
    input_str = '\n'.join(list(lines))
    del lines
    # clean excessive newlines
    _cnt = 0
    ret_str = ''
    for c in input_str:
        if c == '\n':
            _cnt += 1
            if _cnt > 2: continue
            else: ret_str += c
        else:
            _cnt = 0
            ret_str += c
    return ret_str

import requests

def url_content(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    ascii_text = response.text.encode("ascii", "ignore")
    ascii_text = re.sub(r'[^A-Za-z0-9\s.,;:!?()"\']+', '', response.text)
    in_doc = InputDocument(
        path_or_stream=BytesIO(ascii_text),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="duck.html",
    )

    backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(ascii_text))
    dl_doc = backend.convert()
    return docling_cleanup(dl_doc.export_to_markdown())

In [20]:
ret = url_content('https://stackoverflow.com/questions/3939660/sieve-of-eratosthenes-finding-primes-python')
print(ret)




In [9]:
print(ret)




In [None]:
import ollama

def ollama_infr(prompt, extra_stops=[], model='deepseek-r1:7b', temperature=0.7):
    # https://github.com/ollama/ollama-python/blob/00eafed0faa5dea6879a8eb3229c7a8f2439abb4/ollama/_types.py#L93
    return ollama.generate(
        model = model,
        # system = system,
        # Raw is set to true to feed the question as needed
        raw=True,
        prompt = prompt,
        stream = True,
        # Number of seconds to keep the connection alive
        keep_alive=-1, # Will keep the model loaded,
        options = {
            'stop': [
                "<|start_header_id|>",
                "<|end_header_id|>",
                "<|eot_id|>",
            ] + extra_stops,
            'temperature': temperature,
            # 'top_k': 1,
            'cache': False,
            # 'tfs_z': 2.0,
            'num_ctx': 6000,
            # 'temperature': 0.0,
            # 'top_p': 0.0
        },
    )

In [15]:
# deepseek_r1 = '''You are a helpful AI. You will be given a question and a web content. You have to provide a summary of the web content.

# <｜User｜>User question: {question}

# Here is the web content:
# {context}

# <｜end▁of▁sentence｜>
# <｜Assistant｜>
# <think>

# </think>

# '''

deepseek_r1 = '''<|im_start|>system
You are a helpful AI. You will be given a question and a web content. You have to provide a summary of the web content.<|im_end|>
<|im_start|>user
User question: {question}

Here is the web content:
{context}<|im_end|>
<|im_start|>assistant
'''

def r1_response(question, context):    
    prompt = deepseek_r1.format(question=question, context=context)
    # print("Question:", data['question'], flush=True)

    stream = ollama_infr(prompt=prompt, model='SmolThink:latest', temperature=0.5)
    model_res = '<think>\n'
    n_think_tokens = 0
    think_finished = False

    for part in stream:
        print(part['response'], sep='', end='', flush=True)
        model_res += part['response']

        if not think_finished and '</think>' in model_res:
            think_finished = True
        if not think_finished:
            n_think_tokens += 1

        # if n_think_tokens > 386:
            # print("Generation limit exceeded", flush=True)
            # return None, None
        
        if len(model_res) > 6000:
            return None, None

    think = re.findall(r"<think>(.*?)</think>", model_res, re.DOTALL)[0].strip()
    answer = model_res[model_res.find("</think>")+len("</think>"):].strip()

    if not answer:
        return None, None
    
    return think, answer

print(r1_response("How to find prime numbers?", ret))

The user is asking about the use of the `ST_AsEWKB` function in Python, specifically for converting a signal from 50 Hz to 60 Hz. They are interested in understanding how this function works and its potential applications. Additionally, they would like more information on whether there's precedent or precedent-setting cases where politicians have been barred from running for office due to fraud or embezzlement. The user also seeks help with the technical aspects of their query, including how to subscribe to a specific feed.

NameError: name 're' is not defined

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = 'mps'

tokenizer = AutoTokenizer.from_pretrained(
    "quwsarohi/SmolThink"
)

model = AutoModelForCausalLM.from_pretrained(
    "quwsarohi/SmolThink",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    use_cache=False,
    tie_word_embeddings=True,
).to(device)

messages = [{"role": "user", "content": "What is the capital of France."}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

In [None]:
webtool_def = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Can search the web for infomation which are doubtful/unknown/recent.",
        "parameters": {
            "type": "object",
            "properties": {
                "search_str": {
                    "type": "string",
                    "description": "The whole question you want to ask.",
                    "required": True,
                }
            },
        },
    },
}

base_prompt = tokenizer.apply_chat_template([
    {"role": "user", "content": "What is the current stock price of Apple?"}
], tools=[webtool_def], tokenize=False, add_generation_prompt=True)
print(base_prompt)

inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))