In [1]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/deepmath-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/iium_confession_translation-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/leetcode_hard-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/maktabalbahri-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/mallm-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Reasoning/resolve/main/data/tatabahasa-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malay-Dialect-Reasoning/resolve/main/data/dialect_translation_train-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malay-Dialect-Reasoning/resolve/main/data/reverse_dialect_translation_train-00000-of-00001.parquet

In [2]:
# !pip3 install fasttext

In [3]:
from huggingface_hub import hf_hub_download
import fasttext

# Fetch the fastText language-identification model file from the Hugging Face
# Hub; hf_hub_download returns the local path of the downloaded file.
filename = hf_hub_download(repo_id="mesolitica/fasttext-language-detection-bahasa-en", filename="fasttext.ftz")

# `lang_model` is the module-level classifier that get_lang() relies on.
lang_model = fasttext.load_model(filename)

# Smoke test on a short English phrase, asking for the single best label (k=1).
lang_model.predict('hello my name', k=1)

(('__label__english',), array([0.99767441]))

In [4]:
import pandas as pd
from glob import glob
import torch
from transformers import AutoTokenizer

# Tokenizer for openai/gpt-oss-20b; its chat template is applied to a prepared
# conversation further down in this notebook.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

In [5]:
# System-prompt templates. The literal `{{lang}}` marker is substituted by the
# caller with the language name returned by the language detector.
# `system` additionally asks for the final answer inside $\boxed{}$.
system = 'First, you try to think step-by-step in {{lang}}, after that, put your final answer within $\\boxed{}$.'
system_second = 'Try to think step-by-step in {{lang}}'


def get_prompt(s):
    """Pick the system-prompt template that matches the text `s`.

    Returns `system` (the variant that requests a boxed final answer) when
    `s` contains the substring 'boxed{', otherwise `system_second`.
    The returned template still contains the `{{lang}}` marker.
    """
    return system if 'boxed{' in s else system_second

def get_lang(s):
    """Classify the text `s` as 'malay' or 'english'.

    Runs the module-level fastText `lang_model` on `s` and looks at the top
    predicted label: if that label contains 'bahasa' the result is 'malay',
    anything else is reported as 'english'.
    """
    # Newlines are flattened to spaces before prediction -- presumably because
    # fastText's predict expects a single line of text.
    single_line = s.replace('\n', ' ')
    prediction = lang_model.predict(single_line, k=1)
    top_label = prediction[0][0]
    return 'malay' if 'bahasa' in top_label else 'english'

In [6]:
# conversation = tokenizer.apply_chat_template(messages, tokenize=False)

In [7]:
# List the parquet files present in the current working directory (the order
# is whatever glob returns; it is not sorted). Displayed for inspection only.
files = glob('*.parquet')
files

['reverse_dialect_translation_train-00000-of-00001.parquet',
 'dialect_translation_train-00000-of-00001.parquet',
 'tatabahasa-00000-of-00001.parquet',
 'mallm-00000-of-00001.parquet',
 'maktabalbahri-00000-of-00001.parquet',
 'leetcode_hard-00000-of-00001.parquet',
 'iium_confession_translation-00000-of-00001.parquet',
 'deepmath-00000-of-00001.parquet']

In [8]:
# First dataset: this cell (re)creates `data`, the list that every following
# cell appends its conversations to.
df = pd.read_parquet('tatabahasa-00000-of-00001.parquet')

data = []
for question, answer, reasoning in zip(df['content'], df['solution'], df['malay']):
    # The `malay` text drives both the detected language and the choice of
    # system-prompt template, and is also stored as the assistant's thinking.
    lang = get_lang(reasoning)
    template = get_prompt(reasoning)
    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': question},
        {'role': 'assistant', 'content': answer, 'thinking': reasoning},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(1359, 1359)

In [9]:
# Append one [system, user, assistant] conversation per row of the mallm file
# to the shared `data` list.
df = pd.read_parquet('mallm-00000-of-00001.parquet')

for question, answer, reasoning in zip(df['content'], df['solution'], df['malay']):
    # Language and prompt template are both derived from the `malay` text,
    # which is also kept as the assistant's thinking.
    lang = get_lang(reasoning)
    template = get_prompt(reasoning)
    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': question},
        {'role': 'assistant', 'content': answer, 'thinking': reasoning},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(6610, 7969)

In [10]:
# Append one conversation per row of the maktabalbahri file to `data`.
df = pd.read_parquet('maktabalbahri-00000-of-00001.parquet')

for question, malay_text in zip(df['content'], df['malay']):
    template = get_prompt(malay_text)
    lang = get_lang(malay_text)
    # NOTE(review): unlike the other cells, the `malay` text is used directly
    # as the assistant content and no 'thinking' field is attached, even
    # though the system prompt still asks for step-by-step thinking --
    # confirm this is intended for this dataset.
    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': question},
        {'role': 'assistant', 'content': malay_text},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(7479, 15448)

In [11]:
# Each leetcode_hard row yields two conversations that share the same question
# and solution: one whose thinking is the `english` text, followed by one
# whose thinking is the `malay` text.
df = pd.read_parquet('leetcode_hard-00000-of-00001.parquet')

rows = zip(df['content'], df['solution'], df['english'], df['malay'])
for question, answer, english_trace, malay_trace in rows:
    # English first, then Malay -- the append order within a row is preserved.
    for reasoning in (english_trace, malay_trace):
        template = get_prompt(reasoning)
        lang = get_lang(reasoning)
        data.append([
            {'role': 'system', 'content': template.replace('{{lang}}', lang)},
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer, 'thinking': reasoning},
        ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(1561, 18570)

In [12]:
# Append one [system, user, assistant] conversation per row of the
# iium_confession_translation file to `data`.
df = pd.read_parquet('iium_confession_translation-00000-of-00001.parquet')

for question, answer, reasoning in zip(df['content'], df['solution'], df['malay']):
    # Prompt template and language both come from the `malay` text, which is
    # also stored as the assistant's thinking.
    template = get_prompt(reasoning)
    lang = get_lang(reasoning)
    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': question},
        {'role': 'assistant', 'content': answer, 'thinking': reasoning},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(2981, 21551)

In [13]:
# For every deepmath row, emit a conversation for the `english` text and then
# one for the `malay` text, skipping whichever of the two is unusable.
df = pd.read_parquet('deepmath-00000-of-00001.parquet')

rows = zip(df['content'], df['solution'], df['english'], df['malay'])
for question, answer, english_trace, malay_trace in rows:
    for reasoning in (english_trace, malay_trace):
        # Only string values longer than two characters are kept; anything
        # else (non-string values, very short strings) is skipped.
        if not isinstance(reasoning, str) or len(reasoning) <= 2:
            continue

        template = get_prompt(reasoning)
        lang = get_lang(reasoning)
        data.append([
            {'role': 'system', 'content': template.replace('{{lang}}', lang)},
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer, 'thinking': reasoning},
        ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(3178, 24729)

In [14]:
import random

# Dialect -> standard Malay translation examples. Each row becomes one
# [system, user, assistant] conversation appended to the shared `data` list.
df = pd.read_parquet('dialect_translation_train-00000-of-00001.parquet')

# User-turn templates; `{q}` is replaced by the row's `content` text.
prompts = [
    'terjemah ke melayu baku\n\n{q}',
    'terjemah ke standard melayu\n\n{q}',
    '{q}\n\nterjemah ke standard melayu',
]

# Use a dedicated, seeded generator so the template chosen for each row is the
# same on every run. The module-level random.choice is unseeded, which made the
# prepared dataset differ between executions.
PROMPT_SEED = 42
rng = random.Random(PROMPT_SEED)

for question, answer, reasoning in zip(df['content'], df['solution'], df['malay']):
    # Prompt template and language both come from the `malay` text, which is
    # also stored as the assistant's thinking.
    template = get_prompt(reasoning)
    lang = get_lang(reasoning)

    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': rng.choice(prompts).replace('{q}', question)},
        {'role': 'assistant', 'content': answer, 'thinking': reasoning},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(4468, 29197)

In [15]:
import random

# Reverse dialect translation examples: the user asks for a translation into
# the language named in the row's `language` column. Each row becomes one
# [system, user, assistant] conversation appended to the shared `data` list.
df = pd.read_parquet('reverse_dialect_translation_train-00000-of-00001.parquet')

# User-turn templates; `{l}` is the target language, `{q}` the source text.
prompts = [
    'terjemah ke {l}\n\n{q}',
    '{q}\n\nterjemah ke {l}',
]

# Use a dedicated, seeded generator so the template chosen for each row is the
# same on every run. The module-level random.choice is unseeded, which made the
# prepared dataset differ between executions.
PROMPT_SEED = 42
rng = random.Random(PROMPT_SEED)

columns = zip(df['content'], df['solution'], df['malay'], df['language'])
for question, answer, reasoning, target_language in columns:
    # Prompt template and language both come from the `malay` text, which is
    # also stored as the assistant's thinking.
    template = get_prompt(reasoning)
    lang = get_lang(reasoning)

    # Fill `{l}` BEFORE `{q}`: the source text is inserted last, so a literal
    # "{l}" occurring inside the text can no longer be overwritten by the
    # language name (the previous order substituted `{q}` first).
    user_prompt = (
        rng.choice(prompts)
        .replace('{l}', target_language)
        .replace('{q}', question)
    )

    data.append([
        {'role': 'system', 'content': template.replace('{{lang}}', lang)},
        {'role': 'user', 'content': user_prompt},
        {'role': 'assistant', 'content': answer, 'thinking': reasoning},
    ])

# (rows in this file, conversations collected so far)
len(df), len(data)

(4520, 33717)

In [16]:
# Render the most recently appended conversation with the tokenizer's chat
# template, without tokenizing (tokenize=False). The result is only stored in
# `conversation`; this cell does not display it.
conversation = tokenizer.apply_chat_template(data[-1], tokenize=False)

In [19]:
import json

# Serialize the full list of conversations into a single JSON file in the
# current working directory.
with open('prepared-reasoning-data.json', mode='w') as out_file:
    json.dump(data, out_file)

In [20]:
# Shell escape: show the on-disk size of the prepared JSON file.
!ls -lh prepared-reasoning-data.json

-rw-r--r-- 1 root root 271M Dec  9 07:16 prepared-reasoning-data.json
