In [1]:
import os

import torch
from transformers import AutoTokenizer

from Cdatasets.dataset import PretrainDataset
from Cdatasets.tokenizer import MathTokenizer, load_tokenizer
from scripts.model import GQATransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from Cdatasets.dataset import PretrainDataset
from Cdatasets.tokenizer import load_tokenizer

# Tokenizer used to encode the corpus.
tok = load_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Build the pretraining dataset from a Hugging Face dataset id, reading only
# from the local cache. The cache directory is resolved from the current
# user's home directory rather than one machine's absolute path, so the cell
# runs unchanged for other users.
ds = PretrainDataset(
    data_dir="math-ai/AutoMathText",   # HF dataset id, not a local path
    tokenizer=tok,
    block_size=256,
    subset="arxiv-0.70-to-1.00",       # optional
    split="train",
    hf_cache_dir=os.path.expanduser("~/.cache/huggingface/datasets"),
    local_files_only=True,             # use only cached files
    max_stream_rows=100_000,
)
print(len(ds))

Token indices sequence length is longer than the specified maximum sequence length for this model (16652 > 2048). Running this sequence through the model will result in indexing errors


In [10]:
# Shapes of the `input_ids` and `targets` tensors of the first dataset item.
tuple(ds[0][field].shape for field in ("input_ids", "targets"))

(torch.Size([256]), torch.Size([256]))

In [2]:
# Load the TinyLlama tokenizer, then wrap the copy of the data stored under
# data/AutoMathText in a PretrainDataset with block_size=512.
tokenizer = load_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
dataset = PretrainDataset(data_dir="data/AutoMathText", tokenizer=tokenizer, block_size=512)

In [5]:
from datasets import load_from_disk

# Open the dataset saved on disk and print a short summary of it.
ds = load_from_disk("data/AutoMathText")  # change path if needed
print(ds)
print("rows:", len(ds))
print("columns:", ds.column_names)

# Peek at the first raw record. Each text field is truncated to keep the
# output short; a missing or None value is shown as an empty string.
row0 = ds[0]
print("\nrow[0] keys:", row0.keys())
for _field, _limit in (("title", 120), ("abstract", 200), ("text", 500)):
    print(f"{_field}:", (row0.get(_field) or "")[:_limit])


Dataset({
    features: ['url', 'title', 'abstract', 'text', 'meta'],
    num_rows: 45169
})
rows: 45169
columns: ['url', 'title', 'abstract', 'text', 'meta']

row[0] keys: dict_keys(['url', 'title', 'abstract', 'text', 'meta'])
title: Convergence directions of the randomized Gauss--Seidel method and its extension
abstract: The randomized Gauss--Seidel method and its extension have attracted much attention recently and their convergence rates have been considered extensively. However, the convergence rates are usually de
text: \section{Introduction}
Linear least squares problem is a ubiquitous problem arising frequently in data analysis and scientific computing. Specifically, given a data matrix $A\in R^{m\times n}$ and a data vector $b\in R^{m}$, a linear least squares problem can be written as follows
\begin{equation}
\label{ls}
\min \limits _{ x \in R^{n}}\|b-Ax\|^2_{2}.
\end{equation}
In the literature, several direct methods have been proposed for solving its normal equations $A^T

In [9]:
from Cdatasets.dataset import PretrainDataset
from Cdatasets.tokenizer import load_tokenizer

# Show what a single raw record looks like after the dataset's own row
# formatting, and report the tokenizer's vocabulary size.
tok = load_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
pds = PretrainDataset("data/AutoMathText", tok, block_size=256)

# pick raw HF row and format it the same way as training
# NOTE(review): `ds` is not defined in this cell. It is presumably the Dataset
# returned by load_from_disk in another cell, but `ds` is also bound to a
# PretrainDataset elsewhere in this notebook -- confirm which one is live
# before running.
raw = ds[0]
# `_format_row` is a private method of PretrainDataset; it is called directly
# here only for inspection.
formatted = pds._format_row(raw)
print(formatted[:1000])  # first 1000 characters only
tok.vocab_size

Convergence directions of the randomized Gauss--Seidel method and its extension

The randomized Gauss--Seidel method and its extension have attracted much attention recently and their convergence rates have been considered extensively. However, the convergence rates are usually determined by upper bounds, which cannot fully reflect the actual convergence. In this paper, we make a detailed analysis of their convergence behaviors. The analysis shows that the larger the singular value of $A$ is, the faster the error decays in the corresponding singular vector space, and the convergence directions are mainly driven by the large singular values at the beginning, then gradually driven by the small singular values, and finally by the smallest nonzero singular value. These results explain the phenomenon found in the extensive numerical experiments appearing in the literature that these two methods seem to converge faster at the beginning. Numerical examples are provided to confirm the above fi

32000

In [None]:
# tokenizer = MathTokenizer()


def _build_gqa(cfg) -> GQATransformer:
    """Instantiate a GQATransformer from the hyperparameters stored on ``cfg``."""
    return GQATransformer(
        num_layers=cfg.num_layers,
        n_emb=cfg.n_embd,  # constructor keyword is `n_emb`; config attribute is `n_embd`
        n_head=cfg.n_head,
        n_kv_head=cfg.n_kv_head,
        vocab_size=cfg.vocab_size,
        block_size=cfg.block_size,
        dropout=cfg.dropout,
    )


# NOTE(review): `tc` and `sc` are not defined in any visible cell; presumably
# they are the teacher and student config objects -- confirm they are created
# before this cell runs on a fresh kernel.
teacher = _build_gqa(tc)
student = _build_gqa(sc)


def count_parameters(model: torch.nn.Module, trainable_only: bool = False) -> int:
    """Return the number of scalar parameters held by ``model``.

    Args:
        model: Any ``torch.nn.Module`` (for example a ``GQATransformer``); only
            its ``parameters()`` iterator is used.
        trainable_only: When True, count only parameters whose
            ``requires_grad`` flag is set. Defaults to False, which counts
            every parameter, as before.

    Returns:
        The sum of ``numel()`` over the selected parameters; 0 when the model
        has none.
    """
    return sum(
        param.numel()
        for param in model.parameters()
        if param.requires_grad or not trainable_only
    )

# Report each model's size in millions of parameters.
for _label, _net in (("Teacher", teacher), ("Student", student)):
    print(f"{_label}: {count_parameters(_net) / 1e6:.2f}M parameters")

Teacher: 60.90M parameters
Student: 39.69M parameters


In [7]:
# Re-print the teacher/student sizes (millions of parameters) for the models
# currently bound in the kernel.
for _label, _net in zip(("Teacher", "Student"), (teacher, student)):
    print(f"{_label}: {count_parameters(_net) / 1e6:.2f}M parameters")

Teacher: 121.74M parameters
Student: 27.51M parameters


In [None]:
# Load the DeepSeek-Math tokenizer by its Hugging Face model id.
# NOTE(review): trust_remote_code=True allows the model repository to run its
# own Python code locally -- confirm it is actually required for this tokenizer.
deepseek_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-math-7b-base", trust_remote_code=True)

# Small LaTeX-flavoured sample to see how math notation is split into tokens.
text = r"""
Solve: \int_0^1 x^2 dx.
Chain of thought: First compute the antiderivative...
"""

tokens = deepseek_tokenizer(text, return_tensors="pt")
_deepseek_ids = tokens["input_ids"][0]  # first (and only) row of the encoded batch

print("Token IDs:", _deepseek_ids[:20])
print("Decoded:", deepseek_tokenizer.decode(_deepseek_ids))
print("Vocab size:", deepseek_tokenizer.vocab_size)
print("Special tokens:", deepseek_tokenizer.special_tokens_map)

In [None]:

# Load the Qwen2.5-Math tokenizer by its Hugging Face model id.
# NOTE(review): trust_remote_code=True allows the model repository to run its
# own Python code locally -- confirm it is actually required for this tokenizer.
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-7B", trust_remote_code=True)

# Mixed Chinese/LaTeX sample to see how non-English math text is tokenized.
text = r"""
证明：若 a,b \in \mathbb{R}, 则 a^2 + b^2 \ge 2ab.
"""

tokens = qwen_tokenizer(text, return_tensors="pt")
_qwen_ids = tokens["input_ids"][0]  # first (and only) row of the encoded batch

print("Token IDs:", _qwen_ids[:20])
print("Decoded:", qwen_tokenizer.decode(_qwen_ids))
print("Vocab size:", qwen_tokenizer.vocab_size)
print("Special tokens:", qwen_tokenizer.special_tokens_map)

In [6]:

# Tokenize a short arithmetic prompt with the TinyLlama tokenizer and show the
# ids, the round-tripped text, the vocabulary size and the special tokens.
llama_tokenizer = load_tokenizer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

text = "If 12345^2 = ?, compute step-by-step."
tokens = llama_tokenizer(text, return_tensors="pt")
_llama_ids = tokens["input_ids"][0]  # first (and only) row of the encoded batch

print("Token IDs:", _llama_ids[:20])
print("Decoded:", llama_tokenizer.decode(_llama_ids))
print("Vocab size:", llama_tokenizer.vocab_size)
print("Special tokens:", llama_tokenizer.special_tokens_map)

Token IDs: tensor([    1,   960, 29871, 29896, 29906, 29941, 29946, 29945, 29985, 29906,
          353,  1577, 29892, 10272,  4331, 29899,  1609, 29899, 10568, 29889])
Decoded: <s> If 12345^2 = ?, compute step-by-step.
Vocab size: 32000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}
