In [1]:
import kagglehub
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from tokenizers import Tokenizer
from bpe import Encoder


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


### Import Data


In [2]:
path = kagglehub.dataset_download("mathurinache/math-dataset")

In [3]:

problems = {"problem": [], "level": [], "type": [], "solution": []}

with os.scandir(path + "/MATH/train/") as types:
    for type in types:
        if type.is_dir():
            with os.scandir(type) as entries:
                for entry in entries:
                    if entry.is_file():
                        # print(entry.path)
                        with open(entry.path) as f:
                            data = json.load(f)
                            for key in data:
                                problems[key].append(data[key])

all_levels = pd.DataFrame.from_dict(problems, orient='columns')

BLOCK_SIZE = max(all_levels["problem"].apply(len) + all_levels["solution"].apply(len))
BLOCK_SIZE

7333

#### Split data

In [4]:
train, test = train_test_split(all_levels, test_size=0.15, random_state=42, stratify=all_levels['level'])

print(len(train[train['level'] == "Level 1"]), '/', len(train))
print(len(test[test['level'] == "Level 1"]), '/', len(test))

479 / 6375
85 / 1125


### Tokenizer Training

In [5]:
corpus_arr = train["problem"].tolist() + train["solution"].tolist() + test["problem"].tolist() + test["solution"].tolist()

corpus = " ".join(corpus_arr)

encoder = Encoder(500, pct_bpe=0.88)
encoder.fit(corpus.split("\n"))

example = "Add two numbers 3 and 4"
print(next(encoder.transform([example])))

[61, 229, 56, 60, 61, 282, 69, 60, 61, 225, 230, 86, 66, 60, 17, 24, 25]


### Model Training

In [6]:
from mingpt.model import GPT
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = encoder.vocab_size
model_config.block_size = 1024
model = GPT(model_config)

BLOCK_SIZE = 1024

print(encoder.vocab_size)

number of parameters: 0.16M
500


#### Create dataset

In [7]:
from mingpt.bpe import BPETokenizer

tokenizer = BPETokenizer()

class MathDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def get_block_size(self):
        return BLOCK_SIZE

    def __getitem__(self, idx):
        problem = self.data.iloc[idx]['problem']
        solution = self.data.iloc[idx]['solution']
        input = next(encoder.transform([problem + " " + solution]))
        # print(input)
        x = torch.Tensor(input[:-1]).long()
        y = torch.Tensor(input[1:]).long()
        
        if len(x) > BLOCK_SIZE:
            return self.__getitem__(idx + 1)
        
        if len(x) < BLOCK_SIZE:
            x = torch.cat((x, torch.zeros(BLOCK_SIZE - len(x)).long()))
            y = torch.cat((y, torch.zeros(BLOCK_SIZE - len(y)).long()))

        

        y[:len(problem)] = -1
        # print(x, y)
        return x, y
        
    
train_dataset = MathDataset(train)
test_dataset = MathDataset(test)

train_dataset.__getitem__(0)

(tensor([ 61, 222,  88,  ...,   0,   0,   0]),
 tensor([-1, -1, -1,  ...,  0,  0,  0]))

In [10]:
from mingpt.trainer import Trainer
train_config = Trainer.get_default_config()
train_config.learning_rate = 1e-5 # many possible options, see the file
train_config.max_iters = 1000
train_config.batch_size = 32
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [11]:
def batch_end_callback(trainer):
    if trainer.iter_num % 10 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 0.81358
iter_dt 633.39ms; iter 10: train loss 1.17957
iter_dt 632.46ms; iter 20: train loss 1.09480
iter_dt 632.47ms; iter 30: train loss 0.92683
iter_dt 630.57ms; iter 40: train loss 1.05396
iter_dt 631.93ms; iter 50: train loss 0.87323
iter_dt 629.09ms; iter 60: train loss 0.75698
iter_dt 634.79ms; iter 70: train loss 0.84890
iter_dt 630.92ms; iter 80: train loss 0.87772
iter_dt 633.35ms; iter 90: train loss 0.98032


KeyboardInterrupt: 

In [12]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(500, 48)
    (wpe): Embedding(1024, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=48, o

In [30]:
input = test_dataset.__getitem__(0)
print(input[0])
output = model.generate(input[0].unsqueeze(0).cuda(), 100)
print(next(encoder.inverse_transform(output.cpu().numpy().tolist())))

tensor([  7,  61, 125,  ...,   0,   0,   0])
the line joining $( 3 , 2 )$ and $( 6 , 0 )$ divides the square shown into two parts . what fraction of the area of the square is above this line ? express your answer as a common fraction . [ asy ] draw ((- 2 , 0 )--( 7 , 0 ), linewidth ( 1 ), arrows ); draw (( 0 ,- 1 )--( 0 , 4 ), linewidth ( 1 ), arrows ); draw (( 1 ,. 25 )--( 1 ,-. 25 ), linewidth ( 1 )); draw (( 2 ,. 25 )--( 2 ,-. 25 ), linewidth ( 1 )); draw (( 3 ,. 25 )--( 3 ,-. 25 ), linewidth ( 1 )); draw (( 4 ,. 25 )--( 4 ,-. 25 ), linewidth ( 1 )); draw (( 5 ,. 25 )--( 5 ,-. 25 ), linewidth ( 1 )); draw (( 6 ,. 25 )--( 6 ,-. 25 ), linewidth ( 1 )); draw ((. 25 , 1 )--(-. 25 , 1 ), linewidth ( 1 )); draw ((. 25 , 2 )--(-. 25 , 2 ), linewidth ( 1 )); draw ((. 25 , 3 )--(-. 25 , 3 ), linewidth ( 1 )); draw (( 3 , 0 )--( 6 , 0 )--( 6 , 3 )--( 3 , 3 )--( 3 , 0 )-- cycle , linewidth ( 2 )); label ("$ y $",( 0 , 4 ), n ); label ("$ x $",( 7 , 0 ), e ); label ("$( 3 , 0 )$",( 3 , 0 ), s )

In [31]:
torch.save(model.state_dict(), "../weights/math_model.pth")