In [1]:
import os
import sys
# Put the parent of the working directory on the import path
# (presumably where the project modules imported below live -- confirm).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(parent_dir)
import logging
# Configure the root logger: INFO level, "date time - level - logger - message" layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)
import os
import csv
import glob
import json
import math
import pickle
import random
import numpy as np
from numpy import * # to override the math functions
import torch
import torch.nn as nn
from torch.nn import functional as F
from utils import set_seed, sample_from_model
from trainer import Trainer, TrainerConfig
from models_new import DD2D, TransformerConfig
from utils import processDataFiles, CharDataset

In [2]:
# Seed the random number generators first so the run is repeatable.
set_seed(42)

# ---- general settings ----
device = 'cpu'
scratch = True  # True: ignore the cache and start from scratch
numEpochs = 2000  # number of epochs to train the GPT+PT model
embeddingSize = 384  # hidden dimension of the representation of both GPT and PT
numPoints = [20, 350]  # number of points received to make a prediction about f given x and y; use the maximum if unknown
numVars = 2  # dimension of the input points x; use the maximum if unknown
numYs = 1  # dimension of the output points y = f(x); use the maximum if unknown
blockSize = 180  # spatial extent of the model for its context
batchSize = 256  # batch size of training data
target = 'Skeleton'  # alternatives noted by the author: 'Skeleton', 'EQ'
dataDir = '../datasets/'

# ---- dataset tag used in file names ----
data_set = 0
if data_set != 0:
    dataInfo = 'XYE_{}Var_{}-{}Points_{}EmbeddingSize_dataset{}'.format(numVars, numPoints[0], numPoints[1], embeddingSize, data_set)
else:
    dataInfo = 'XYE_{}Var_{}-{}Points_{}EmbeddingSize'.format(numVars, numPoints[0], numPoints[1], embeddingSize)

titleTemplate = "{} equations of {} variables - Benchmark"
addr = '../SavedModels/'  # where to save model

# ---- model settings ----
n_layer = 4
n_head = 4
# EMB_CAT/EMB_SUM/OUT_SUM/OUT_CAT/EMB_CON -> whether to concat the embedding or use summation.
method = 'EMB_SUM'
# How the number of variables is communicated to the GPT:
#   NOT_VAR: do nothing, no information about the number of variables is passed
#   LEA_EMB: learnable embedding for the variables, added to the pointNET embedding
#   STR_VAR: add the number of variables to the first token
variableEmbedding = 'NOT_VAR'
addVars = variableEmbedding == 'STR_VAR'
maxNumFiles = 100  # maximum number of files to load in memory for training the neural network
bestLoss = None  # if there is any model to load as pre-trained one

# ---- derived file name ----
layerTag = 'layer_heads_{}_{}'.format(n_layer, n_head)
fName = '{}_DD2D_{}_{}_{}_MINIMIZE.txt'.format(dataInfo, layerTag, 'Padding', variableEmbedding)
# Checkpoint path inside `addr`. os.path.join does not double the separator
# when `addr` already ends with '/'.
ckptPath = os.path.join(addr, fName.split('.txt')[0] + '.pt')
# Create the output folder only when it is missing. Any real failure
# (permissions, missing parent, ...) propagates instead of being reported
# as "already exists".
if os.path.isdir(addr):
    print('Folder already exists!')
else:
    os.makedirs(addr)

Folder already exists!


In [3]:
# load the train dataset
train_file = 'train_dataset_{}.pb'.format(fName)
if os.path.isfile(train_file) and not scratch:
    # just load the train set
    # NOTE(review): pickle.load can execute arbitrary code -- only open cache
    # files you produced yourself.
    # NOTE(review): this branch does not define val_dataset / test_dataset,
    # which later cells use; they are only created by the rebuild branch.
    with open(train_file, 'rb') as f:
        train_dataset,trainText,chars = pickle.load(f)
else:
    path = f'../dataset/{data_set}.json'
    files = glob.glob(path)[:maxNumFiles]
    if not files:
        # Stop here and name the path rather than continuing with no data.
        raise FileNotFoundError('No dataset file found at {}'.format(path))
    text = processDataFiles(files)

    # chars = sorted(list(set(text)) + ['_', 'T', '<', '>', ':'])
    text = text.split('\n') # convert the raw text to a set of examples
    trainText = text[:-1] if len(text[-1]) == 0 else text
    if not trainText:
        raise ValueError('Dataset {} contains no examples'.format(path))

    random.shuffle(trainText) # shuffle the dataset, it's important especially for the combined number of variables experiment
    length = []      # per example: len(record['X'][2])
    length_seq = []  # per example: number of non-empty entries in record['seq']
    values = []
    for line in trainText:
        # Each example is one JSON document per line.
        record = json.loads(line)
        len_i = len(record['X'][2])
        len_j = len([tok for tok in record['seq'] if tok != ''])
        length.append(len_i)
        length_seq.append(len_j)
        if len_i != 0:
            # NOTE(review): np.max is applied to the whole decoded record, not
            # to a numeric field -- likely unintended. `values` is not read in
            # the cells visible here, so the call is kept unchanged.
            values.append(np.max(record))
    # Overwrites the [min, max] list from the config cell with a single int:
    # the largest observed length plus 3.
    numPoints = np.max(length) + 3

    # 10% validation, next 10% test, remainder for training.
    val_size = int(len(trainText) * 0.1)
    val_set = trainText[:val_size]
    test_set = trainText[val_size : 2 * val_size]
    # test_set = trainText[:500]
    train_set = trainText[2 * val_size:]
    train_dataset = CharDataset(train_set, blockSize, numVars=numVars,
                    numYs=numYs, numPoints=numPoints)
    val_dataset = CharDataset(val_set, blockSize, numVars=numVars,
                    numYs=numYs, numPoints=numPoints)
    test_dataset = CharDataset(test_set, blockSize, numVars=numVars,
                    numYs=numYs, numPoints=numPoints)
    # Writing the cache is currently disabled:
    # with open(train_file, 'wb') as f:
    #     pickle.dump([train_dataset,trainText,chars], f)


100%|██████████| 1/1 [00:00<00:00, 12.69it/s]


data has 6361 examples
data has 795 examples
data has 795 examples


In [4]:
# Show one randomly chosen training example.
idx = np.random.randint(len(train_dataset))
points, seq = train_dataset[idx]
print('seq:{}'.format(seq))
print('points:{}'.format(points))

seq:tensor([ 2.0000e+02,  7.0123e+00,  9.0141e+00,  2.7145e+01,  9.0002e+01,
         8.9998e+01,  8.9981e+01,  1.7424e+00,  6.5613e-02,  1.0028e+01,
         1.7000e+01,  5.2617e+00,  7.8076e-02,  1.0035e+01,  1.7000e+01,
         7.0090e+00,  6.6258e+00,  1.1571e+01,  1.7000e+01,  2.3537e-02,
         2.3768e+00,  1.1698e+01,  1.7000e+01,  1.6594e+00,  8.9456e+00,
         1.3408e+01,  1.7000e+01,  5.4033e+00,  8.9773e+00,  1.3423e+01,
         1.7000e+01,  1.8799e+00,  4.4453e+00,  1.3718e+01,  1.7000e+01,
         5.1567e+00,  4.4214e+00,  1.3738e+01,  1.7000e+01,  3.5342e+00,
         6.8722e+00,  1.5433e+01,  1.7000e+01,  3.4823e+00,  2.1213e+00,
         1.5594e+01,  1.7000e+01,  1.7453e+00,  4.6124e+00,  1.7095e+01,
         1.7000e+01,  5.2454e+00,  4.5718e+00,  1.7114e+01,  1.7000e+01,
         3.4994e+00,  2.1327e+00,  1.2113e+01,  1.9000e+01,  7.0197e+00,
         6.6372e+00,  1.5022e+01,  1.9000e+01,  1.9593e-02,  8.9853e+00,
         1.1615e+01,  4.1000e+01,  3.5062e+00, 

In [5]:
# create the model
model_settings = dict(n_layer=n_layer, n_head=n_head, n_embd=embeddingSize)
mconf = TransformerConfig(train_dataset.block_size, **model_settings)
model = DD2D(mconf)

# initialize a trainer instance (no training is started in this cell)
trainer_settings = dict(
    max_epochs=numEpochs,
    batch_size=batchSize,
    learning_rate=4e-4,
    lr_decay=True,
    warmup_tokens=512*20,
    final_tokens=2*len(train_dataset)*blockSize,
    num_workers=0,
    ckpt_path=ckptPath,
)
tconf = TrainerConfig(**trainer_settings)
trainer = Trainer(model, train_dataset, val_dataset, tconf, bestLoss, device=device)
# NOTE(review): nothing in this cell visibly loads the checkpoint (torch.load
# is called in the next cell); unless Trainer loads it internally, this
# message is premature.
print('The following model {} has been loaded!'.format(ckptPath))

12/30/2024 00:44:26 - INFO - models_new -   number of parameters: 1.538074e+07


The following model ../SavedModels//XYE_2Var_20-350Points_384EmbeddingSize_DD2D_layer_heads_4_4_Padding_NOT_VAR_MINIMIZE.pt has been loaded!


In [6]:
# Restore the saved weights and build the loader for the test split.
# map_location='cpu' lets a checkpoint saved from a CUDA device be opened on a
# CPU-only machine; the model is moved to trainer.device right afterwards.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(ckptPath, map_location='cpu')
model.load_state_dict(checkpoint)
model = model.eval().to(trainer.device)
loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    # Request pinned host memory only when a CUDA device exists.
    pin_memory=torch.cuda.is_available(),
    # One batch containing the whole test split.
    batch_size=len(test_dataset),
    num_workers=0)

In [7]:
# Top-1 accuracy on the test loader.
for i, batch in enumerate(loader):
    x, y = batch
    topN = 1
    acc = sample_from_model(model, y, x, topN=topN)
    print('Top {} accuracy: {}'.format(topN, acc))

Top 1 accuracy: 59.874213836477985


In [8]:
# Top-N accuracy on the test loader for N = 3, 5 and 10.
# One loop over N replaces three copy-pasted loops; the printed lines and
# their order are unchanged.
for topN in (3, 5, 10):
    for i, (x, y) in enumerate(loader):
        acc = sample_from_model(model, y, x, topN=topN)
        print('Top {} accuracy: {}'.format(topN, acc))

Top 3 accuracy: 89.18238993710692
Top 5 accuracy: 96.72955974842768
Top 10 accuracy: 99.74842767295597
