In [1]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import torch
print("Using torch", torch.__version__)
import seaborn as sns
import torchvision.datasets as datasets
from torchvision import transforms
sns.set()
torch.manual_seed(42) # Setting the seed
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset

Using torch 2.1.2+cu118


In [2]:
from ac_dll_grammar_vae import print_contact_info
from ac_dll_grammar_vae.data import CFGEquationDataset
from ac_dll_grammar_vae.data.alphabet import alphabet
from ac_dll_grammar_vae.data.transforms import MathTokenEmbedding, RuleTokenEmbedding, ToTensor, Compose, PadSequencesToSameLengthV2, OneHotEncode

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device", device)

# GPU operations have a separate seed we also want to set
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Additionally, some operations on a GPU are implemented stochastic for efficiency
# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Device cuda


## Character VAE

### Character Embedding:
+ Using Math Token Embedding Class

{'+': 1,
 '-': 2,
 '3': 3,
 'sqrt': 4,
 'exp': 5,
 '(': 6,
 'sin': 7,
 '/': 8,
 'x': 9,
 ')': 10,
 '*': 11,
 'log': 12,
 'cos': 13,
 '2': 14,
 '1': 15,
 ' ': 0}

In [4]:
emb = MathTokenEmbedding(alphabet=alphabet)
emb.token_to_idx

{'x': 1,
 '3': 2,
 'sqrt': 3,
 '-': 4,
 '+': 5,
 '*': 6,
 '1': 7,
 '(': 8,
 '/': 9,
 '2': 10,
 'sin': 11,
 'cos': 12,
 'log': 13,
 ')': 14,
 'exp': 15,
 ' ': 0}

In [5]:
#Example Data
data = CFGEquationDataset()
#Example Encoding:
print(f'Example: Data {data[42]}')
encoded_data = emb.embed(data[42])
print(f'Encoded Example: Data {encoded_data}')

Example: Data ['cos', '(', '(', '1', ')', ')', '+', '2']
Encoded Example: Data [12, 8, 8, 7, 14, 14, 5, 10]


+ Example: Data ['cos', '(', '(', '1', ')', ')', '+', '2']
+ Encoded Example: Data [13, 6, 6, 15, 10, 10, 1, 14]

### Creating the Training Dataset using CFG 
 + Class CFGEquationDataset is used to generate the equations

In [6]:
training = CFGEquationDataset(
        n_samples=100000,
        transform=Compose([
            MathTokenEmbedding(alphabet),
            ToTensor(dtype=torch.uint8)
        ]))

#Batch Size:
batch_size = 100
MAX_SEQ_LEN = 21
collate_fn = PadSequencesToSameLengthV2(padding_value=0, max_length=21)
training_loader = DataLoader(dataset=training,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=PadSequencesToSameLengthV2(padding_value=0, max_length=21))

#### Convert the Embedded equations into one hot encoded vectors

In [7]:
onehotencoder = OneHotEncode(alphabet)
one_hot_encoded_training = []
for batch in training_loader:
  try:
    one_hot_encoded_batch = onehotencoder(batch)
  except Exception as e:
    print(batch)
    continue
  one_hot_encoded_training.append(one_hot_encoded_batch.numpy())
one_hot_encoded_training = np.array(one_hot_encoded_training)
one_hot_encoded_training_tensor = torch.Tensor(one_hot_encoded_training)
one_hot_encoded_training_tensor = one_hot_encoded_training_tensor.view(one_hot_encoded_training_tensor.shape[0]*one_hot_encoded_training_tensor.shape[1],one_hot_encoded_training_tensor.shape[2],one_hot_encoded_training_tensor.shape[3])
one_hot_encoded_training_tensor.shape

torch.Size([100000, 21, 16])

### Creating final dataloader for model which is one-hot-encoded:
 + Setting Maximum equation length to 21

In [8]:
batch_size = 100
MAX_SEQ_LEN = 21
alphabet_length = len(alphabet) + 1
one_hot_encoded_training_loader = DataLoader(dataset=one_hot_encoded_training_tensor,
                              batch_size=batch_size,
                              shuffle=False)

### Model Initialization for Character AE

In [20]:
from models import EqnAE
# Model Initialization
model = EqnAE(alphabet_length,MAX_SEQ_LEN)
model.to(device)
#Lossa
BCELoss = nn.BCELoss(reduction="sum")
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
print(model)

EqnAE(
  (conv1): Conv1d(21, 2, kernel_size=(2,), stride=(1,))
  (conv2): Conv1d(2, 3, kernel_size=(3,), stride=(1,))
  (conv3): Conv1d(3, 4, kernel_size=(4,), stride=(1,))
  (bn1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=40, out_features=100, bias=True)
  (fc_latent): Linear(in_features=100, out_features=10, bias=True)
  (rev_latent): Linear(in_features=10, out_features=100, bias=True)
  (gru1): GRU(100, 100, batch_first=True)
  (gru2): GRU(100, 100, batch_first=True)
  (gru3): GRU(100, 100, batch_first=True)
  (fc_final): Linear(in_features=100, out_features=16, bias=True)
  (time_distributed): Linear(in_features=100, out_features=16, bias=True)
  (softmax):

In [29]:
from train import train_EqnAE
train_EqnAE(model, one_hot_encoded_training_loader,BCELoss, optimizer, 30)

In [35]:
# !!!! Be careful with saving:
torch.save(model.state_dict(), './saved/models/EQN_AE_BCE_Loss.pth')

In [21]:
model = EqnAE(alphabet_length, MAX_SEQ_LEN)
model.load_state_dict(torch.load('./saved/models/EQN_AE_BCE_Loss.pth'))
model.to(device)

EqnAE(
  (conv1): Conv1d(21, 2, kernel_size=(2,), stride=(1,))
  (conv2): Conv1d(2, 3, kernel_size=(3,), stride=(1,))
  (conv3): Conv1d(3, 4, kernel_size=(4,), stride=(1,))
  (bn1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=40, out_features=100, bias=True)
  (fc_latent): Linear(in_features=100, out_features=10, bias=True)
  (rev_latent): Linear(in_features=10, out_features=100, bias=True)
  (gru1): GRU(100, 100, batch_first=True)
  (gru2): GRU(100, 100, batch_first=True)
  (gru3): GRU(100, 100, batch_first=True)
  (fc_final): Linear(in_features=100, out_features=16, bias=True)
  (time_distributed): Linear(in_features=100, out_features=16, bias=True)
  (softmax):

### Creating the Test Dataset and Dataloader:

In [22]:
test_dataset = CFGEquationDataset(
        n_samples=1000,
        transform=Compose([
            MathTokenEmbedding(alphabet),
            ToTensor(dtype=torch.uint8)
        ]))

#Batch Size:
batch_size = 100
MAX_SEQ_LEN = 21
collate_fn = PadSequencesToSameLengthV2(padding_value=0, max_length=21)
test_loader = DataLoader(dataset=test_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=PadSequencesToSameLengthV2(padding_value=0, max_length=21))

In [23]:
one_hot_encoded_testing = []
for batch in test_loader:
  try:
    one_hot_encoded_batch = onehotencoder(batch)
  except Exception as e:
    print(batch)
    continue
  #print(one_hot_encoded_batch.shape)
  one_hot_encoded_testing.append(one_hot_encoded_batch.numpy())
one_hot_encoded_testing = np.array(one_hot_encoded_testing)
one_hot_encoded_testing_tensor = torch.Tensor(one_hot_encoded_testing)
one_hot_encoded_testing_tensor = one_hot_encoded_testing_tensor.view(one_hot_encoded_testing_tensor.shape[0]*one_hot_encoded_testing_tensor.shape[1],one_hot_encoded_testing_tensor.shape[2],one_hot_encoded_testing_tensor.shape[3])
one_hot_encoded_testing_tensor.shape
one_hot_encoded_testing_loader = DataLoader(dataset=one_hot_encoded_testing_tensor,
                              batch_size=batch_size,
                              shuffle=False)

### Understanding and visualizing some outputs:
+ One hot decode into embedding and then use idxtotoken to convert to equations:

In [28]:
model.eval()

recon_batch_iter = iter(one_hot_encoded_testing_loader)
recon_batch = next(recon_batch_iter)
one_hot_decoded = []
one_hot_decoded_recon = []
for sample in recon_batch_iter:
  sample = sample.to(device)
  recon = model(sample)
  for idx,ele in enumerate(recon):
    max_indices = torch.argmax(ele, dim=1)
    one_hot = torch.zeros_like(ele) 
    one_hot[torch.arange(ele.size(0)), max_indices] = 1
    embd = torch.argmax(one_hot, dim=1)
    one_hot_decoded.append(emb.decode(torch.argmax(sample[idx], dim=1)))
    one_hot_decoded_recon.append(emb.decode(embd))
    break

for idx, ele in enumerate(one_hot_decoded_recon[:10]):
  print('Actual Equation:', ''.join(one_hot_decoded[idx]))
  print('AE Decoded Equation:', ''.join(one_hot_decoded_recon[idx]),'\r\n')

Actual Equation: cos(1)-(3)+x           
AE Decoded Equation: cos21sqrt-/log(x))expexpsinsinsinsinsinsinsinsin 

Actual Equation: 1*3                  
AE Decoded Equation: 1log3x(**cossincos222222sqrtsqrtsqrtsqrtsqrt 

Actual Equation: 1                    
AE Decoded Equation: -(3/cossqrt*sqrtsqrtsqrt+++++++++++ 

Actual Equation: 1                    
AE Decoded Equation: -(3/cossqrt*sqrtsqrtsqrt+++++++++++ 

Actual Equation: 1                    
AE Decoded Equation: -(3/cossqrt*sqrtsqrtsqrt+++++++++++ 

Actual Equation: 1*3                  
AE Decoded Equation: 1log3x(**cossincos222222sqrtsqrtsqrtsqrtsqrt 

Actual Equation: 3                    
AE Decoded Equation: 3(2222    expexpexpexpexpexpexp**** 

Actual Equation: sin(1/x-2)             
AE Decoded Equation: sinlogcosxsqrt3x+//2) ) **expexpexpexp 

Actual Equation: 1                    
AE Decoded Equation: -(3/cossqrt*sqrtsqrtsqrt+++++++++++ 



### Visualizing the latent space by PCA(2D) for some sample equations:

In [None]:
from visualize import visualize_latent_space_Eqn

### Character VAE Model

In [9]:
from models import EqnVAE
from train import train_EqnVAE

Device cuda


### Model Initialization for Character VAE

In [10]:
# Model Initialization
alphabet_length = len(alphabet) + 1
model = EqnVAE(alphabet_length,MAX_SEQ_LEN)
model.to(device)
#Loss
vaeLoss = model.vae_loss
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-03)
print(model)

EqnVAE(
  (conv1): Conv1d(21, 2, kernel_size=(2,), stride=(1,))
  (conv2): Conv1d(2, 3, kernel_size=(3,), stride=(1,))
  (conv3): Conv1d(3, 4, kernel_size=(4,), stride=(1,))
  (bn1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=40, out_features=100, bias=True)
  (fc_mean): Linear(in_features=100, out_features=10, bias=True)
  (fc_logvar): Linear(in_features=100, out_features=10, bias=True)
  (rev_latent): Linear(in_features=10, out_features=100, bias=True)
  (gru1): GRU(100, 100, batch_first=True)
  (gru2): GRU(100, 100, batch_first=True)
  (gru3): GRU(100, 100, batch_first=True)
  (fc_final): Linear(in_features=100, out_features=16, bias=True)
  (time_distributed)

In [12]:
train_EqnVAE(model,one_hot_encoded_training_loader,vaeLoss,optimizer,num_epochs=30)

====> Epoch: 0 Average loss: 1492.55557656
====> Epoch: 1 Average loss: 1471.09773828
====> Epoch: 2 Average loss: 1458.59959500
====> Epoch: 3 Average loss: 1450.24223125
====> Epoch: 4 Average loss: 1444.21629000
====> Epoch: 5 Average loss: 1439.44751872
====> Epoch: 6 Average loss: 1435.45321866
====> Epoch: 7 Average loss: 1432.06335244
====> Epoch: 8 Average loss: 1429.05706946
====> Epoch: 9 Average loss: 1426.31681716
====> Epoch: 10 Average loss: 1423.85109236
====> Epoch: 11 Average loss: 1421.58311964
====> Epoch: 12 Average loss: 1419.47475049
====> Epoch: 13 Average loss: 1417.49201887
====> Epoch: 14 Average loss: 1415.67986107
====> Epoch: 15 Average loss: 1413.94596692
====> Epoch: 16 Average loss: 1412.32364415
====> Epoch: 17 Average loss: 1410.80080154
====> Epoch: 18 Average loss: 1409.34327565
====> Epoch: 19 Average loss: 1407.96357970
====> Epoch: 20 Average loss: 1406.66743105
====> Epoch: 21 Average loss: 1405.41452796
====> Epoch: 22 Average loss: 1404.2995112

1397.469343388021

In [14]:
# !!!! Be careful with saving:
torch.save(model.state_dict(), './saved/models/EQN_VAE_BCE_Loss.pth')

In [15]:
model = EqnVAE(alphabet_length,MAX_SEQ_LEN)
model.load_state_dict(torch.load('./saved/models/EQN_VAE_BCE_Loss.pth'))
model.to(device)

EqnVAE(
  (conv1): Conv1d(21, 2, kernel_size=(2,), stride=(1,))
  (conv2): Conv1d(2, 3, kernel_size=(3,), stride=(1,))
  (conv3): Conv1d(3, 4, kernel_size=(4,), stride=(1,))
  (bn1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn4): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=40, out_features=100, bias=True)
  (fc_mean): Linear(in_features=100, out_features=10, bias=True)
  (fc_logvar): Linear(in_features=100, out_features=10, bias=True)
  (rev_latent): Linear(in_features=10, out_features=100, bias=True)
  (gru1): GRU(100, 100, batch_first=True)
  (gru2): GRU(100, 100, batch_first=True)
  (gru3): GRU(100, 100, batch_first=True)
  (fc_final): Linear(in_features=100, out_features=16, bias=True)
  (time_distributed)

### Creating the Test Dataset and Dataloader:

In [16]:
test_dataset = CFGEquationDataset(
        n_samples=1000,
        transform=Compose([
            MathTokenEmbedding(alphabet),
            ToTensor(dtype=torch.uint8)
        ]))

#Batch Size:
batch_size = 100
MAX_SEQ_LEN = 21
collate_fn = PadSequencesToSameLengthV2(padding_value=0, max_length=21)
test_loader = DataLoader(dataset=test_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=PadSequencesToSameLengthV2(padding_value=0, max_length=21))

In [17]:
one_hot_encoded_testing = []
for batch in test_loader:
  try:
    one_hot_encoded_batch = onehotencoder(batch)
  except Exception as e:
    print(batch)
    continue
  #print(one_hot_encoded_batch.shape)
  one_hot_encoded_testing.append(one_hot_encoded_batch.numpy())
one_hot_encoded_testing = np.array(one_hot_encoded_testing)
one_hot_encoded_testing_tensor = torch.Tensor(one_hot_encoded_testing)
one_hot_encoded_testing_tensor = one_hot_encoded_testing_tensor.view(one_hot_encoded_testing_tensor.shape[0]*one_hot_encoded_testing_tensor.shape[1],one_hot_encoded_testing_tensor.shape[2],one_hot_encoded_testing_tensor.shape[3])
one_hot_encoded_testing_tensor.shape
one_hot_encoded_testing_loader = DataLoader(dataset=one_hot_encoded_testing_tensor,
                              batch_size=batch_size,
                              shuffle=False)

### Understanding and visualizing some outputs:
+ One hot decode into embedding and then use idxtotoken to convert to equations:

In [18]:
model.eval()

recon_batch_iter = iter(one_hot_encoded_testing_loader)
recon_batch = next(recon_batch_iter)
one_hot_decoded = []
one_hot_decoded_recon = []
for sample in recon_batch_iter:
  sample = sample.to(device)
  recon, _, _ = model(sample)
  for idx,ele in enumerate(recon):
    max_indices = torch.argmax(ele, dim=1)
    one_hot = torch.zeros_like(ele) 
    one_hot[torch.arange(ele.size(0)), max_indices] = 1
    embd = torch.argmax(one_hot, dim=1)
    one_hot_decoded.append(emb.decode(torch.argmax(sample[idx], dim=1)))
    one_hot_decoded_recon.append(emb.decode(embd))
    break

for idx, ele in enumerate(one_hot_decoded_recon[:10]):
  print('Actual Equation:', ''.join(one_hot_decoded[idx]))
  print('VAE Decoded Equation:', ''.join(one_hot_decoded_recon[idx]),'\r\n')

Actual Equation: x                    
VAE Decoded Equation: xexp*cos --2222222222//// 

Actual Equation: 1-1+3                
VAE Decoded Equation: 1322+xxxxx x///////// 

Actual Equation: cos(1)                 
VAE Decoded Equation: x(1sqrtexpsin+*2*2+))))))))2 

Actual Equation: 1                    
VAE Decoded Equation: 1((-------**logloglogloglogloglogloglog 

Actual Equation: 3                    
VAE Decoded Equation: exp sqrtsqrtsqrtsqrtsqrtsqrtsqrt---111111111 

Actual Equation: 3-2+3+3              
VAE Decoded Equation: *(1sqrt2x++3---///////)) 

Actual Equation: 1                    
VAE Decoded Equation: 1sin22222 333xxxxloglogloglogloglog 

Actual Equation: 3                    
VAE Decoded Equation: exp/sin((coscoscoscos--1111xxxxxx 

Actual Equation: x                    
VAE Decoded Equation: xlog-cossinsinsin----   sqrtsqrtsqrtsqrtsqrtsqrtsqrt 



### Visualizing the latent space by PCA(2D) for some sample equations:

In [26]:
from visualize import visualize_latent_space_Eqn
model.to(device)
visualize_latent_space_Eqn(model, one_hot_encoded_training_loader)

AttributeError: 'DataLoader' object has no attribute 'to'

## Grammar VAE