## Import

In [1]:
from utilities._network import *
from utilities._training import *

## Transformer


### Test with Different $\alpha$ and $p$

We choose $\alpha = .2, .3, .4, .5$, $p = 97, 149$

In [8]:
# --- Transformer experiment configuration -------------------------------
# `prime` and `alpha` are the two knobs swept in this section (see the
# markdown above); `prime` also fixes the vocabulary size of the model.
prime = 97
alpha = .4

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): the positional 60 is presumably the batch size -- confirm
# against utilities' CustomDataLoader signature.
train_loader, test_loader = CustomDataLoader(
    alpha,
    60,
    device,
    prime=prime,
)

# Model hyper-parameters, passed positionally to Decoder in this order.
d_model = 128
nhead = 4
d_ff = 512
ntoken = prime
model = Decoder(d_model, nhead, d_ff, ntoken).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.98),
    weight_decay=5e-2,
)

In [None]:
# Train the Transformer built in the setup cell for at most 3000 epochs.
# NOTE(review): oriInfo / tsneInfo / add_test / complement / tolmax are
# options of the project's `train` helper; their exact meaning is not
# visible here -- check utilities._training before changing them.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=3000,
    oriInfo=False,
    tsneInfo=False,
    add_test=False,
    complement=False,
    tolmax=10,
)

### Test Different Optimizers

In [None]:
## STD
# Baseline configuration for the optimizer comparison: Decoder with
# dropout=.1 trained with AdamW (weight_decay=5e-2).
#
# torch.manual_seed seeds the CPU generator *and* every CUDA generator.
# Seeding only CUDA leaves the CPU generator unseeded, so CPU runs (and any
# initialisation or shuffling that draws from the CPU generator) would differ
# between runs.
# NOTE(review): if CustomDataLoader draws from `random` or NumPy, those
# generators need their own seed -- confirm in utilities.
torch.manual_seed(42)
prime, alpha = 97, .3

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the positional 60 is presumably the batch size -- confirm.
train_loader, test_loader = CustomDataLoader(alpha, 60, device, prime=prime)
d_model, nhead, d_ff, ntoken = 128, 4, 512, prime
model = Decoder(d_model, nhead, d_ff, ntoken, dropout=.1).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.98), weight_decay=5e-2)

In [8]:
# Optimizer sweep: same data/model configuration for every run, only the
# optimizer changes.
#
# torch.manual_seed seeds the CPU generator *and* every CUDA generator, so
# the model is initialised identically for each optimizer under comparison
# regardless of which device is used.
# NOTE(review): if CustomDataLoader draws from `random` or NumPy, those
# generators need their own seed -- confirm in utilities.
torch.manual_seed(42)
prime, alpha = 97, .3

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the positional 60 is presumably the batch size -- confirm.
train_loader, test_loader = CustomDataLoader(alpha, 60, device, prime=prime)
d_model, nhead, d_ff, ntoken = 128, 4, 512, prime
model = Decoder(d_model, nhead, d_ff, ntoken).to(device)
loss_fn = nn.CrossEntropyLoss()

# Every optimizer variant in the sweep, keyed by a short name. Each entry is
# a factory so the optimizer is always built from *this* cell's model
# parameters; reusing an `optimizer` left over from an earlier cell would
# update a different model and leave this one untrained.
optimizer_factories = {
    "adamw_wd0": lambda params: torch.optim.AdamW(
        params, lr=1e-3, betas=(0.9, 0.98), weight_decay=0),
    "adamw_wd0.15": lambda params: torch.optim.AdamW(
        params, lr=1e-3, betas=(0.9, 0.98), weight_decay=0.15),
    "sgd_wd0": lambda params: torch.optim.SGD(
        params, lr=1e-3, momentum=.99, weight_decay=0),
    "sgd_wd2e-3": lambda params: torch.optim.SGD(
        params, lr=1e-3, momentum=.99, weight_decay=2e-3),
    "sgd_nesterov_wd0": lambda params: torch.optim.SGD(
        params, lr=1e-3, momentum=.99, weight_decay=0, nesterov=True),
    "sgd_nesterov_wd2e-3": lambda params: torch.optim.SGD(
        params, lr=1e-3, momentum=.99, weight_decay=2e-3, nesterov=True),
    "rmsprop_wd0": lambda params: torch.optim.RMSprop(
        params, lr=1e-3, momentum=0, weight_decay=0),
    "rmsprop_wd1e-4": lambda params: torch.optim.RMSprop(
        params, lr=1e-3, momentum=0, weight_decay=1e-4),
}

# Change this key (and re-run the cell) to try another optimizer.
optimizer_name = "adamw_wd0"
optimizer = optimizer_factories[optimizer_name](model.parameters())

In [None]:
# Train the model from the optimizer-comparison setup for 500 epochs.
# tolmax=None is passed explicitly here.
# NOTE(review): tolmax looks like an early-stopping tolerance, with None
# presumably disabling it -- confirm in utilities._training.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=500,
    tolmax=None,
    oriInfo=False,
    tsneInfo=False,
    add_test=False,
)

## MLP

### Test with Different $\alpha$ and $p$

We choose $\alpha = .3, .4, .5, .6$, $p = 97, 149$

In [10]:
# --- MLP experiment configuration ---------------------------------------
# `prime` and `alpha` are the two knobs swept in this section (see the
# markdown above); `prime` also fixes the vocabulary size of the model.
prime = 97
alpha = .6

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): the positional 60 is presumably the batch size -- confirm
# against utilities' CustomDataLoader signature.
train_loader, test_loader = CustomDataLoader(
    alpha,
    60,
    device,
    prime=prime,
)

# Model hyper-parameters, passed positionally to ResMLP in this order.
d_model = 128
hidden = 512
ntoken = prime
layers = 3
model = ResMLP(d_model, hidden, ntoken, layers).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.98),
    weight_decay=5e-2,
)

In [None]:
# Train the ResMLP built in the setup cell for at most 10000 epochs.
# NOTE(review): oriInfo / tsneInfo / add_test / complement are options of the
# project's `train` helper; their exact meaning is not visible here.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10000,
    oriInfo=False,
    tsneInfo=False,
    add_test=False,
    complement=False,
)

## LSTM

### Test with Different $\alpha$ and $p$

We choose $\alpha = .3, .4, .5, .6$, $p = 97, 149$

In [12]:
# --- LSTM experiment configuration --------------------------------------
# `prime` and `alpha` are the two knobs swept in this section (see the
# markdown above); `prime` also fixes the vocabulary size of the model.
prime, alpha = 97, .6

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pass the `prime` variable rather than a literal so that the data and the
# model's `ntoken` always use the same modulus when `prime` is changed.
# NOTE(review): the positional 50 is presumably the batch size (the other
# sections use 60) -- confirm this difference is intentional.
train_loader, test_loader = CustomDataLoader(alpha, 50, device, prime=prime)
d_model, hidden, ntoken, layers = 128, 512, prime, 2
model = LSTM(d_model, hidden, ntoken, layers).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.98), weight_decay=5e-2)

In [None]:
# Train the LSTM built in the setup cell for at most 10000 epochs.
# NOTE(review): oriInfo / tsneInfo are options of the project's `train`
# helper; their exact meaning is not visible here.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10000,
    oriInfo=False,
    tsneInfo=False,
)

## K SUM

In [3]:
# --- K-sum experiment configuration (k = 3) ------------------------------
# `prime` fixes the vocabulary size of the model; `k` is forwarded to the
# data loader and also determines the model's max_len below.
prime = 47
alpha = .10
k = 3

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): the positional 512 is presumably the batch size -- confirm
# against utilities' KDataLoader signature.
train_loader, test_loader = KDataLoader(
    alpha,
    512,
    device,
    prime=prime,
    k=k,
)

# Model hyper-parameters, passed positionally to Decoder in this order.
d_model = 128
nhead = 4
d_ff = 512
ntoken = prime
# NOTE(review): max_len=k+2 presumably covers the k operands plus two extra
# tokens -- confirm against the dataset's sequence layout.
model = Decoder(d_model, nhead, d_ff, ntoken, max_len=k+2).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.98),
    weight_decay=5e-2,
)

In [None]:
# Train the k = 3 K-sum Decoder from the setup cell for at most 10000 epochs.
# NOTE(review): oriInfo / tsneInfo / add_test are options of the project's
# `train` helper; their exact meaning is not visible here.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10000,
    oriInfo=False,
    tsneInfo=False,
    add_test=False,
)

In [None]:
# --- K-sum experiment configuration (k = 4) ------------------------------
# `prime` fixes the vocabulary size of the model; `k` is forwarded to the
# data loader and also determines the model's max_len below.
prime = 23
alpha = .03
k = 4

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): the positional 512 is presumably the batch size -- confirm
# against utilities' KDataLoader signature.
train_loader, test_loader = KDataLoader(
    alpha,
    512,
    device,
    prime=prime,
    k=k,
)

# Model hyper-parameters, passed positionally to Decoder in this order.
d_model = 128
nhead = 4
d_ff = 512
ntoken = prime
# NOTE(review): max_len=k+2 presumably covers the k operands plus two extra
# tokens -- confirm against the dataset's sequence layout.
model = Decoder(d_model, nhead, d_ff, ntoken, max_len=k+2).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.98),
    weight_decay=5e-2,
)

In [None]:
# Train the k = 4 K-sum Decoder from the setup cell for at most 10000 epochs.
# NOTE(review): oriInfo / tsneInfo / add_test are options of the project's
# `train` helper; their exact meaning is not visible here.
results = train(
    model=model,
    train_dataloader=train_loader,
    test_dataloader=test_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10000,
    oriInfo=False,
    tsneInfo=False,
    add_test=False,
)