### Deep learning model time!!!

<img src = pics/OIP.jpg width = 400>

In [1]:
import cupy
import numpy as np
import math
import time
import torch
cupy.cuda.set_allocator(None)       # no clue
from torch.utils.dlpack import from_dlpack

import numba
from numba import cuda

import os, os.path
from pathlib import Path

from datetime import datetime

### Creating dataset
Each monte carlo simulation run is equivalent to one data point being made, so to generate a large dataset, we have to run monte carlo simulations lots of times, and batches can hypotheitcally make doing this faster.

here,the mc model from mc_snow, cuda version was imported and cleaned up a bit.
note that due to the existence of batches, some of the varaibles now need a bit of extra finagling to access properly. (s_0, Ki, Ko, mu, sigma, pot,r, d_normals, snowball_path_holder). Overall design is very close to original, though.

In [7]:
@cuda.jit               # defualt GPU
def monte_carlo_andtheholygrail_gpu(d_s, s_0, Ki, Ko, mu, sigma, pot,r,
                                    d_normals, snowball_path_holder, MONTHS,
                                    N_STEPS, N_PATHS, N_BATCH):
    

    # for shared memory (non)optimization
    # shared = cuda.shared.array(shape=0, dtype=numba.float32)
    # # load to shared memory
    # path_offset = cuda.blockIdx.x * cuda.blockDim.x

    # ii - overall thread index
    ii = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    stride = cuda.gridDim.x * cuda.blockDim.x

    for n in range(ii, N_PATHS * N_BATCH, stride):
        # newly added vars for N_BATCH calculations
        batch_id = n // N_PATHS
        path_id = n % N_PATHS       # equivalent to n in old code 

        snowball_path_holder[n][0] = s_0[batch_id]
        earlyexit = False
        ki = False
        mald = False
        for t in range(N_STEPS):
            # pre shared memory b_motion    
            #                                                   
            b_motion = d_normals[path_id + batch_id * N_PATHS +  t * N_PATHS * N_BATCH]

            # post shared memory b_motion
            # shared[cuda.threadIdx.x] = d_normals[path_offset + cuda.threadIdx.x + t * N_PATHS]

            dt = 1/N_STEPS
            # pre shared memory b_motion
            ds = snowball_path_holder[n][t] * mu[batch_id] * dt + snowball_path_holder[n][t] \
                                                * sigma[batch_id] * b_motion * math.sqrt(dt) 
            # post shared memory b_motion
            # ds = snowball_path_holder[n][t] * mu[batch_id] * dt + snowball_path_holder[n][t] * sigma[batch_id] * shared[cuda.threadIdx.x] * math.sqrt(dt) 
                    # no adjusting list sizes in cuda :(
            # snowball_path.append(snowball_path[t]+ds)
            snowball_path_holder[n][t+1] = snowball_path_holder[n][t] + ds
            

            # ki = snowball_path[t] + ds
            if snowball_path_holder[n][t+1] <= Ki[batch_id]:
                ki = True

            if not mald:
                for month in (0,1,2,3,4,5,6,7,8,9,10,11):                # need to do this instead because contains (in) and range are disabled
                    if t+1 == MONTHS[month]:     #startday no longer used to fake a start date in code
                        # price = t+1+startday
                        if snowball_path_holder[n][t+1] >= Ko[batch_id]:
                            price =  pot[batch_id] * t/365     # should turn t into int
                            # return snowball_path, price
                            d_s[n] =  price * math.exp(-r[batch_id] * t/N_STEPS)   # accounting for r
                            snowball_path_holder[n][-1] = d_s[n]            
                            earlyexit = True
                            mald = True
                            # print("blo got fucked\n")
                            break
            else: # if mald
                break
        
        if not earlyexit:       # to prevent early exit getting out of bdds error
            # did not get knocked up or down
            price = pot[batch_id]
            # t  =T 
                        # CAN'T USE T CUZ CUDA IS FUCKING SHIT so use -1 instead
                        # or not ig T works now :sob:
            if ki and snowball_path_holder[n][N_STEPS] <= s_0[batch_id]:          # blo got knocked down and never recovered
                price = snowball_path_holder[n][N_STEPS] - s_0[batch_id]
            elif ki and snowball_path_holder[n][N_STEPS] <= Ko[batch_id]:          # blo got knocked down for a bit but finished above Ki
                price =0
            d_s[n] = price * math.exp(-r[batch_id])
            snowball_path_holder[n][-1] = d_s[n]    




However, for some reason, increasing the number of paths and/or the number of batches greatly slows down my monte's carlo's computational speed. I have no definitive proof that this is the case, but I strongly belive it to be because threads are becoming unsynched as the code runs on, making both greater paths and greater batches than my current settings have much slower run times than their current values. Not that my current code isn't slower than it should be, either. <br>
 Max len controls the number of data points, path controls how accurate each data point is. 
 

Once finished running, data is saved into a directory.
<br>
If you think running a large number like 1 mil mcs takes way too long, throw these first three cells into a python file (datasetgen.py) and let it run in its own terminal while going forward in the notebook with a smaller set for test purposes.

for demonstration purposes, only a small amount of data is generated here. If large amounts of data are to be generated, **PLEASE** go to datasetgen.py instead. there is much more stuff there that isnt incorporated here since i dont like scrolling htat much that would make generating data a bit easier (generates data in chunks so that its safer, ability to run mutliple process of program at the same time thru currnumm)  

In [8]:
#               make sure max_len is large enough or else divide by zero error occurs (at least 100 batches must be run)
limiter = True
# max_len = 1000000                 # hundo thousand data points, final speed is ~40 min for 1000 data.
max_len = 10                 # hundo thousand data points, final speed is ~40 min for 1000 data.
number_path = 500000
batch = 1
threads = 256
seed  =1999 
num = 0
max_length = max_len
N_PATHS = number_path
N_STEPS = 365
N_BATCH  =batch

max_length = max_length // N_BATCH
percenter  =100
percent = max_length // percenter

#           uncomment if u want less batches, the percent will just be wrong
if percent == 0:
    percent = 1

# we will not be calculating a starting date since the difference is negligible and I aint rigging up
# a system to check if a certain day is a weekend or not
MONTHS = cupy.asnumpy([0, 31,59,90,120,151,181,212,243, 273,304,334])
        # SHOULD THIS BE NP ARRAY INSTEAD????
snowball_path_holder =  np.zeros(N_BATCH*N_PATHS, dtype=(np.float32,N_STEPS+1))# extra 1 is no longer for storing payoff
# self.snowball_path_holder = cupy.array(self.snowball_path_holder)
# self.T  = np.float(365.0)         # nah id lose. 
output = cupy.zeros(N_BATCH*N_PATHS, dtype = cupy.float32)
num_blocks  =(N_PATHS * N_BATCH -1) // threads +1
num_threads = threads

# torch.cuda._sanitizer.enable_cuda_sanitizer()
# Xs =  np.zeros(max_length, dtype=(np.float32,7))#
# Ys =  np.zeros(max_length, dtype=(np.float32))#          storing final data

Xss = []
Yss = []
# Xss = np.zeros((max_length, 7))
# Yss = np.zeros((max_length, 1))
currnum = len(os.listdir('snow_data_tensor_train'))//2+1
print("Adding files starting from", currnum)

print("Num batches:", N_BATCH)

# making sure self.snowball_path_holder is zeroed to avoid bug
# self.snowball_path_holder.fill(0)
s = time.time()

for i in range(1,max_length+1):
        randoms = cupy.random.normal(0,1, N_BATCH * N_PATHS * N_STEPS, dtype= cupy.float32)

        Xpre = cupy.random.rand(N_BATCH, 7, dtype = cupy.float32)
        #                        s_0,  Ki, Ko,  mu, sigma, pot, r
        Xpre = Xpre * cupy.array([4,  -2,  1,  .01,  .15,  10, .01], dtype=cupy.float32)
        X = Xpre +    cupy.array([8,   0,  0,  .02, .275,  15, .02], dtype=cupy.float32)
        # Ki and Ko will be set down here instead of the previous line to make them relative to s_0.
        X[:, 1] = X[:,0] -1         # overriding Ki and Ko 
        X[:, 2] = X[:,0] -.2        
        X[:, 1] += Xpre[:,1]        # adding back the offset in Xpre after it gets overrided
        X[:, 2] += Xpre[:,2] 

        snowball_path_holder.fill(0)
                                        # d_s, s_0, Ki, Ko, mu, sigma, pot,r,
                                        # d_normals, snowball_path_holder, MONTHS,
                                        # N_STEPS, N_PATHS, N_BATCH):
        monte_carlo_andtheholygrail_gpu[(num_blocks,), (num_threads,)](
                                        output, X[:, 0], X[:, 1], X[:, 2], X[:, 3], 
                                        X[:, 4], X[:, 5], X[:, 6],
                                        randoms, snowball_path_holder, MONTHS,
                                        N_STEPS, N_PATHS, N_BATCH)
        # o = output.reshape(N_BATCH, N_PATHS)
        # Y  =o.mean(axis =1)         # getting the average of each batch
        Y = output.mean()
        # Y = output
        X = X.mean(axis=0)
        Xss.append(X.tolist())
        Yss.append(Y.tolist())
        print(Yss)
        # Xss.append(X)
        # Yss.append(Y)

        # have following turned off. go to datasetgen.py for a better view.
        # if(i%percent==0):
        #     if limiter:
        #         if currnum > percenter:
        #             print("premature exit, burunyu~")
        #             break
        #     e = time.time()
        #     print(i/(percent), "percent of the way there! Time is now:", (e-s)/60/60, "hours")
        #     # print(i/(percent*10), "percent of the way there! Time is now:", e-s, "secs")
        #     print("now saving tsnowX_{}.pt".format(currnum) )
        #     tensorX = np.array(Xss)
        #     tensorY = np.array(Yss)
        #     tensorX = torch.Tensor(tensorX)
        #     tensorY = torch.Tensor(tensorY)
        #     torch.save(tensorX, f"snow_data_tensor_train/tsnowX_{currnum}.pt")
        #     torch.save(tensorY, f"snow_data_tensor_train/tsnowY_{currnum}.pt")
        #     Xss.clear()
        #     Yss.clear()
        #     currnum += 1

        num+=1          #actually useless
        # print((from_dlpack(X.toDlpack()), from_dlpack(Y.toDlpack()))) 
        
        Xss.clear()
        Yss.clear()

v = output.mean()
cuda.synchronize()
e = time.time()
print('time', e-s, 'v', v, 'avg time', (e-s)/500000)

# Xs = np.array(Xss)
# Ys = np.array(Yss)
Xss = np.array(Xss)
Yss = np.array(Yss)
# print(Xss)
print(Yss)

tensorX = torch.Tensor(Xss)
tensorY = torch.Tensor(Yss)
print(tensorX)
print(tensorY)

## i have following turned off but feel free to do stuff with it

# torch.save(tensorX, "snow_data_tensor_train/tsnowX.pt")
# torch.save(tensorY, "snow_data_tensor_train/tsnowY.pt")

Adding files starting from 101
Num batches: 1




[4.882806777954102]
[2.3875019550323486]
[5.110105991363525]
[2.0633299350738525]
[2.978761672973633]
[2.733570098876953]
[2.55251407623291]
[2.243455648422241]
[2.852839469909668]
[3.885261058807373]
time 2.7585599422454834 v 3.885261 avg time 5.517119884490967e-06
[]
tensor([])
tensor([])


### A small tangent on datasetgen
So... after a lot of finagling with datasetgen.py, I have realized that it is most certianly the case that the code runs slower due to needing to run 500000 paths before it can synch back up again, causing great slowdown, espeicaly with larger batch/path numbers. However, this does not prevent us from running multiple different processes of the same code, filling up the gpu with power of more processes instead. Though there is some slowdown caused by having more processes, there is an almost 2 times improvement over runnign one process of the code when using 3 processes. Thats a lot! Of course, if you can get more processes runnign without gpu's memory going to 100% and locking up the program for extended periods of time, this should allow for as much speedup as you would get by using all of your gpu???? I think???? Ive been runnign datasetgen in its own power shells to prevent restarts of vs code from restarting it (pylance keeps crashing >:( ), but this can obviously be turned all into a single program that creates as many processes as you want! Therefore there is also datasetgen_multi.py which can do exactly that! its a slight modification of datasetgen.py but is a big quality of life update, greatly lowering the amount of power shells i need to open every time i run the program (u cant close em while they are runnign since they are locked in and dont like to stop at keyboard interrupts)

<img src = pics\big_scary_hacker_man.jpg width = 700> <br>

**Fig 1. Me being big scary hacker man 😱** (6 process was a mistake and now all of them are frozen)

### Back to regularly scheduled programming (loading dataset made by datasetgen.py)
Loading the data to confirm its existence! turning it into gpu monsters to prepare for throwing it into model to train! can prolly be done be done before saving, thouhg :/ <br>
oh well, it takes like no time to do

In [34]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import transforms, utils, datasets
import os

path = "snow_data_tensor"

def npy_loader(path):
    sample = torch.load(path)
    return sample

# dataset = datasets.DatasetFolder(
#     root=path,
#     loader=npy_loader,
#     extensions=['.pt']
#     )
# tensor_x = torch.Tensor(Xss)
# tensor_y = torch.Tensor(Yss)

finnum = len(os.listdir("snow_data_tensor_train"))//2+1          # equivalent to currnum of a previous box

tensor_x_L = torch.load("snow_data_tensor_train/tsnowX_1.pt")
tensor_y_L = torch.load("snow_data_tensor_train/tsnowY_1.pt")


for tensor_num  in range(2, finnum):
    tensor_x_R = torch.load(f"snow_data_tensor_train/tsnowX_{tensor_num}.pt")
    tensor_y_R = torch.load(f"snow_data_tensor_train/tsnowY_{tensor_num}.pt")   

    #   cat left side with right side (kiara and kamma???)
    tensor_x_L = torch.cat((tensor_x_L, tensor_x_R), 0)
    tensor_y_L = torch.cat((tensor_y_L, tensor_y_R), 0)




print(tensor_x_L)
print(tensor_y_L)

# print(tdataset)

# for i in tdataset:
# #     # print(i, "\n")
#     print(i)     # printing the Ys
# print(len(dataset)/2)
# print(dataset)
# inputs = tensor_x.float().to(device)
# # values = tensor_y.float().to(device)
# print(f"Input device is : cuda:{tensor_x.get_device()}")
# print(f"Target value device is : cuda:{tensor_y.get_device()}")

tensor([[10.7849,  9.7630, 10.9411,  ...,  0.4168, 18.8266,  0.0270],
        [11.2135,  8.3553, 11.3719,  ...,  0.2971, 17.4514,  0.0261],
        [ 8.1927,  6.3303,  8.4115,  ...,  0.4142, 20.6274,  0.0260],
        ...,
        [ 8.5286,  5.6047,  8.3977,  ...,  0.4016, 19.3757,  0.0229],
        [10.5141,  7.9688, 10.7822,  ...,  0.3437, 22.5634,  0.0237],
        [11.9229,  9.6655, 12.6072,  ...,  0.2978, 19.4558,  0.0234]])
tensor([2.2016, 2.8075, 2.7976,  ..., 2.7598, 3.3981, 3.1653])


we can also split the data into more train, validation, and test before saving once more

In [41]:

num_data = tensor_x_L.size()[0]
num_train = int(num_data*.6)
num_val = int(num_data*.8)

train_x, val_x, test_x = torch.tensor_split(tensor_x_L, (num_train, num_val))
train_y, val_y, test_y = torch.tensor_split(tensor_y_L, (num_train, num_val))

# print(train.size())


dir_p = "snow_data_processed"

torch.save(train_x, f"{dir_p}\\train_x.pt")
torch.save(val_x, f"{dir_p}\\val_x.pt")
torch.save(test_x, f"{dir_p}\\test_x.pt")

torch.save(train_y, f"{dir_p}\\train_y.pt")
torch.save(val_y, f"{dir_p}\\val_y.pt")
torch.save(test_y, f"{dir_p}\\test_y.pt")


Now to load the processed data.

In [None]:
dir_p = "snow_data_processed"


train_x = torch.load(f"{dir_p}\\train_x.pt")
train_y = torch.load(f"{dir_p}\\train_y.pt")


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
inputs = train_x.float().to(device)
values = train_y.float().to(device)


tdataset = TensorDataset(inputs,values) # create your datset

Now for some reason, the guy who made the nvidia notebook decided it was a good idea to make the dataset generate itself while training, making the training process significantly slower than if the data was already prepared already. This works, but is not that good. I would not reccomend, 2/5 stars. <br>
edit: he does it the better way in their next notbook :/<br>
still leaving this in since its interesting to look at tho. 

In [4]:

class SnowballDataSet(object):

    def __init__(self, max_len = 10, number_path = 1000, batch = 2, threads = 512, seed  =1999 ):
        self.num = 0
        self.max_length = max_len
        self.N_PATHS = number_path
        self.N_STEPS = 365
        self.N_BATCH  =batch
        # we will not be calculating a starting date since the difference is negligible and I aint rigging up
        # a system to check if a certain day is a weekend or not
        self.MONTHS = cupy.asnumpy([0, 31,59,90,120,151,181,212,243, 273,304,334])
                # SHOULD THIS BE NP ARRAY INSTEAD????
        self.snowball_path_holder =  np.zeros(self.N_BATCH*self.N_PATHS, dtype=(np.float32,self.N_STEPS+1))# extra 1 is no longer for storing payoff
        # self.snowball_path_holder = cupy.array(self.snowball_path_holder)
        # self.T  = np.float(365.0)         # nah id lose. 
        self.output = cupy.zeros(self.N_BATCH*self.N_PATHS, dtype = cupy.float32)
        self.num_blocks  =(self.N_PATHS * self.N_BATCH -1) // threads +1
        self.num_threads = threads

        #  temp_months, snowball_path_holder both added now
        cupy.random.seed(seed)

    def __len__(self):
        return self.max_length

    def __iter__(self):
        self.num = 0
        return self

    #   next basically takes the place of the cell running the mc. As such need to generate
     # (d_s, s_0, Ki, Ko, mu, sigma,  pot,r, d_normals, snowball_path_holder, MONTHS, N_STEPS, N_PATHS, N_BATCH
     # note that all but s_0, Ki, Ko, mu, sigma,  pot,r, d_normals have been generated in init due to their nonrandom nature
    def __next__(self):
        if self.num > self.max_length: 
            raise StopIteration      # nvidia notebook uses raise StopIteration here but p sure its deprecated???
                                      # is used because return returns an extra None
        # generating the variables
        # d_normals
        randoms = cupy.random.normal(0,1, self.N_BATCH * self.N_PATHS * self.N_STEPS, dtype= cupy.float32)

        Xpre = cupy.random.rand(self.N_BATCH, 7, dtype = cupy.float32)
        #                        s_0,  Ki, Ko,  mu, sigma, pot, r
        Xpre = Xpre * cupy.array([4,  -2,  1,  .01,  .15,  10, .01], dtype=cupy.float32)
        X = Xpre +    cupy.array([8,   0,  0,  .02, .275,  15, .02], dtype=cupy.float32)
        
        # Ki and Ko will be set down here instead of the previous line to make them relative to s_0.
        X[:, 1] = X[:,0] -1         # overriding Ki and Ko 
        X[:, 2] = X[:,0] -.2        
        # print(X)
        X[:, 1] += Xpre[:,1]        # adding back the offset in Xpre after it gets overrided
        X[:, 2] += Xpre[:,2] 

        # making sure self.snowball_path_holder is zeroed to avoid bug
        self.snowball_path_holder.fill(0)

                                        # d_s, s_0, Ki, Ko, mu, sigma, pot,r,
                                        # d_normals, snowball_path_holder, MONTHS,
                                        # N_STEPS, N_PATHS, N_BATCH):
        monte_carlo_andtheholygrail_gpu[(self.num_blocks,), (self.num_threads,)](
                                        self.output, X[:, 0], X[:, 1], X[:, 2], X[:, 3], 
                                        X[:, 4], X[:, 5], X[:, 6],
                                        randoms, self.snowball_path_holder, self.MONTHS,
                                        self.N_STEPS, self.N_PATHS, self.N_BATCH)
        
        o = self.output.reshape(self.N_BATCH, self.N_PATHS)
        Y  =o.mean(axis =1)         # getting the average of each batch
        self.num+=1
        return (from_dlpack(X.toDlpack()), from_dlpack(Y.toDlpack()))



And now a small test run.

In [30]:
# ds = SnowballDataSet(10, number_path=500000, batch=16, seed=15)
ds = SnowballDataSet(10, number_path=500000, batch=1, seed=15)
for i in ds:
    # print(i, "\n")
    print(i[0],i[1])     # printing the Ys

NameError: name 'SnowballDataSet' is not defined

### Creating the model

Erm pretty default model. Just making it have functionality. normalizing it accoriding to the average value of all of the input variables.

In [31]:
%%writefile snow_model.py
import torch.nn as nn
import torch.nn.functional as F
import torch


class Net(nn.Module):

    def __init__(self, hidden=1024):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(7, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, hidden)
        self.fc6 = nn.Linear(hidden, hidden)
        self.fc7 = nn.Linear(hidden, 1)
        self.register_buffer('norm',
                             torch.tensor([10.0,
                                           8.5,
                                           10.4,
                                           0.025,
                                           0.35,
                                           0.20,
                                           0.025]))

    def forward(self, x):
        # normalize the parameter to range [0-1] 
        x = x / self.norm
        x = F.elu(self.fc1(x))
        x = F.elu(self.fc2(x))
        x = F.elu(self.fc3(x))
        x = F.elu(self.fc4(x))
        x = F.elu(self.fc5(x))
        x = F.elu(self.fc6(x))
        return self.fc7(x)


Overwriting snow_model.py


### Running the model
as the amount of data is relatively small, a smallish batch size will be used 

In [33]:
from ignite.engine import Engine, Events
from ignite.handlers import Timer
from torch.nn import MSELoss
from torch.optim import Adam
from ignite.contrib.handlers.param_scheduler import CosineAnnealingScheduler
from ignite.handlers import ModelCheckpoint
from torch.cuda import amp       # apex.amp is deprecated. it cannot be regenerated.

from snow_model import Net
# from snow_model_module import Net
# from snow_model_2 import Net

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device used : {device}")

# from cupy_dataset import OptionDataSet
timer = Timer(average=True)
model = Net().cuda()
loss_fn = MSELoss()
optimizer = Adam(model.parameters(), lr=1e-3)

set_amp = True
scaler = amp.GradScaler(enabled=set_amp)

# dataset = OptionDataSet(max_len=10000, number_path = 1024, batch=4800)
# dataset = SnowballDataSet(max_len = 50000, number_path = 500000, batch = 4, threads = 512, seed  =1999 )
# dataset = datasets.DatasetFolder(
#     root=path,
#     loader=npy_loader,
#     extensions=['.pt']
# )
# dataset = atDataset()
dataset = DataLoader(tdataset, 100)

# dataset size is 10,000
def train_update(engine, batch):
    with torch.autocast(device_type='cuda', dtype=torch.float16): ########### automatic mixed precision
        model.train()
        optimizer.zero_grad()
        x = batch[0]
        y = batch[1]
        y_pred = model(x)
        loss = loss_fn(y_pred[:,0], y)
        assert y_pred.dtype is torch.float16 ##################
        assert loss.dtype is torch.float32 ##################
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    # loss.backward()
    # optimizer.step()
    optimizer.zero_grad() # set_to_none=True here can modestly improve performance  WHAET DOES THIS DO
    return loss.item()

trainer = Engine(train_update)
log_interval = 1000

scheduler = CosineAnnealingScheduler(optimizer, 'lr', 1e-4, 1e-6, len(dataset))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
timer.attach(trainer,
             start=Events.EPOCH_STARTED,
             resume=Events.ITERATION_STARTED,
             pause=Events.ITERATION_COMPLETED,
             step=Events.ITERATION_COMPLETED)    
@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(engine):
    iter = (engine.state.iteration - 1) % len(dataset) + 1
    if iter % log_interval == 0:
        print('loss', engine.state.output, 'average time', timer.value())

# @trainer.on(Events.GET_BATCH_STARTED)
# def log_training_loss(engine):
#     print("EPOCH!!!!!!!!!!!!\n")
        
# trainer.run(dataset, max_epochs=100)
trainer.run(dataset, max_epochs=100)

Device used : cuda:0
loss 0.8380540013313293 average time 0.002695952901325654
loss 0.22348438203334808 average time 0.0024961021010240073
loss 0.70233154296875 average time 0.002430481867166236
loss 0.3667253851890564 average time 0.002397824225219665
loss 0.5217218399047852 average time 0.0023931232399656437
loss 0.19837328791618347 average time 0.0023958347500301898
loss 0.22395247220993042 average time 0.0024080648715192055
loss 0.2895205616950989 average time 0.0024176712874686927
loss 0.23500633239746094 average time 0.0024345940334153256
loss 0.9038403034210205 average time 0.0025410603992058896
loss 0.16764132678508759 average time 0.00249455269947066
loss 0.32438400387763977 average time 0.0025345728329848496
loss 0.21896147727966309 average time 0.002564192599413218
loss 0.33764052391052246 average time 0.00257362581982743
loss 0.14903752505779266 average time 0.0025618552666952987
loss 0.16698071360588074 average time 0.0025267774428024757
loss 0.16793979704380035 average ti

State:
	iteration: 999400
	epoch: 100
	epoch_length: 9994
	max_epochs: 100
	output: 0.0018181210616603494
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch

hidden = 1024

model_o = nn.Sequential(
    nn.Linear(7, hidden),
    F.elu(),
    nn.Linear(hidden, hidden),
    F.elu(),
    nn.Linear(hidden, hidden),
    F.elu(),
    nn.Linear(hidden, hidden),
    F.elu(),
    nn.Linear(hidden, hidden),
    F.elu(),
    nn.Linear(hidden, 1),
    F.elu(),
)

In [None]:
def make_model(in_size, out_size, hidden,  num_layers):
    layers = []
    layers.append(torch.nn.Linear(in_size, hidden))
    layers.append(torch.nn.functional.elu())
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(hidden, hidden))
        layers.append(torch.nn.functional.elu())
    layers.append(torch.nn.Linear(hidden, out_size))
    return torch.nn.Sequential(*tuple(layers)).cuda()

### Nah, id lose, and lose bad
cudf is not supported on windows :/ 

<img src = pics\galaxy-angel-mint-blancmanche.gif>

In [36]:
import dask
dask.config.set({"dataframe.backend": "cudf"})
import dask_cudf
from dask.delayed import delayed
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster()
from dask.distributed import Client
client = Client(cluster)
client

ModuleNotFoundError: No module named 'dask_cudf'