In [0]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.utils.data

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

from fastai import basic_train, basic_data
from fastai.callbacks import ActivationStats
import fastai

import tracemalloc

import pickle as pkl

In [0]:
class AE_3D_200(nn.Module):
    """
    Fully-connected autoencoder with a 3-dimensional bottleneck.

    Layer widths are n_features-200-100-50-3-50-100-200-n_features and tanh is the only
    non-linearity. The state-dict keys are derived from the attribute names
    (en1..en4, de1..de4), so renaming them would break loading previously saved weights.
    """

    def __init__(self, n_features=4):
        """
        Build the eight linear layers and the shared activation.
        input:
        - n_features:int=4  Width of the input vectors (and of the reconstructed output).
        """
        super().__init__()

        # Encoder: n_features -> 200 -> 100 -> 50 -> 3
        self.en1 = nn.Linear(n_features, 200)
        self.en2 = nn.Linear(200, 100)
        self.en3 = nn.Linear(100, 50)
        self.en4 = nn.Linear(50, 3)  # produces the 3d latent code

        # Decoder: 3 -> 50 -> 100 -> 200 -> n_features
        self.de1 = nn.Linear(3, 50)
        self.de2 = nn.Linear(50, 100)
        self.de3 = nn.Linear(100, 200)
        self.de4 = nn.Linear(200, n_features)  # produces the reconstruction

        # One Tanh module (output range (-1, 1)) reused after/before every layer
        self.tanh = nn.Tanh()

    def encode(self, x):
        """
        Map raw vectors to their 3d latent representation.
        input:
        - x:torch.tensor(dtype=torch.float32) Raw data, last dimension of size n_features
        output:
        - out:torch.tensor(dtype=torch.float32) Latent codes, last dimension of size 3
        """
        hidden = x
        # tanh follows each of the first three encoder layers
        for layer in (self.en1, self.en2, self.en3):
            hidden = self.tanh(layer(hidden))
        # the bottleneck layer itself is left without an activation
        return self.en4(hidden)

    def decode(self, x):
        """
        Map 3d latent codes back to the input space.
        input:
        - x:torch.tensor(dtype=torch.float32) Latent codes, last dimension of size 3
        output:
        - out:torch.tensor(dtype=torch.float32) Reconstructed data, last dimension of size n_features
        """
        hidden = x
        # tanh is applied to the *input* of every decoder layer (including the latent code
        # itself); the output of the final layer, 200d => n_features, is not activated
        for layer in (self.de1, self.de2, self.de3, self.de4):
            hidden = layer(self.tanh(hidden))
        return hidden

    def forward(self, x):
        """
        Full pass through the network: encode, then decode.
        input:
        - x:torch.tensor(dtype=torch.float32) Raw data
        output:
        - out:torch.tensor(dtype=torch.float32) Reconstruction of x
        """
        return self.decode(self.encode(x))

In [0]:
def get_data(train_ds, valid_ds, bs):
    """
    Wrap the training and validation datasets in DataLoaders.
    input:
    - train_ds  The training dataset (anything DataLoader accepts)
    - valid_ds  The validation/test dataset
    - bs:int    Batch size used for training
    output:
    - out:tuple (train_loader, valid_loader). The training loader shuffles and uses
      batches of bs; the validation loader keeps dataset order and uses batches of 2*bs.
    """
    valid_bs = bs * 2  # validation batches are twice the training batch size
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=bs)
    valid_loader = DataLoader(valid_ds, batch_size=valid_bs)
    return train_loader, valid_loader

# --- Weight-decay settings, passed to the fastai Learner when it is built ---
wd = 1e-6       # weight-decay coefficient
true_wd = True  # fastai flag: weight decay will be used for all optimizers
bn_wd = False   # fastai flag: don't use weight decay for batchnorm layers

# --- Objective: mean squared error (squared L2 norm) between output and target ---
loss_func = nn.MSELoss()

# --- Network: the autoencoder class implemented above, with its default input width ---
model = AE_3D_200()

In [0]:
tracemalloc.start() # start tracing Python memory allocations made while loading the data

try:
    # Load the train and test datasets from pkl files.
    # NOTE(review): pickle.load can execute arbitrary code while unpickling; only open
    # files that come from a trusted source.
    with open('/content/drive/My Drive/all_jets_test_4D_100_percent.pkl', 'rb') as file:
        test = pd.DataFrame(pkl.load(file))

    with open('/content/drive/My Drive/all_jets_train_4D_100_percent.pkl', 'rb') as file:
        train = pd.DataFrame(pkl.load(file))

    # Number of feature columns. Read from the frame's shape so the result does not
    # depend on a (unique) row labelled 0 being present in the index.
    n_features = train.shape[1]

    # Standardize both sets column-wise. The statistics come from the training set only
    # and are reused for the test set, so both are on the same scale.
    train_mean = train.mean()
    train_std = train.std()
    train = (train - train_mean) / train_std
    test = (test - train_mean) / train_std

    # Capture the allocations recorded so far
    snapshot = tracemalloc.take_snapshot()
finally:
    # Always switch tracing off again, even if loading failed, so it does not keep
    # adding overhead to the cells that follow
    tracemalloc.stop()

# Group the recorded allocations by source line
top_stats = snapshot.statistics('lineno')

# print the biggest five memory blocks
stat = top_stats[0:5]
for s in stat:
    print(str(s.traceback) + " Size: "+ str(s.size) + " bytes")

/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py:1874 Size: 4471296 bytes
/usr/lib/python3.6/linecache.py:137 Size: 1152798 bytes
<ipython-input-18-0fa5be27fe79>:7 Size: 895133 bytes
<ipython-input-18-0fa5be27fe79>:4 Size: 224236 bytes
/usr/lib/python3.6/posixpath.py:372 Size: 221563 bytes


In [0]:
# An autoencoder is trained to reproduce its own input, so the targets are the inputs.
# Note that the *_y names refer to the very same DataFrames as the *_x names (no copy is made).
train_x, test_x = train, test
train_y, test_y = train_x, test_x

# Build the tensors with an explicit torch.float (torch.float32) dtype. If the dtype is
# not declared, the data is converted to torch.double (torch.float64), which leads to a
# datatype error when encoding.
train_inputs = torch.tensor(train_x.values, dtype=torch.float)
train_targets = torch.tensor(train_y.values, dtype=torch.float)
valid_inputs = torch.tensor(test_x.values, dtype=torch.float)
valid_targets = torch.tensor(test_y.values, dtype=torch.float)

train_ds = TensorDataset(train_inputs, train_targets)
valid_ds = TensorDataset(valid_inputs, valid_targets)

# Wrap the datasets in dataloaders (training batch size 256)
train_dl, valid_dl = get_data(train_ds, valid_ds, bs=256)

# Bundle both dataloaders into a single fastai data object
db = basic_data.DataBunch(train_dl, valid_dl)

# Create the fastai learner; ActivationStats records the mean and std of the activations
learn = basic_train.Learner(
    data=db,
    model=model,
    loss_func=loss_func,
    wd=wd,
    callback_fns=ActivationStats,
    bn_wd=bn_wd,
    true_wd=true_wd,
)

# Restore the weights of the previously trained network
learn.load('/content/drive/My Drive/AE_3D_200_no1cycle_trainforever')

# Move the model to the CPU (as the last expression of the cell this also displays its layers)
model.to('cpu')

AE_3D_200(
  (en1): Linear(in_features=4, out_features=200, bias=True)
  (en2): Linear(in_features=200, out_features=100, bias=True)
  (en3): Linear(in_features=100, out_features=50, bias=True)
  (en4): Linear(in_features=50, out_features=3, bias=True)
  (de1): Linear(in_features=3, out_features=50, bias=True)
  (de2): Linear(in_features=50, out_features=100, bias=True)
  (de3): Linear(in_features=100, out_features=200, bias=True)
  (de4): Linear(in_features=200, out_features=4, bias=True)
  (tanh): Tanh()
)

In [0]:
# Print the number of test events. The row count of the DataFrame is used directly,
# instead of converting the whole frame to a tensor just to read its length.
number_of_events = len(test)
print("Number of events: " + str(number_of_events))

tracemalloc.start() # start tracking the memory usage of encoding

# Encode the test data. This is inference only, so autograd is switched off: no
# computation graph is built and the result can be converted to numpy directly.
with torch.no_grad():
    compressed = learn.model.encode(torch.tensor(test.values, dtype=torch.float)).numpy()

# Take a snapshot of the allocations recorded so far.
# NOTE(review): tracing is NOT stopped in this cell; confirm whether a later cell relies
# on it still running before adding tracemalloc.stop() here.
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('traceback')

# Pick the biggest traced memory block and print where it was allocated.
# NOTE(review): tracemalloc only sees allocations made through Python's allocators, so
# tensor storage allocated natively by torch is presumably not included; verify before
# drawing conclusions about the size of the encoded data from this number.
stat = top_stats[0]
print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
for line in stat.traceback.format():
    print(line)

Number of events: 27945
271 memory blocks: 14.1 KiB
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/compilerop.py", line 100
    return compile(source, filename, symbol, self.flags | PyCF_ONLY_AST, 1)
