# `pack_data.ipynb`

* Takes the raw data and unwraps the dict structure to efficiently have all trajectory data in a bunch of NumPy arrays.
* Running the entire notebook will produce a file `data_packed.npz`. If $N$ is the *total* number of trajectories, with $L$ labels and $P$ trajectory variables:
    * names of the variables stored as trajectory data (`sq_disp` etc.) are stored in `data_names`, length $P$
    * likewise names of the structure parameters to be predicted (`lattice_type` etc.) are stored in `label_names`, length $L$
    * `labels` is an array of shape $(N, L)$ that stores the structure parameters for each trajectory (one row per trajectory)
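    * `traj_data` is an array of shape $(P, \sum n)$ where $\sum n$ is the *total number of points in all trajectories combined*... basically the variable-length trajectories are concatenated end to end along the second axis
    * `offsets` is an array of length $N+1$ that stores the starting column of each trajectory in `traj_data`; the final entry is $\sum n$, so `offsets[i+1]` is always a valid end index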
    * `config_indices` is an array of length $N$ that tracks the disorder configuration that each trajectory came from. Note that these are *not* globally unique! Only guaranteed to be unique within a single set of lattice parameters
* So basically you get trajectory `i` by indexing `traj_data[:,offsets[i]:offsets[i+1]]`, and the lattice parameters are `labels[i]`.

In [1]:
import sys
import os
import glob
import re
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pickle
%matplotlib inline

# Loading and prep work

In [3]:
def load_npz(lattice_type, dilution, hopping_type):
    """Open the raw results archive for one (lattice, dilution, hopping) combination.

    `hopping_type` accepts the shorthands 'nn' and 'lr', which are expanded to
    the 'nearestneighbor' and 'alpha=6' filename tags; any other value is used
    verbatim. The archive is read from the relative `Data/` directory.
    """
    # NOTE: allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code -- only point this at trusted files.
    shorthand = {'nn': 'nearestneighbor', 'lr': 'alpha=6'}
    tag = shorthand.get(hopping_type, hopping_type)
    path = f"Data/{lattice_type}_p={dilution}%_{tag}.npz"
    return np.load(path, allow_pickle=True)

In [4]:
def parse_npz_key(filename):
    """Derive the `npzs` dict key from a raw-data filename.

    Filenames look like `<lattice>_p=<percent>%_<hopping>.npz`. The returned
    key is `(lattice_type: str, p: int, is_lr: bool)`, where `is_lr` is True
    exactly when the hopping tag is 'alpha=6'.
    """
    # Strip the directory and extension by name rather than by fixed character
    # counts, so the parse does not depend on the length of the folder prefix.
    stem = os.path.splitext(os.path.basename(filename))[0]
    lattice_type, p, hopping_type = stem.split('_')
    # p looks like 'p=75%': drop the leading 'p=' and the trailing '%'.
    return lattice_type, int(p[2:-1]), hopping_type == 'alpha=6'


# Maps (lattice_type: str, p: int, is_lr: bool) -> opened .npz archive.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the dict (and therefore everything built by iterating over it) reproducible.
# NOTE: allow_pickle=True unpickles object arrays -- trusted files only.
npzs = {}
for filename in sorted(glob.glob('Data/*.npz')):
    npzs[parse_npz_key(filename)] = np.load(filename, allow_pickle=True)

In [5]:
# Canonical ordering of the lattice types; a lattice's position in this list
# is the integer code that LATTICE_TYPE_LOOKUP maps its name to.
LATTICE_NAMES = ['SC', 'BCC', 'FCC', 'diamond']
LATTICE_TYPE_LOOKUP = dict(zip(LATTICE_NAMES, range(len(LATTICE_NAMES))))

In [143]:
# Optional scratch data for interactive inspection: one archive, its results
# array, and the first entry of that array.
LOAD_TESTPOINT = True
if LOAD_TESTPOINT:
    testdata = load_npz(lattice_type='FCC', dilution=75, hopping_type='nn')
    testres = testdata['results']
    testpoint = testres[0]

Note to self:
* fields of a NPZ are: `['results', 'size', 'a', 'lattice_type', 'diluted', 'prob', 'nearest_neighbor', 'A', 'alpha', 't_max', 'n_trajectories', 'n_configs']`
* fields of the results file are: `['wait_times', 'jump_lengths', 'times', 'sq_disp', 'distinct_sites', 'sites']`

This iterates over all `.npz` files twice: once to do some basic analysis of sizes involved, thus allowing preallocation of the array... this has the nice effect of avoiding Python list overhead but it does take like an extra minute... if data size hasn't changed, here are the precomputed numbers:

# Simple analytics / precomputing sizes

In [6]:
# Set to False to re-scan every archive and recompute the totals below.
USE_PRECOMPUTED = True
if USE_PRECOMPUTED:
    # Cached totals from a previous full scan; must be updated by hand whenever
    # the contents of the data files change.
    tot_n_traj_points = 65082776
    tot_n_traj = 320000
else:
    tot_size = 0
    tot_n_traj = 0
    traj_lengths = []
    tot_times = []
    for data in tqdm(npzs.values()):
        res = data['results']
        tot_n_traj += len(res)
        for point in res:
            sq_disp = point['sq_disp']
            tot_size += np.array(sq_disp).nbytes
            traj_lengths.append(len(sq_disp))
            # Last entry of 'times' for this trajectory.
            tot_times.append(point['times'][-1])
    tot_times = np.array(tot_times)
    traj_lengths = np.array(traj_lengths)
    tot_n_traj_points = sum(traj_lengths)
    print(f"total data size {tot_size/1e6:.2f} MB")

In [7]:
# Report the number of trajectories and the combined number of trajectory points.
print(f"n_traj: {tot_n_traj}\nsum of lengths: {tot_n_traj_points}")

n_traj: 320000
sum of lengths: 65082776


# Packing data

In [8]:
# Row order of `traj_data`. The first two series are written starting one
# column after the trajectory start, i.e. they hold one entry fewer than the
# per-point series and their first column is left at zero.
JUMP_KEYS = ('wait_times', 'jump_lengths')
POINT_KEYS = ('times', 'sq_disp', 'distinct_sites', 'sites')
# Number of consecutive entries in `results` that are assigned the same
# configuration index -- presumably trajectories per disorder configuration;
# TODO confirm against the 'n_trajectories' field of the archives.
TRAJ_PER_CONFIG = 1000

# Preallocate everything from the totals computed earlier in the notebook.
traj_data = np.zeros((len(JUMP_KEYS) + len(POINT_KEYS), tot_n_traj_points))
offsets = np.zeros(tot_n_traj + 1, dtype=int)  # extra slot so offsets[i+1] always exists
labels = np.zeros((tot_n_traj, 3), dtype=int)
config_indices = np.zeros(tot_n_traj, dtype=int)  # one entry per trajectory
offsets[-1] = tot_n_traj_points

point_offset = 0  # index of the trajectory currently being written
traj_offset = 0   # first column of that trajectory in traj_data

for (lat, p, is_lr), data in tqdm(npzs.items()):
    res = data['results']
    label = [LATTICE_TYPE_LOOKUP[lat], p, int(is_lr)]
    for i, point in enumerate(res):
        n_steps = len(point['sq_disp'])
        end = traj_offset + n_steps

        labels[point_offset, :] = label
        offsets[point_offset] = traj_offset
        # Archives with p == 100 are all assigned configuration index 0.
        config_indices[point_offset] = i // TRAJ_PER_CONFIG if p < 100 else 0

        # `row` (not `i`) so the trajectory index above is never shadowed.
        for row, key in enumerate(JUMP_KEYS):
            traj_data[row, traj_offset + 1:end] = point[key]
        for row, key in enumerate(POINT_KEYS, start=len(JUMP_KEYS)):
            traj_data[row, traj_offset:end] = point[key]

        point_offset += 1
        traj_offset += n_steps

# Guard against stale totals: if they were too large, the arrays would
# silently keep zero padding at the end.
assert point_offset == tot_n_traj, (
    f"packed {point_offset} trajectories but preallocated {tot_n_traj}")
assert traj_offset == tot_n_traj_points, (
    f"packed {traj_offset} points but preallocated {tot_n_traj_points}")

  0%|          | 0/32 [00:00<?, ?it/s]

In [9]:
# Row names of `traj_data` (same order as the rows) and column names of `labels`.
data_names = ['wait_times', 'jump_lengths', 'times', 'sq_disp', 'distinct_sites', 'sites']
label_names = ['lattice_type', 'p', 'is_lr']
# Write all packed arrays plus their naming metadata to a single archive.
# Every array is passed by keyword so it is stored under that name in the file.
np.savez(
    'data_packed.npz',
    data_names=data_names,
    label_names=label_names,
    lattice_names=LATTICE_NAMES,
    offsets=offsets,
    labels=labels,
    traj_data=traj_data,
    config_indices=config_indices,
)