In [None]:
'''
This notebook takes the large *.tgz file created by the batch jobs notebook, together with the inputs generated by the MatrixBuilder notebook,
and creates the tensor files that are imported to train and run the model.

Make sure you import the correctly sized matrix from the matrices directory, and set exp (the experiment name) to match the one from your batch jobs notebook.
'''

import os
import csv
import tarfile
import pandas as pd


# Import the pre-generated input matrix module from the project-local `matrices` package.
# The comment in the original says it is loaded as a python list -- TODO confirm the exact type.
# Make sure this is the correct size for the number of jobs you want to run!
import matrices.LHS_1000

# Bind the module's `matrix` attribute to a short name used by every later cell.
# Again, make sure this is the matrix that was used for the batch jobs!!
matrix = matrices.LHS_1000.matrix

In [None]:
# Experiment name -- must match the one used for the generated outputs in the batch job notebook!!!
exp = 'soybeans'
# Number of entries in the input matrix; used for the split points and the file names below
matrix_size = len(matrix)

In [None]:
# Split points for the inputs and outputs: the first 80% is training data,
# the next 10% validation, and the final 10% test.
train_end = int(matrix_size * .8)
validate_end = int(matrix_size * .9)
splits = [train_end, validate_end]
splits

In [None]:
# path of the tgz archive written by the previous notebook
gz_file = f'outputs/{exp}_{matrix_size}.tgz'
gz_file

In [None]:
# Open the archive for reading; "r:*" lets tarfile detect the compression transparently.
# To inspect the contents, run: tar.getmembers()
tar = tarfile.open(name=gz_file, mode="r:*")

In [None]:
%%time

# read in all of the csv files into separate pandas dataframes and store those in a list
# this can take ~40 minutes when there's 10k of them

csv_files = tar.getmembers()[1:]  # skips the first one because it's a directory not a file
csv_files = sorted(csv_files, key=lambda m: m.name)  # they come out in a very odd order, want them 0-9 sorted
print(len(csv_files))
dfs = []
for csv in csv_files:
    df = pd.read_csv(tar.extractfile(csv))
    dfs.append(df)

In [None]:
# sanity check: how many DataFrames were read from the archive
print(len(dfs))

In [None]:
%%time

# convert list to numpy array of type float32

import torch
import numpy as np

np_inputs = np.array(matrix, dtype="float32")
np_inputs

In [None]:
%%time

# split the numpy input array into 3 chunks (training, test, validate)
[in_train_np, in_validate_np, in_test_np] = np.array_split(np_inputs, splits)
print('train: {}, val: {}, test: {}'.format(len(in_train_np), len(in_validate_np), len(in_test_np)))

In [None]:
%%time

# grab just the columns we want. And right now we're just grabbing the last value of each of them

np_outs_list = []
for df in dfs:
    np_outs_list.append([df['somtc'].iat[-1], df['somsc'].iat[-1], df['agcprd'].iat[-1], 
                         df['cgrain'].iat[-1], df['stemp'].iat[-1]])
    # print('appending somsc={} and bglivcj={}'.format(df['somsc'].iat[-1], df['bglivcj'].iat[-1]))
len(np_outs_list)

In [None]:
# number of output rows collected (one per DataFrame)
len(np_outs_list)

In [None]:
%%time

# create a Numpy array

np_outs = np.array(np_outs_list, dtype="float32")
np_outs

In [None]:
# number of rows in the output array
len(np_outs)

In [None]:
%%time

# make the input tensors
in_train_tensor = torch.from_numpy(in_train_np)
print(in_train_tensor)
in_validate_tensor = torch.from_numpy(in_validate_np)
print(in_train_tensor)
in_test_tensor = torch.from_numpy(in_test_np)
print(in_test_tensor)

In [None]:
# persist the three input tensors as .npy files in the outputs directory
in_prefix = 'outputs/{}_{}_in_'.format(exp, matrix_size)
np.save(in_prefix + 'train_tensor.npy', in_train_tensor)
np.save(in_prefix + 'test_tensor.npy', in_test_tensor)
np.save(in_prefix + 'validate_tensor.npy', in_validate_tensor)
print('done saving input tensor files')

In [None]:
# re-check the number of output rows before splitting them
len(np_outs)

In [None]:
# Split the outputs 80-10-10 using the same split points as the inputs.
# The split points were derived from matrix_size, so the outputs must contain exactly
# one row per input row. Otherwise the chunks silently stop lining up with the inputs.
if len(np_outs) != matrix_size:
    raise ValueError(
        'output rows ({}) do not match input rows ({}); check that the archive '
        'belongs to this matrix and that every batch job produced a csv'.format(len(np_outs), matrix_size)
    )
[out_train_np, out_validate_np, out_test_np] = np.array_split(np_outs, splits)
print('train: {}, val: {}, test: {}'.format(len(out_train_np), len(out_validate_np), len(out_test_np)))

In [None]:
%%time

# make the output tensors
out_train_tensor = torch.from_numpy(out_train_np)
print(out_train_tensor)
out_validate_tensor = torch.from_numpy(out_validate_np)
print(out_train_tensor)
out_test_tensor = torch.from_numpy(out_test_np)
print(out_test_tensor)

In [None]:
# persist the three output tensors as .npy files in the outputs directory
out_prefix = 'outputs/{}_{}_out_'.format(exp, matrix_size)
np.save(out_prefix + 'train_tensor.npy', out_train_tensor)
np.save(out_prefix + 'test_tensor.npy', out_test_tensor)
np.save(out_prefix + 'validate_tensor.npy', out_validate_tensor)
print('done saving output tensor files')

In [None]:
# Summarise the input tensors: shape, dtype and device for each of the three splits.
# Each group is preceded by print("\n") and printed with one call, one item per line.
print("\n")
print(
    f"Shape of in_train_tensor: {in_train_tensor.shape}",
    f"Datatype of in_train_tensor: {in_train_tensor.dtype}",
    f"Device in_train_tensor is stored on: {in_train_tensor.device}",
    sep="\n",
)
print("\n")
print(
    f"Shape of in_test_tensor : {in_test_tensor.shape}",
    f"Datatype of in_test_tensor: {in_test_tensor.dtype}",
    f"Device in_test_tensor is stored on: {in_test_tensor.device}",
    sep="\n",
)
print("\n")
print(
    f"Shape of in_validate_tensor: {in_validate_tensor.shape}",
    f"Datatype of in_validate_tensor: {in_validate_tensor.dtype}",
    f"Device in_validate_tensor is stored on: {in_validate_tensor.device}",
    sep="\n",
)

In [None]:
# Summarise the output tensors: shape, dtype and device for each of the three splits.
# Each group is preceded by print("\n") and printed with one call, one item per line.
print("\n")
print(
    f"Shape of out_train_tensor: {out_train_tensor.shape}",
    f"Datatype of out_train_tensor: {out_train_tensor.dtype}",
    f"Device tensor is stored on: {out_train_tensor.device}",
    sep="\n",
)
print("\n")
print(
    f"Shape of out_test_tensor : {out_test_tensor.shape}",
    f"Datatype of out_test_tensor: {out_test_tensor.dtype}",
    f"Device out_test_tensor is stored on: {out_test_tensor.device}",
    sep="\n",
)
print("\n")
print(
    f"Shape of out_validate_tensor: {out_validate_tensor.shape}",
    f"Datatype of out_validate_tensor: {out_validate_tensor.dtype}",
    f"Device out_validate_tensor is stored on: {out_validate_tensor.device}",
    sep="\n",
)

In [None]:
'''
Reloading the saved files from disk looks roughly like this (not done in this notebook):

out_np = np.load('outputs/{}_{}_out_train_tensor.npy'.format(exp, matrix_size))
out_tensor = torch.from_numpy(out_np)
out_tensor
'''
