In [1]:
'''
This notebook takes the large *.tgz file created by the batch jobs notebook, together with the generated inputs from the MatrixBuilder notebook,
and creates the tensor files that are later imported to train and run the model.

Make sure you import the correctly sized matrix from the matrices directory, and set exp (the experiment name) to match the one used in your batch jobs notebook.
'''

import csv
import os
import re
import tarfile

import numpy as np
import pandas as pd
import torch


# Import the pre-generated N input matrix module (LHS_1000 here).
# Make sure this is the correct size for the number of jobs you want to run!
import matrices.LHS_1000

# Bind the module's input matrix to `matrix`; again, make sure this is the correct one!
# NOTE(review): the cells below split inputs and outputs at the same indices, so row i of
# this matrix is presumably expected to pair with the i-th output CSV — confirm.
matrix = matrices.LHS_1000.matrix

In [2]:
# Number of rows in the input matrix; used below for the split points and the file names.
matrix_size = len(matrix)
# Experiment name; it is part of the archive name and of every saved file name.
exp = 'soybeans'  # Be sure to set this correctly for your generated outputs from the batch job notebook!!!

In [3]:
# Split points for an 80% / 10% / 10% partition of the rows: everything before
# train_end is the first chunk, up to validate_end the second, and the rest the third.
train_end = int(matrix_size * .8)
validate_end = int(matrix_size * .9)
splits = [train_end, validate_end]
splits

[800, 900]

In [4]:
# Path of the tgz archive produced by the previous notebook.
gz_file = f'outputs/{exp}_{matrix_size}.tgz'
gz_file

'outputs/soybeans_1000.tgz'

In [5]:
# Open the archive for reading; the "r:*" mode lets tarfile detect the compression itself.
# NOTE(review): this handle is not closed anywhere in the cells shown here.
tar = tarfile.open(gz_file, "r:*")
# tar.getmembers()

In [6]:
%%time

# read in all of the csv files into separate pandas dataframes and store those in a list
# this can take ~40 minutes when there's 10k of them

csv_files = tar.getmembers()[1:]  # skips the first one because it's a directory not a file
csv_files = sorted(csv_files, key=lambda m: m.name)  # they come out in a very odd order, want them 0-9 sorted
print(len(csv_files))
dfs = []
for csv in csv_files:
    df = pd.read_csv(tar.extractfile(csv))
    dfs.append(df)

1000
CPU times: user 15.5 s, sys: 120 ms, total: 15.6 s
Wall time: 15.6 s


In [7]:
# Inputs and outputs are split at the same indices further down, so the number of
# DataFrames read must match the number of matrix rows; fail loudly here instead of
# silently producing mismatched input/output chunks.
assert len(dfs) == matrix_size, (
    'expected {} output files but read {}'.format(matrix_size, len(dfs)))
print(len(dfs))

1000


In [8]:
%%time

# convert list to numpy array of type float32

import torch
import numpy as np

np_inputs = np.array(matrix, dtype="float32")
np_inputs

CPU times: user 352 ms, sys: 50.6 ms, total: 403 ms
Wall time: 429 ms


array([[1.9023800e+00, 2.9023800e+00, 2.9023800e+00, ..., 8.5860896e-01,
        3.2329133e-01, 2.7900000e+02],
       [2.0051787e+00, 3.0051787e+00, 3.0051787e+00, ..., 4.4138268e-01,
        8.1720603e-01, 2.7900000e+02],
       [1.9429902e+00, 2.9429901e+00, 2.9429901e+00, ..., 9.2658317e-01,
        3.8099919e-02, 2.7700000e+02],
       ...,
       [2.4215987e+00, 3.4215987e+00, 3.4215987e+00, ..., 9.0290111e-01,
        9.2163301e-01, 2.8700000e+02],
       [8.7747645e-01, 1.8774765e+00, 1.8774765e+00, ..., 1.9170301e-01,
        5.5725002e-01, 2.8500000e+02],
       [1.7519276e+00, 2.7519276e+00, 2.7519276e+00, ..., 4.0958419e-01,
        4.2733118e-01, 2.8800000e+02]], dtype=float32)

In [9]:
%%time

# split the numpy input array into 3 chunks (training, test, validate)
[in_train_np, in_validate_np, in_test_np] = np.array_split(np_inputs, splits)
print('train: {}, val: {}, test: {}'.format(len(in_train_np), len(in_validate_np), len(in_test_np)))

train: 800, val: 100, test: 100
CPU times: user 146 µs, sys: 0 ns, total: 146 µs
Wall time: 126 µs


In [10]:
%%time

# grab just the columns we want. And right now we're just grabbing the last value of each of them

np_outs_list = []
for df in dfs:
    np_outs_list.append([df['somtc'].iat[-1], df['somsc'].iat[-1], df['agcprd'].iat[-1], 
                         df['cgrain'].iat[-1], df['stemp'].iat[-1]])
    # print('appending somsc={} and bglivcj={}'.format(df['somsc'].iat[-1], df['bglivcj'].iat[-1]))
len(np_outs_list)

CPU times: user 190 ms, sys: 21.7 ms, total: 212 ms
Wall time: 197 ms


1000

In [11]:
# Number of output rows collected from the DataFrames.
len(np_outs_list)

1000

In [12]:
%%time

# create a Numpy array

np_outs = np.array(np_outs_list, dtype="float32")
np_outs

CPU times: user 732 µs, sys: 235 µs, total: 967 µs
Wall time: 975 µs


array([[2.4281787e+03, 2.3865642e+03, 1.2176840e+02, 2.1933000e+00,
        7.8960001e-01],
       [2.5635049e+03, 2.5088774e+03, 1.2374220e+02, 2.1006999e+00,
        3.6280000e-01],
       [2.6459824e+03, 2.5843379e+03, 1.2507270e+02, 1.8857000e+00,
        3.0590001e-01],
       ...,
       [2.4322808e+03, 2.3901003e+03, 1.0937180e+02, 1.7424999e+00,
        6.8769997e-01],
       [2.6858965e+03, 2.6305942e+03, 1.2220000e+02, 2.1262000e+00,
        3.1680000e-01],
       [2.6639910e+03, 2.6018770e+03, 1.2774200e+02, 1.9584000e+00,
        3.9309999e-01]], dtype=float32)

In [13]:
# Number of rows in the output array.
len(np_outs)

1000

In [14]:
%%time

# make the input tensors
in_train_tensor = torch.from_numpy(in_train_np)
print(in_train_tensor)
in_validate_tensor = torch.from_numpy(in_validate_np)
print(in_train_tensor)
in_test_tensor = torch.from_numpy(in_test_np)
print(in_test_tensor)

tensor([[1.9024e+00, 2.9024e+00, 2.9024e+00,  ..., 8.5861e-01, 3.2329e-01,
         2.7900e+02],
        [2.0052e+00, 3.0052e+00, 3.0052e+00,  ..., 4.4138e-01, 8.1721e-01,
         2.7900e+02],
        [1.9430e+00, 2.9430e+00, 2.9430e+00,  ..., 9.2658e-01, 3.8100e-02,
         2.7700e+02],
        ...,
        [1.3717e+00, 2.3717e+00, 2.3717e+00,  ..., 1.5540e-01, 1.1963e-01,
         2.8500e+02],
        [1.9996e+00, 2.9996e+00, 2.9996e+00,  ..., 1.3988e-01, 6.2921e-01,
         2.8800e+02],
        [2.1921e+00, 3.1921e+00, 3.1921e+00,  ..., 8.8265e-01, 2.5042e-01,
         2.7500e+02]])
tensor([[1.9024e+00, 2.9024e+00, 2.9024e+00,  ..., 8.5861e-01, 3.2329e-01,
         2.7900e+02],
        [2.0052e+00, 3.0052e+00, 3.0052e+00,  ..., 4.4138e-01, 8.1721e-01,
         2.7900e+02],
        [1.9430e+00, 2.9430e+00, 2.9430e+00,  ..., 9.2658e-01, 3.8100e-02,
         2.7700e+02],
        ...,
        [1.3717e+00, 2.3717e+00, 2.3717e+00,  ..., 1.5540e-01, 1.1963e-01,
         2.8500e+02],
   

In [15]:
# Write the input tensors to disk with np.save, one .npy file per split
# (same order as before: train, test, validate).
in_tensors_by_split = [
    ('train', in_train_tensor),
    ('test', in_test_tensor),
    ('validate', in_validate_tensor),
]
for split_name, split_tensor in in_tensors_by_split:
    np.save('outputs/{}_{}_in_{}_tensor.npy'.format(exp, matrix_size, split_name), split_tensor)
print('done saving input tensor files')

done saving input tensor files


In [16]:
# Row count of the output array, checked again right before it is split.
len(np_outs)

1000

In [17]:
# Cut the outputs at the same split points as the inputs (80-10-10),
# in this order: train, validate, test.
out_train_np, out_validate_np, out_test_np = np.array_split(np_outs, splits)
out_chunk_sizes = (len(out_train_np), len(out_validate_np), len(out_test_np))
print('train: {}, val: {}, test: {}'.format(*out_chunk_sizes))

train: 800, val: 100, test: 100


In [18]:
%%time

# make the output tensors
out_train_tensor = torch.from_numpy(out_train_np)
print(out_train_tensor)
out_validate_tensor = torch.from_numpy(out_validate_np)
print(out_train_tensor)
out_test_tensor = torch.from_numpy(out_test_np)
print(out_test_tensor)

tensor([[2.4282e+03, 2.3866e+03, 1.2177e+02, 2.1933e+00, 7.8960e-01],
        [2.5635e+03, 2.5089e+03, 1.2374e+02, 2.1007e+00, 3.6280e-01],
        [2.6460e+03, 2.5843e+03, 1.2507e+02, 1.8857e+00, 3.0590e-01],
        ...,
        [2.4836e+03, 2.4405e+03, 1.2004e+02, 2.1476e+00, 6.9500e-01],
        [2.5095e+03, 2.4627e+03, 1.2229e+02, 2.1463e+00, 6.3120e-01],
        [2.5432e+03, 2.4918e+03, 1.2412e+02, 1.9426e+00, 5.6630e-01]])
tensor([[2.4282e+03, 2.3866e+03, 1.2177e+02, 2.1933e+00, 7.8960e-01],
        [2.5635e+03, 2.5089e+03, 1.2374e+02, 2.1007e+00, 3.6280e-01],
        [2.6460e+03, 2.5843e+03, 1.2507e+02, 1.8857e+00, 3.0590e-01],
        ...,
        [2.4836e+03, 2.4405e+03, 1.2004e+02, 2.1476e+00, 6.9500e-01],
        [2.5095e+03, 2.4627e+03, 1.2229e+02, 2.1463e+00, 6.3120e-01],
        [2.5432e+03, 2.4918e+03, 1.2412e+02, 1.9426e+00, 5.6630e-01]])
tensor([[2.6313e+03, 2.5769e+03, 1.2280e+02, 2.0934e+00, 3.3100e-01],
        [2.5720e+03, 2.5226e+03, 1.2054e+02, 2.1374e+00, 6.012

In [19]:
# Write the output tensors to disk with np.save, one .npy file per split
# (same order as before: train, test, validate).
out_tensors_by_split = [
    ('train', out_train_tensor),
    ('test', out_test_tensor),
    ('validate', out_validate_tensor),
]
for split_name, split_tensor in out_tensors_by_split:
    np.save('outputs/{}_{}_out_{}_tensor.npy'.format(exp, matrix_size, split_name), split_tensor)
print('done saving output tensor files')

done saving output tensor files


In [20]:
# Report shape, dtype and device of each input tensor.
# The lines are collected first and printed in one call; a "\n" entry reproduces the
# blank lines that separate the three groups.
in_report = [
    "\n",
    f"Shape of in_train_tensor: {in_train_tensor.shape}",
    f"Datatype of in_train_tensor: {in_train_tensor.dtype}",
    f"Device in_train_tensor is stored on: {in_train_tensor.device}",
    "\n",
    f"Shape of in_test_tensor : {in_test_tensor.shape}",
    f"Datatype of in_test_tensor: {in_test_tensor.dtype}",
    f"Device in_test_tensor is stored on: {in_test_tensor.device}",
    "\n",
    f"Shape of in_validate_tensor: {in_validate_tensor.shape}",
    f"Datatype of in_validate_tensor: {in_validate_tensor.dtype}",
    f"Device in_validate_tensor is stored on: {in_validate_tensor.device}",
]
print("\n".join(in_report))



Shape of in_train_tensor: torch.Size([800, 15])
Datatype of in_train_tensor: torch.float32
Device in_train_tensor is stored on: cpu


Shape of in_test_tensor : torch.Size([100, 15])
Datatype of in_test_tensor: torch.float32
Device in_test_tensor is stored on: cpu


Shape of in_validate_tensor: torch.Size([100, 15])
Datatype of in_validate_tensor: torch.float32
Device in_validate_tensor is stored on: cpu


In [21]:
# Report shape, dtype and device of each output tensor.
# One loop builds all three groups so every line names the tensor it describes; the
# hand-written version left the name out of the out_train_tensor device line and had
# a stray space before the colon in the out_test_tensor shape line.
out_tensors_by_name = [
    ("out_train_tensor", out_train_tensor),
    ("out_test_tensor", out_test_tensor),
    ("out_validate_tensor", out_validate_tensor),
]
for tensor_name, named_tensor in out_tensors_by_name:
    print("\n")
    print(f"Shape of {tensor_name}: {named_tensor.shape}")
    print(f"Datatype of {tensor_name}: {named_tensor.dtype}")
    print(f"Device {tensor_name} is stored on: {named_tensor.device}")



Shape of out_train_tensor: torch.Size([800, 5])
Datatype of out_train_tensor: torch.float32
Device tensor is stored on: cpu


Shape of out_test_tensor : torch.Size([100, 5])
Datatype of out_test_tensor: torch.float32
Device out_test_tensor is stored on: cpu


Shape of out_validate_tensor: torch.Size([100, 5])
Datatype of out_validate_tensor: torch.float32
Device out_validate_tensor is stored on: cpu


In [22]:
'''
Reloading a saved file from disk looks roughly like this (shown for reference only; it is not run here):

out_np = np.load('outputs/{}_{}_out_train_tensor.npy'.format(exp, matrix_size))
out_tensor = torch.from_numpy(out_np)
out_tensor
'''


"\nto reload them from disk kinda looks like this, but I'm not going to do that.\n\nout_np = np.load('outputs/1k_outputs.npy')\nout_tensor = torch.from_numpy(out_np)\nout_tensor\n"