In [2]:
!git clone https://github.com/google-deepmind/neural_networks_chomsky_hierarchy.git

fatal: destination path 'neural_networks_chomsky_hierarchy' already exists and is not an empty directory.


In [3]:
!pip install import-ipynb



In [4]:
%cd /content/drive/My Drive/BA Thesis/Code_notebooks
from neural_networks_chomsky_hierarchy.tasks import task

# loading all data generation functions
from neural_networks_chomsky_hierarchy.tasks.cs.binary_addition import BinaryAddition
from neural_networks_chomsky_hierarchy.tasks.cs.binary_multiplication import BinaryMultiplication
from neural_networks_chomsky_hierarchy.tasks.cs.bucket_sort import BucketSort
from neural_networks_chomsky_hierarchy.tasks.cs.compute_sqrt import ComputeSqrt
from neural_networks_chomsky_hierarchy.tasks.cs.duplicate_string import DuplicateString
from neural_networks_chomsky_hierarchy.tasks.cs.missing_duplicate_string import MissingDuplicateString
from neural_networks_chomsky_hierarchy.tasks.cs.odds_first import OddsFirst

from neural_networks_chomsky_hierarchy.tasks.dcf.modular_arithmetic_brackets import ModularArithmeticBrackets
from neural_networks_chomsky_hierarchy.tasks.dcf.reverse_string import ReverseString
from neural_networks_chomsky_hierarchy.tasks.dcf.solve_equation import SolveEquation
from neural_networks_chomsky_hierarchy.tasks.dcf.stack_manipulation import StackManipulation

from neural_networks_chomsky_hierarchy.tasks.regular.cycle_navigation import CycleNavigation
from neural_networks_chomsky_hierarchy.tasks.regular.even_pairs import EvenPairs
from neural_networks_chomsky_hierarchy.tasks.regular.modular_arithmetic import ModularArithmetic
from neural_networks_chomsky_hierarchy.tasks.regular.parity_check import ParityCheck

from IPython.display import display, clear_output
import numpy as np
import utils
import chex
import jax
import os
import sys
import shutil
import pickle as pkl
from tqdm.notebook import tqdm
from concurrent.futures import ProcessPoolExecutor

/content/drive/My Drive/BA Thesis/Code_notebooks


In [5]:
%cd /
%cd content/neural_networks_chomsky_hierarchy

/
/content/neural_networks_chomsky_hierarchy


In [6]:
pip install -r requirements.txt

Collecting git+https://github.com/deepmind/einshape (from -r requirements.txt (line 4))
  Cloning https://github.com/deepmind/einshape to /tmp/pip-req-build-wr37gog1
  Running command git clone --filter=blob:none --quiet https://github.com/deepmind/einshape /tmp/pip-req-build-wr37gog1
  Resolved https://github.com/deepmind/einshape to commit d91ab136da3d97f4ba2b4582531cdc2dfd9fab0a
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [7]:
pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

Looking in links: https://storage.googleapis.com/jax-releases/jax_cuda_releases.html


In [8]:
def generate_data(test_type: str, test_name: list, ttv: str, num_batches: int, gen_args: dict, del_prev: bool):
    """
    Generates the data according to the following arguments:
    test_type: type of test (cs, dcf, regular)
    test_name: list of classes according to which the sample batches are generated
    ttv: if it should be saved in training, validation or test file
    """
    # Ensure the base data directory exists
    data_dir = os.path.join(origin, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Set up the base directory for this run
    ttv_dir = os.path.join(data_dir, ttv)
    os.makedirs(ttv_dir, exist_ok=True)

    test_type_dir = os.path.join(ttv_dir, test_type)
    os.makedirs(test_type_dir, exist_ok=True)

    total_storage_used = 0

    for task_class in test_name:
        generator_class = task_class()
        class_dir = os.path.join(test_type_dir, task_class.__name__)

        # Clear and recreate the directory if necessary
        if os.path.exists(class_dir) and del_prev:
            shutil.rmtree(class_dir)
        os.makedirs(class_dir, exist_ok=True)

        for i in range(num_batches):
            batch = generator_class.sample_batch(**gen_args)
            batch["input"] = np.array(batch["input"])
            batch["output"] = np.array(batch["output"])

            # Apply necessary padding
            if task_class.__name__ in require_padding:
                batch["input"], batch["output"] = utils.pad(batch["input"], batch["output"])

            # pad the y of StackManipulation such that the model can be trained
            if task_class.__name__ == "StackManipulation":
                p = np.zeros((len(batch["output"][0]), 2), dtype=np.float32)
                zeros_expanded = np.expand_dims(p, axis=0)
                zeros_tiled = np.tile(zeros_expanded, (gen_args["batch_size"], 1, 1))
                batch["output"] = np.concatenate((batch["output"], zeros_tiled), axis=2)

            # Create batch name
            name = f"{task_class.__name__}_{ttv}_{i}_{gen_args['length']}"

            # Track storage usage
            storage_used = sys.getsizeof(batch["input"]) + sys.getsizeof(batch["output"])
            total_storage_used += storage_used

            with open(os.path.join(class_dir, f"{name}.pkl"), "wb") as file:
                pkl.dump(batch, file)

    os.chdir(origin)
    return total_storage_used


In [9]:
# some useful lists to loop through
cs_names = [BinaryAddition, BinaryMultiplication, BucketSort, ComputeSqrt, DuplicateString, MissingDuplicateString, OddsFirst]
dcf_names = [ModularArithmeticBrackets, ReverseString, SolveEquation, StackManipulation]
regular_names = [CycleNavigation, EvenPairs, ModularArithmetic, ParityCheck]
require_padding = ["BinaryAddition", "StackManipulation", "ComputeSqrt", "DuplicateString"]

In [10]:
origin = "/content/drive/MyDrive/BA Thesis"
os.chdir(origin)
print(os.getcwd())

/content/drive/MyDrive/BA Thesis


In [10]:
# Generate Full Training Dataset
# WILL TAKE VERY LONG!!!


# making sure samples of various lengths are equally distributed
max_length_train = 40
min_length_train = 1
train_length_interval = max_length_train - min_length_train
current_length = min_length_train

rng_seed = 42

batches_per_problem = 100
train_batch_size = 32

# to first clear all data to have clean start
del_prev = True

# track used storage
total_storage_used = 0

for i in tqdm(range(min_length_train, max_length_train + 1)):

  # generating the data
  training_args = {"rng": jax.random.PRNGKey(rng_seed), "batch_size": train_batch_size, "length": current_length}
  storage_used_cs = generate_data("cs", cs_names, "training", batches_per_problem, training_args, del_prev)
  storage_used_dcf = generate_data("dcf", dcf_names, "training", batches_per_problem, training_args, del_prev)
  storage_used_regular = generate_data("regular", regular_names, "training", batches_per_problem, training_args, del_prev)
  total_storage_used += storage_used_cs + storage_used_dcf + storage_used_regular

  # so we dont delete the data we actually want to preserve
  del_prev = False

  # print storage used to keep track of data mass
  print("Storage used: ")
  print("total: ", utils.readable(total_storage_used))
  print("Data of length ", current_length, " created successfully!")

  # increasing the length of the data samples to create variety in the data
  current_length += 1



  0%|          | 0/40 [00:00<?, ?it/s]

Storage used: 
total:  2.06 MB
Data of length  1  created successfully!
Storage used: 
total:  5.16 MB
Data of length  2  created successfully!
Storage used: 
total:  9.52 MB
Data of length  3  created successfully!
Storage used: 
total:  14.91 MB
Data of length  4  created successfully!
Storage used: 
total:  21.56 MB
Data of length  5  created successfully!
Storage used: 
total:  29.25 MB
Data of length  6  created successfully!
Storage used: 
total:  38.20 MB
Data of length  7  created successfully!
Storage used: 
total:  48.18 MB
Data of length  8  created successfully!
Storage used: 
total:  59.42 MB
Data of length  9  created successfully!
Storage used: 
total:  71.70 MB
Data of length  10  created successfully!
Storage used: 
total:  85.24 MB
Data of length  11  created successfully!
Storage used: 
total:  99.81 MB
Data of length  12  created successfully!
Storage used: 
total:  115.64 MB
Data of length  13  created successfully!
Storage used: 
total:  132.51 MB
Data of length  

In [12]:
# creating our validation Data
# WILL TAKE VERY LONG!!!



# making sure samples of various lengths are equally distributed
max_length_train = 40
min_length_train = 1
train_length_interval = max_length_train - min_length_train
current_length = min_length_train

rng_seed = 42

batches_per_problem = 10
train_batch_size = 32

# to first clear all data to have clean start
del_prev = True

# track used storage
total_storage_used = 0

for i in tqdm(range(min_length_train, max_length_train + 1)):

  # generating the data
  training_args = {"rng": jax.random.PRNGKey(rng_seed), "batch_size": train_batch_size, "length": current_length}
  storage_used_cs = generate_data("cs", cs_names, "validation", batches_per_problem, training_args, del_prev)
  storage_used_dcf = generate_data("dcf", dcf_names, "validation", batches_per_problem, training_args, del_prev)
  storage_used_regular = generate_data("regular", regular_names, "validation", batches_per_problem, training_args, del_prev)
  total_storage_used += storage_used_cs + storage_used_dcf + storage_used_regular

  # so we dont delete the data we actually want to preserve
  del_prev = False

  # print storage used to keep track of data mass
  print("Storage used: ")
  print("total: ", utils.readable(total_storage_used))
  print("Data of length ", current_length, " created successfully!")

  # increasing the length of the data samples to create variety in the data
  current_length += 1


  0%|          | 0/40 [00:00<?, ?it/s]

Storage used: 
total:  211.09 KB
Data of length  1  created successfully!
Storage used: 
total:  528.44 KB
Data of length  2  created successfully!
Storage used: 
total:  974.53 KB
Data of length  3  created successfully!
Storage used: 
total:  1.49 MB
Data of length  4  created successfully!
Storage used: 
total:  2.16 MB
Data of length  5  created successfully!
Storage used: 
total:  2.93 MB
Data of length  6  created successfully!
Storage used: 
total:  3.82 MB
Data of length  7  created successfully!
Storage used: 
total:  4.82 MB
Data of length  8  created successfully!
Storage used: 
total:  5.94 MB
Data of length  9  created successfully!
Storage used: 
total:  7.17 MB
Data of length  10  created successfully!
Storage used: 
total:  8.52 MB
Data of length  11  created successfully!
Storage used: 
total:  9.98 MB
Data of length  12  created successfully!
Storage used: 
total:  11.56 MB
Data of length  13  created successfully!
Storage used: 
total:  13.25 MB
Data of length  14  c

In [11]:
# creating our test Data
# WILL TAKE VERY LONG!!!



# making sure samples of various lengths are equally distributed
max_length_train = 500
min_length_train = 476
train_length_interval = max_length_train - min_length_train
current_length = min_length_train

rng_seed = 42

batches_per_problem = 10
train_batch_size = 32

# to first clear all data to have clean start
del_prev = False

# track used storage
total_storage_used = 0

for i in tqdm(range(min_length_train, max_length_train + 1)):

  # generating the data
  training_args = {"rng": jax.random.PRNGKey(rng_seed), "batch_size": train_batch_size, "length": current_length}
  storage_used_cs = generate_data("cs", cs_names, "testing", batches_per_problem, training_args, del_prev)
  storage_used_dcf = generate_data("dcf", dcf_names, "testing", batches_per_problem, training_args, del_prev)
  storage_used_regular = generate_data("regular", regular_names, "testing", batches_per_problem, training_args, del_prev)
  total_storage_used += storage_used_cs + storage_used_dcf + storage_used_regular

  # so we dont delete the data we actually want to preserve
  del_prev = False

  # print storage used to keep track of data mass
  print("Storage used: ")
  print("total: ", utils.readable(total_storage_used))
  print("Data of length ", current_length, " created successfully!")

  # increasing the length of the data samples to create variety in the data
  current_length += 1

  0%|          | 0/25 [00:00<?, ?it/s]

Storage used: 
total:  54.70 MB
Data of length  476  created successfully!
Storage used: 
total:  109.52 MB
Data of length  477  created successfully!
Storage used: 
total:  164.45 MB
Data of length  478  created successfully!
Storage used: 
total:  219.51 MB
Data of length  479  created successfully!
Storage used: 
total:  274.67 MB
Data of length  480  created successfully!
Storage used: 
total:  329.95 MB
Data of length  481  created successfully!
Storage used: 
total:  385.34 MB
Data of length  482  created successfully!
Storage used: 
total:  440.85 MB
Data of length  483  created successfully!
Storage used: 
total:  496.47 MB
Data of length  484  created successfully!
Storage used: 
total:  552.21 MB
Data of length  485  created successfully!
Storage used: 
total:  608.06 MB
Data of length  486  created successfully!
Storage used: 
total:  664.03 MB
Data of length  487  created successfully!
Storage used: 
total:  720.11 MB
Data of length  488  created successfully!
Storage used: