In [1]:
# !pip install "ray[tune]" optuna

In [2]:
# pip install -U ipywidgets

In [3]:
# !/usr/bin/python3 -m pip install virtualenv

In [4]:
# !apt-get install libcudnn8=8.6.* 

In [5]:
import json
import os
import tensorflow as tf
import gc
# Project-local helpers: final-model trainer and hyperparameter tuner
from lstm_trainer import SegmentationTrainer
from lstm_tuner import LSTMTuner


2025-01-02 23:53:19.388272: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# up front. Memory growth can only be configured before the GPUs have been
# initialized; re-running this cell in a live kernel would otherwise raise
# RuntimeError, so report it instead of aborting the notebook.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(f"Could not enable GPU memory growth: {err}")

In [7]:
# Function to run a complete experiment
def run_experiment(
    config_name,
    data_length_tune=40000,
    data_length_train=200000,
    data_path="/home/docker/data/work_projects/Segmentation-Models/data/valid_linearizations_4.csv",
    save_path="models_200_finals_100",
    num_samples=10,
    tuning_epochs=5,
):
    """Tune hyperparameters for one model, then train it with the best config.

    The base config is written to ``<cwd>/<save_path>/<config_name>/config.json``,
    passed to ``LSTMTuner`` for a hyperparameter search, merged with the best
    trial via ``tuner.apply_best_config``, written back to the same file and
    finally handed to ``SegmentationTrainer`` for training.

    Args:
        config_name: Model name; also used as the sub-directory name under
            ``save_path``.
        data_length_tune: Value stored as ``config["data_length"]`` during the
            hyperparameter search.
        data_length_train: Value stored as ``config["data_length"]`` for the
            final training run.
        data_path: Path of the CSV data file stored in the config.
        save_path: Output directory, resolved relative to the current working
            directory.
        num_samples: ``num_samples`` forwarded to
            ``tuner.run_hyperparameter_search``.
        tuning_epochs: ``num_epochs`` forwarded to
            ``tuner.run_hyperparameter_search``.

    Returns:
        The ``(model, data_processor)`` pair returned by ``trainer.train()``.

    Raises:
        Exception: Any error raised while saving the config, tuning or
            training is printed and then re-raised unchanged.
    """
    config = {
        "model_name": config_name,
        "data_path": data_path,
        "save_path": save_path,
        "data_length": data_length_tune,
        "label_smoothing": 0.1,
        "clip_norm": 1.0,
        "latent_dim": 256,
        "batch_size": 64,
        "num_epochs": 60,
        "train_ratio": 0.8,
        "validation_ratio": 0.1,
        "test_ratio": 0.1,
        "random_seed": 20,
        "initial_lr": 0.00001,
        "decay_steps": 10000,
        "decay_rate": 0.9,
        "dropout_rate": 0.1,
        "patience": 5,
        "validation_split": 0.1
    }

    # Setup directories
    save_dir = os.path.join(os.getcwd(), config["save_path"], config["model_name"])
    os.makedirs(save_dir, exist_ok=True)
    config_path = os.path.join(save_dir, "config.json")
    print(f"\nStarting experiment for {config_name}")
    print(f"Saving config to: {config_path}")

    try:
        # Save initial config
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        # Run hyperparameter search
        print(f"Running hyperparameter search for {config_name}")
        tuner = LSTMTuner(config)
        # The second return value (the search analysis) is not used here.
        best_config, _analysis = tuner.run_hyperparameter_search(
            num_samples=num_samples,
            num_epochs=tuning_epochs
        )

        # Update config with best parameters
        print(f"Applying best config and starting training for {config_name}")
        final_config = tuner.apply_best_config(best_config)
        # Final training uses the (larger) training cap rather than the tuning cap.
        final_config["data_length"] = data_length_train
        # Pin the batch size back to the base value regardless of what the
        # tuned config carries.
        final_config["batch_size"] = config["batch_size"]
        with open(config_path, 'w') as f:
            json.dump(final_config, f, indent=2)

        # Train final model; the trainer reads its settings from the saved file.
        trainer = SegmentationTrainer(config_path)
        model, data_processor = trainer.train()

        print(f"Completed experiment for {config_name}")
        return model, data_processor

    except Exception as e:
        print(f"Error in experiment {config_name}: {e}")
        raise
    finally:
        # Cleanup runs on both success and failure, so no separate cleanup is
        # needed on the success path.
        tf.keras.backend.clear_session()
        gc.collect()

# Run experiments sequentially
experiments = ["segmenter_one", "segmenter_two", "segmenter_three"]

# After the loop these names hold the results of the last experiment only.
model = data_processor = None

for exp_name in experiments:
    # Drop the references to the previous experiment's objects *before*
    # cleaning up; while they are still bound, clear_session()/gc.collect()
    # cannot reclaim them and they would stay alive for the whole next run.
    model = data_processor = None
    tf.keras.backend.clear_session()
    gc.collect()

    print(f"\n{'='*50}")
    print(f"Starting experiment: {exp_name}")
    print(f"{'='*50}")

    # run_experiment performs its own cleanup in a finally block.
    model, data_processor = run_experiment(exp_name)

    print(f"\nCompleted experiment: {exp_name}")
    print(f"{'='*50}\n")


Starting experiment: segmenter_one

Starting experiment for segmenter_one
Saving config to: /home/docker/data/work_projects/Segmentation-Models/lstm_model/models_200_finals_100/segmenter_one/config.json
Running hyperparameter search for segmenter_one


2025-01-02 23:53:25,881	INFO worker.py:1752 -- Started a local Ray instance.
2025-01-02 23:53:26,800	INFO packaging.py:530 -- Creating a file package for local directory '.'.
2025-01-02 23:53:28,563	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_20acd1d6b8826316.zip' (359.57MiB) to Ray cluster...
2025-01-02 23:53:30,223	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_20acd1d6b8826316.zip'.
2025-01-02 23:53:31,029	INFO tune.py:613 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
[I 2025-01-02 23:53:31,044] A new study created in memory with name: optuna


0,1
Current time:,2025-01-03 00:26:07
Running for:,00:32:34.56
Memory:,55.2/125.7 GiB

Trial name,status,loc,clip_norm,decay_rate,dropout_rate,initial_lr,label_smoothing,latent_dim,iter,total time (s),loss,val_loss,accuracy
_training_function_af5e871b,TERMINATED,172.17.0.2:23352,0.778893,0.842704,0.280409,6.64147e-05,0.0834157,1024,1,208.012,1.21456,1.1915,0.766591
_training_function_02f0a411,TERMINATED,172.17.0.2:23785,1.75058,0.848519,0.111023,0.00012672,0.0437269,512,1,202.83,0.961127,0.935547,0.786269
_training_function_8bd5d826,TERMINATED,172.17.0.2:24197,1.61315,0.814906,0.299426,1.27563e-05,0.0950836,512,1,190.817,1.41473,1.41812,0.708222
_training_function_aa464038,TERMINATED,172.17.0.2:24607,1.0588,0.942843,0.140675,2.39491e-05,0.082415,256,1,189.752,1.3579,1.34503,0.712234
_training_function_31bc40be,TERMINATED,172.17.0.2:25016,0.989628,0.97643,0.220166,0.000425512,0.0610973,128,1,175.851,0.920938,0.892096,0.825157
_training_function_35e6e726,TERMINATED,172.17.0.2:25424,0.66116,0.973698,0.124787,6.63273e-05,0.104765,512,1,192.802,1.3465,1.34147,0.743638
_training_function_5e50c16b,TERMINATED,172.17.0.2:25834,1.8712,0.815729,0.119823,0.000694709,0.180517,256,1,196.617,1.38521,1.37069,0.850875
_training_function_ebe0b694,TERMINATED,172.17.0.2:26244,1.09507,0.970566,0.150576,1.69669e-05,0.104985,128,1,175.796,1.51653,1.50058,0.695424
_training_function_13614e9a,TERMINATED,172.17.0.2:26652,1.94269,0.812057,0.219448,0.000184451,0.0115617,128,1,177.813,0.833273,0.789178,0.785678
_training_function_5b32e0b1,TERMINATED,172.17.0.2:27060,1.53047,0.873256,0.183687,2.91957e-05,0.0952002,256,1,184.654,1.40373,1.39285,0.712498




[36m(_training_function pid=23352)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=23352)[0m Cut-off dataset shape: (40000, 4)




Trial name,accuracy,loss,val_accuracy,val_loss
_training_function_02f0a411,0.786269,0.961127,0.79824,0.935547
_training_function_13614e9a,0.785678,0.833273,0.801685,0.789178
_training_function_31bc40be,0.825157,0.920938,0.833579,0.892096
_training_function_35e6e726,0.743638,1.3465,0.750629,1.34147
_training_function_5b32e0b1,0.712498,1.40373,0.717083,1.39285
_training_function_5e50c16b,0.850875,1.38521,0.856808,1.37069
_training_function_8bd5d826,0.708222,1.41473,0.705804,1.41812
_training_function_aa464038,0.712234,1.3579,0.717418,1.34503
_training_function_af5e871b,0.766591,1.21456,0.775484,1.1915
_training_function_ebe0b694,0.695424,1.51653,0.690737,1.50058




[36m(_training_function pid=23785)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=23785)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=24197)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=24197)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=24607)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=24607)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=25016)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=25016)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=25424)[0m Cut-off dataset shape: (40000, 4)
[36m(_training_function pid=25424)[0m Original dataset shape: (367178, 4)




[36m(_training_function pid=25834)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=25834)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=26244)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=26244)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=26652)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=26652)[0m Cut-off dataset shape: (40000, 4)




[36m(_training_function pid=27060)[0m Original dataset shape: (367178, 4)
[36m(_training_function pid=27060)[0m Cut-off dataset shape: (40000, 4)


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2025-01-03 00:26:07,556	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/home/docker/data/work_projects/Segmentation-Models/lstm_model/models_200_finals_100/segmenter_one/ray_results/lstm_tune' in 0.0129s.
2025-01-03 00:26:07,567	INFO tune.py:1048 -- Total run time: 1956.54 seconds (1954.54 seconds for the tuning loop).



Best trial config: {'model_name': 'segmenter_one', 'data_path': '/home/docker/data/work_projects/Segmentation-Models/data/valid_linearizations_4.csv', 'save_path': 'models_200_finals_100', 'data_length': 40000, 'label_smoothing': 0.011561664905338498, 'clip_norm': 1.9426870713984559, 'latent_dim': 128, 'batch_size': 64, 'num_epochs': 60, 'train_ratio': 0.8, 'validation_ratio': 0.1, 'test_ratio': 0.1, 'random_seed': 20, 'initial_lr': 0.0001844505568740933, 'decay_steps': 10000, 'decay_rate': 0.8120566806916044, 'dropout_rate': 0.2194481687960585, 'patience': 5, 'validation_split': 0.1, 'tuning_epochs': 5}
Best trial final validation loss: 0.7892
Best trial final validation accuracy: 0.8017
Applying best config and starting training for segmenter_one

Training Configuration:

Dataset:
- Data path: /home/docker/data/work_projects/Segmentation-Models/data/valid_linearizations_4.csv
- Dataset size: 200000

Model:
- Latent dim: 128
- Label smoothing: 0.011561664905338498
- Gradient clip nor

2025-01-03 00:26:18.755295: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46715 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:e1:00.0, compute capability: 8.6


Epoch 1/60
   2/2250 [..............................] - ETA: 2:49 - loss: 3.3032 - accuracy: 0.0389   

2025-01-03 00:26:26.267004: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 49/60

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 20/60

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 47/60

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60