In [1]:
from datetime import datetime
import json
import os
import shutil
from crf_segmenter import MorphologicalSegmentation

def run_experiments(experiments=None, config_path=None):
    """Run a parameter search followed by training for each experiment.

    For every experiment name the shared JSON configuration is loaded
    afresh, a ``MorphologicalSegmentation`` is built from it, and then
    ``find_best_parameters()`` and ``train_with_parameters()`` are called
    in that order. Experiments whose configuration cannot be loaded are
    reported on stdout and skipped; the remaining experiments still run.

    Args:
        experiments: Iterable of experiment names. Defaults to
            ``("segmenter_one", "segmenter_two", "segmenter_three")``.
        config_path: Path of the JSON configuration file. Defaults to
            ``generic_config_file.json`` in the current working directory.

    Returns:
        None. Progress is reported via ``print``.
    """
    if experiments is None:
        experiments = ("segmenter_one", "segmenter_two", "segmenter_three")
    if config_path is None:
        config_path = os.path.join(os.getcwd(), "generic_config_file.json")

    banner = "=" * 50

    for exp_name in experiments:
        print(f"\n{banner}")
        print(f"Running experiment: {exp_name}")
        print(f"{banner}\n")

        # Reload per experiment so every segmenter receives its own dict,
        # untouched by whatever a previous experiment did to its config.
        loaded_config = _load_experiment_config(config_path, exp_name)
        if loaded_config is None:
            continue

        # A 'model_name' already present in the config file takes precedence
        # over the experiment name.
        # NOTE(review): with one shared config file that would give every
        # experiment the same model name -- confirm this is intended.
        loaded_config.setdefault('model_name', exp_name)

        segmenter = MorphologicalSegmentation(loaded_config)

        print("\nFinding best parameters...")
        best_params = segmenter.find_best_parameters()
        print(f"Best parameters found for {exp_name}: {best_params}")

        print("\nTraining with updated parameters...")
        segmenter.train_with_parameters(best_params)
        print(f"\n{banner}")
        print(f"Completed running experiment: {exp_name}")
        print(f"{banner}\n")


def _load_experiment_config(config_path, exp_name):
    """Load the JSON configuration used for one experiment.

    Args:
        config_path: Path of the JSON file to read.
        exp_name: Experiment name, used only in the printed messages.

    Returns:
        The parsed configuration dict, or None (after printing the reason)
        when the file is missing, is not valid JSON, or does not contain a
        JSON object at the top level.
    """
    print(f"Attempting to load from: {config_path}")

    try:
        # Explicit encoding: JSON is UTF-8 regardless of the platform locale.
        with open(config_path, "r", encoding="utf-8") as f:
            loaded_config = json.load(f)
    except FileNotFoundError:
        print(f"Error: Configuration file not found for {exp_name} at {config_path}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON for {exp_name}. Details: {e}")
        return None

    # The caller treats the config as a mapping; reject lists/scalars here
    # instead of crashing later with an AttributeError.
    if not isinstance(loaded_config, dict):
        print(
            f"Error: Configuration for {exp_name} must be a JSON object, "
            f"got {type(loaded_config).__name__}"
        )
        return None

    print(f"Successfully loaded configuration for {exp_name}:")
    return loaded_config

# Entry point: run every experiment only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    run_experiments()


Running experiment: segmenter_one

Attempting to load from: /workspace/segmentation-models/crf_model/generic_config_file.json
Successfully loaded configuration for segmenter_one:
Configuration saved to /workspace/segmentation-models/crf_model/models_finals/segmenter_one/config.json
Initialized Morphological Segmentation with model name: segmenter_one

Finding best parameters...
Starting grid search for model: segmenter_one
Configuration saved to /workspace/segmentation-models/crf_model/models_finals/segmenter_one/config.json
Initialized Morphological Segmentation with model name: segmenter_one
Loading data from ../data/valid_linearizations_4.csv
Original dataset shape: (367178, 4)
Available columns: ['tokens', 'segmenter_one', 'segmenter_two', 'segmenter_three']

Using columns for segmenter_one:
  tokens segmenter_one
0    aba          ab-a
1  ababa       a-bab-a

Cut-off dataset shape: (40000, 2)

Data split statistics:
  Training samples: 31999
  Development samples: 4001
  Test sam

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Model saved to /workspace/segmentation-models/crf_model/models_finals/segmenter_one/segmenter_one.pkl

Completed running experiment: segmenter_one


Running experiment: segmenter_two

Attempting to load from: /workspace/segmentation-models/crf_model/generic_config_file.json
Successfully loaded configuration for segmenter_two:
Configuration saved to /workspace/segmentation-models/crf_model/models_finals/segmenter_two/config.json
Initialized Morphological Segmentation with model name: segmenter_two

Finding best parameters...
Starting grid search for model: segmenter_two
Configuration saved to /workspace/segmentation-models/crf_model/models_finals/segmenter_two/config.json
Initialized Morphological Segmentation with model name: segmenter_two
Loading data from ../data/valid_linearizations_4.csv
Original dataset shape: (367178, 4)
Available columns: ['tokens', 'segmenter_one', 'segmenter_two', 'segmenter_three']

Using columns for segmenter_two:
  tokens segmenter_two
0    aba          ab-