In [3]:
import torch
import os
import json
from seq2seq_train import Seq2SeqTrainer

file_path = os.path.join("models_final_200/segmenter_three", "model_config.json")
print(f"Attempting to load from: {file_path}")

if not os.path.exists(file_path):
    print("File does not exist!")
else:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            loaded_config = json.load(f)
            print("Config loaded successfully:", loaded_config)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Reinitialize trainer with new dataset size but keeping optimized parameters
trainer = Seq2SeqTrainer(loaded_config)


Attempting to load from: models_final_200/segmenter_three/model_config.json
Config loaded successfully: {'datasource': '../segmentation-models/data/valid_linearizations_4.csv', 'batch_size': 64, 'num_epochs': 60, 'lr': 0.00031050988283798424, 'plateau_patience': 10, 'worsen_patience': 5, 'min_delta': 0.0005, 'verbose': True, 'data_length': 200000, 'seq_len': 50, 'd_model': 128, 'num_layers': 4, 'num_heads': 8, 'd_ff': 512, 'dropout': 0.21404304809404834, 'max_grad_norm': 1.1190287094689317, 'label_smoothing': 0.0008008147200706972, 'vocab_size': 50, 'tokenize_custom': {'tokens': True, 'segmenter_three': True}, 'tokenize_method': 'character', 'file_path': 'models_final_200/segmenter_three', 'model_folder': 'weights', 'lang_src': 'tokens', 'lang_tgt': 'segmenter_three', 'model_basename': 'model_', 'preload': 'latest', 'tokenizer_file': 'tokenizers_{0}.json', 'experiment_name': 'models_final_200/segmenter_three/tensor_data', 'random_seed': 20, 'saved_timestamp': '2025-02-13T00:34:57.31695

  state = torch.load(model_filename)


Loaded best model from epoch 48
Model configuration:
- d_model: 128
- num_layers: 4
- num_heads: 8
- d_ff: 512
- dropout: 0.21404304809404834
- label_smoothing: 0.0008008147200706972
- max_grad_norm: 1.1190287094689317
- lr: 0.00031050988283798424


In [4]:
trainer.segment_sentence("yezimpawu")

'ye-zimpawu'

### to segment data a csv file

In [None]:
from seq2seq_train import Seq2SeqTrainer
import pandas as pd
def batch_segment_with_models(sentences_df, config_paths, text_column='zulu', save_interval=2000):
    """
    Segment multiple sentences using multiple models, with periodic saves to distinct files
    """
    results = {f'segmenter_{i}': [] for i in range(1, len(config_paths) + 1)}
    skipped_words = {f'segmenter_{i}': [] for i in range(1, len(config_paths) + 1)}
    # sentences = sentences_df[text_column].tolist() 
    sentences = sentences_df
    for i, config_path in enumerate(config_paths, 1):
        print(f"\nProcessing with Segmenter {i}")
        with open(config_path, "r") as f:
            config = json.load(f)

    
        trainer = Seq2SeqTrainer(config)
        
        # Initialize model-specific results file
        model_filename = f'diversified_data_{config["lang_tgt"]}.csv'
        columns = ['original', config["lang_tgt"]]
        pd.DataFrame(columns=columns).to_csv(model_filename, index=False)

        for j, sentence in enumerate(sentences, 1):
            words = sentence.strip().split()
            segmented_words = []
            current_skipped = []
            for word in words:
                # Check token length before processing
                token_length = len(trainer.tokenizer_src.encode(word).ids) + 2

                if token_length > 50:
                    print(f"\nSkipping word '{word}' (length: {token_length})")
                    current_skipped.append({
                        'word': word,
                        'length': token_length,
                        'sentence_idx': j-1,
                        'sentence': sentence
                    })
                    segmented_words.append(f"[SKIPPED:{word}]")
                else:
                    segmented = trainer.translate(word).strip()
                    segmented_words.append(segmented)
            
            result = " ".join(segmented_words)
            results[f'segmenter_{i}'].append(result)
            skipped_words[f'segmenter_{i}'].extend(current_skipped)
            
            print(f"Processing sentence {j}/{len(sentences)} with {config['lang_tgt']}", end='\r')
            
            # Save progress periodically
            if j % save_interval == 0 or j == len(sentences):
                print(f"\nSaving progress at sentence {j} for {config['lang_tgt']}...")
                
                current_df = pd.DataFrame({
                    'original': sentences[:j],
                    config["lang_tgt"]: results[f'segmenter_{i}'][:j]  
                })
                
                # Save updated results
                current_df.to_csv(model_filename, index=False)
                
                # Save skipped words
                if current_skipped:
                    pd.DataFrame(skipped_words[f'segmenter_{i}']).to_csv(
                        f'skipped_words_{config["lang_tgt"]}.csv', index=False
                    )
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    final_results = {'isizulu': sentences}
    
    for i, config_path in enumerate(config_paths, 1):
        with open(config_path, "r") as f:
            config = json.load(f)
        final_results[config["lang_tgt"]] = results[f'segmenter_{i}']
    
    final_df = pd.DataFrame(final_results)
    final_df.to_csv('diversified_data.csv', index=False)
    
    return final_df, skipped_words


with open("random_sample_500k.txt", "r") as file:
    sentences = file.readlines()

config_paths = [
    "models_final_200/segmenter_one/model_config.json",
    "models_final_200/segmenter_two/model_config.json",
    "models_final_200/segmenter_three/model_config.json"
]

results_df, skipped_words = batch_segment_with_models(
    sentences, 
    config_paths, 
    text_column='zulu',
    save_interval=10000
)


Processing with Segmenter 1
Using device: cuda
Device name: NVIDIA A100 80GB PCIe
Device memory: 79.253662109375 GB
Original shape: (367178, 4)
Final shape: (200000, 4)
The config file has been saved on models_final_200/segmenter_one
Tokenizer path: models_final_200/segmenter_one/tokenizers/tokenizers_tokens.json
Loading existing tokenizer from models_final_200/segmenter_one/tokenizers/tokenizers_tokens.json
Tokenizer path: models_final_200/segmenter_one/tokenizers/tokenizers_segmenter_one.json
Loading existing tokenizer from models_final_200/segmenter_one/tokenizers/tokenizers_segmenter_one.json
the dataset length: 200000
Max length of source sentence: 30
Max length of target sentence: 41
Loaded best model from epoch 20
Model configuration:
- d_model: 128
- num_layers: 3
- num_heads: 16
- d_ff: 1024
- dropout: 0.18953066758095358
- label_smoothing: 0.000949605759638339
- max_grad_norm: 0.5246939924778026
- lr: 0.0004800819901770108


  state = torch.load(model_filename)


Processing sentence 10000/500000 with segmenter_one
Saving progress at sentence 10000 for segmenter_one...
Processing sentence 18990/500000 with segmenter_one

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Processing sentence 62462/500000 with segmenter_one

In [61]:
import json
import pandas as pd
from seq2seq_train import Seq2SeqTrainer

def segment_words(words, config_paths):
    """
    Segments a list of words using multiple segmentation models.
    Returns a dictionary with segmented outputs.
    """
    segmented_results = {f'segmenter_{i}': [] for i in range(1, len(config_paths) + 1)}

    for i, config_path in enumerate(config_paths, 1):
        print(f"\nProcessing with Segmenter {i}")

        with open(config_path, "r") as f:
            config = json.load(f)
    
        trainer = Seq2SeqTrainer(config)
        
        for word in words:
            segmented = trainer.translate(word).strip()
            segmented_results[f'segmenter_{i}'].append(segmented)

    return segmented_results

# Load dataset
df = pd.read_csv("1.csv", delimiter=";")

# Get all unique words from word_a and word_b
unique_words = list(set(df["word_a"].tolist() + df["word_b"].tolist()))

# Define paths to your segmentation models
config_paths = [
    "models_final_200/segmenter_one/model_config.json",
    "models_final_200/segmenter_two/model_config.json",
    "models_final_200/segmenter_three/model_config.json"
]

# Segment all words
segmented_words = segment_words(unique_words, config_paths)

# Convert results into a DataFrame
segmented_df = pd.DataFrame(segmented_words)
segmented_df["original_word"] = unique_words

# Merge with original dataset
df = df.merge(segmented_df, left_on="word_a", right_on="original_word", how="left")
df.rename(columns={"segmenter_1": "word_a_segmented_1", "segmenter_2": "word_a_segmented_2", "segmenter_3": "word_a_segmented_3"}, inplace=True)
df.drop(columns=["original_word"], inplace=True)

df = df.merge(segmented_df, left_on="word_b", right_on="original_word", how="left")
df.rename(columns={"segmenter_1": "word_b_segmented_1", "segmenter_2": "word_b_segmented_2", "segmenter_3": "word_b_segmented_3"}, inplace=True)
df.drop(columns=["original_word"], inplace=True)
df.drop(columns='score', inplace=True)

# Save segmented results
df.to_csv("segmented_words_pairs.csv", index=False)
print("Segmentation complete. Results saved to segmented_words.csv")


Processing with Segmenter 1
Using device: cuda
Device name: NVIDIA A100 80GB PCIe
Device memory: 79.253662109375 GB
Original shape: (367178, 4)
Final shape: (200000, 4)
The config file has been saved on models_final_200/segmenter_one
Tokenizer path: models_final_200/segmenter_one/tokenizers/tokenizers_tokens.json
Loading existing tokenizer from models_final_200/segmenter_one/tokenizers/tokenizers_tokens.json
Tokenizer path: models_final_200/segmenter_one/tokenizers/tokenizers_segmenter_one.json
Loading existing tokenizer from models_final_200/segmenter_one/tokenizers/tokenizers_segmenter_one.json
the dataset length: 200000
Max length of source sentence: 30
Max length of target sentence: 41
Loaded best model from epoch 20
Model configuration:
- d_model: 128
- num_layers: 3
- num_heads: 16
- d_ff: 1024
- dropout: 0.18953066758095358
- label_smoothing: 0.000949605759638339
- max_grad_norm: 0.5246939924778026
- lr: 0.0004800819901770108


  state = torch.load(model_filename)



Processing with Segmenter 2
Using device: cuda
Device name: NVIDIA A100 80GB PCIe
Device memory: 79.253662109375 GB
Original shape: (367178, 4)
Final shape: (200000, 4)
The config file has been saved on models_final_200/segmenter_two
Tokenizer path: models_final_200/segmenter_two/tokenizers/tokenizers_tokens.json
Loading existing tokenizer from models_final_200/segmenter_two/tokenizers/tokenizers_tokens.json
Tokenizer path: models_final_200/segmenter_two/tokenizers/tokenizers_segmenter_two.json
Loading existing tokenizer from models_final_200/segmenter_two/tokenizers/tokenizers_segmenter_two.json
the dataset length: 200000
Max length of source sentence: 30
Max length of target sentence: 39


  state = torch.load(model_filename)


Loaded best model from epoch 6
Model configuration:
- d_model: 256
- num_layers: 6
- num_heads: 16
- d_ff: 1024
- dropout: 0.27119126273901606
- label_smoothing: 0.014005492657761388
- max_grad_norm: 1.1819095309366932
- lr: 0.0004197711726342262

Processing with Segmenter 3
Using device: cuda
Device name: NVIDIA A100 80GB PCIe
Device memory: 79.253662109375 GB
Original shape: (367178, 4)
Final shape: (200000, 4)
The config file has been saved on models_final_200/segmenter_three
Tokenizer path: models_final_200/segmenter_three/tokenizers/tokenizers_tokens.json
Loading existing tokenizer from models_final_200/segmenter_three/tokenizers/tokenizers_tokens.json
Tokenizer path: models_final_200/segmenter_three/tokenizers/tokenizers_segmenter_three.json
Loading existing tokenizer from models_final_200/segmenter_three/tokenizers/tokenizers_segmenter_three.json
the dataset length: 200000
Max length of source sentence: 30
Max length of target sentence: 36
Loaded best model from epoch 48
Model c

  state = torch.load(model_filename)


KeyError: "['score'] not found in axis"