In [1]:
# !pip install sklearn_crfsuite, nltk

In [16]:
# Required imports
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import sys
import time
import pickle
import warnings
import sklearn_crfsuite
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.chrf_score import corpus_chrf
from nltk.metrics import precision, recall, f_measure
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [17]:
MODEL_NAME = "segmenter_one"
pwd = os.getcwd()
model_path = os.path.join(pwd, "models", MODEL_NAME)
os.makedirs(model_path, exist_ok=True)
print(f"Directory created: {model_path}")

Directory created: /workspace/segmentation-models/crf_model/models/segmenter_three


In [18]:
def surface_segment_data_preparation(word_dictionary: dict):
    """
    Generate features for surface segmentation
    :param word_dictionary: Dictionary with words as keys and their BMES labels as values
    :return: Features (X), Labels (Y), and word characters
    """
    X = []  # Features for each word
    Y = []  # Labels for each word
    words = []  # Original words
    
    for word, label in word_dictionary.items():
        if len(word) != len(label):
            warnings.warn(f"Skipping word {word} due to length mismatch with label {label}")
            continue
            
        word_features = []  # Features for each character in word
        
        for i in range(len(word)):
            features = {}
            
            # Basic character features
            features["char"] = word[i]
            features["lower"] = word[i].lower()
            
            # Position features
            features["start"] = i == 0
            features["end"] = i == len(word) - 1
            features["position"] = i
            features["word_length"] = len(word)
            
            # Context window features (larger window)
            for j in range(-3, 4):  # -3 to +3 window
                if 0 <= i + j < len(word):
                    features[f"char_{j}"] = word[i + j]
                    features[f"is_vowel_{j}"] = word[i + j].lower() in 'aeiou'
            
            # N-gram features
            for n in range(1, 4):  # Unigrams, bigrams, and trigrams
                # Previous n-grams
                if i >= n:
                    features[f"prev_{n}gram"] = word[i-n:i]
                # Next n-grams
                if i + n <= len(word):
                    features[f"next_{n}gram"] = word[i:i+n]
            
            # Character type features
            features["is_vowel"] = word[i].lower() in 'aeiou'
            features["is_consonant"] = word[i].lower() not in 'aeiou'
            
            # Complex pattern features
            if i > 0:
                features["prev_is_vowel"] = word[i-1].lower() in 'aeiou'
                features["char_pair"] = word[i-1:i+1]
            if i < len(word) - 1:
                features["next_is_vowel"] = word[i+1].lower() in 'aeiou'
            
            # Add syllable-like features
            if i > 0 and i < len(word) - 1:
                prev_char = word[i-1].lower()
                curr_char = word[i].lower()
                next_char = word[i+1].lower()
                features["syllable_pattern"] = (
                    ("V" if prev_char in 'aeiou' else "C") +
                    ("V" if curr_char in 'aeiou' else "C") +
                    ("V" if next_char in 'aeiou' else "C")
                )
            
            word_features.append(features)
        
        X.append(word_features)
        Y.append(list(label))
        words.append([char for char in word])
    
    return X, Y, words
    
def evaluate_surface_segmentation(Y_pred, Y_true, test_tokens, test_segments):
    """
    Evaluate surface segmentation performance
    """
    predictions = []
    for token, pred_labels in zip(test_tokens, Y_pred):
        # Convert BMES labels to segmentation
        segmented = []
        for char, label in zip(token, pred_labels):
            segmented.append(char)
            if label in ['E', 'S']:
                segmented.append('-')
        pred_segmented = ''.join(segmented).rstrip('-')
        predictions.append(pred_segmented)


    return predictions, test_segments

        
def surface_labelled_data_preparation(word_dictionary: dict):
    """
    Generate features for segment labelling
    :param word_dictionary: Dictionary with segmented words as keys and their labels as values
    :return: Features (X), Labels (Y), and words
    """
    X = []  # Features for each word
    Y = []  # Labels for each word
    words = []  # Original words
    
    for word, labels in word_dictionary.items():
        segments = word.split('-')
        label_segments = labels.split('-')
        
        if len(segments) != len(label_segments):
            warnings.warn(f"Skipping {word} due to segment/label mismatch")
            continue
            
        segment_features = []
        
        for i, segment in enumerate(segments):
            features = {}
            
            # Basic segment features
            features['segment'] = segment
            features['length'] = len(segment)
            features['position'] = str(i)
            features['position_pct'] = str(i / len(segments))
            
            # Lexical features
            features['segment.lower()'] = segment.lower()
            features['prefix_3'] = segment[:3] if len(segment) >= 3 else segment
            features['suffix_3'] = segment[-3:] if len(segment) >= 3 else segment
            
            # Character type features
            features['has_vowel'] = any(c.lower() in 'aeiou' for c in segment)
            features['all_consonants'] = all(c.lower() not in 'aeiou' for c in segment)
            features['has_upper'] = any(c.isupper() for c in segment)
            features['is_single_char'] = len(segment) == 1
            
            # Context features
            if i > 0:
                features['prev_segment'] = segments[i-1]
                features['prev_length'] = len(segments[i-1])
            if i < len(segments) - 1:
                features['next_segment'] = segments[i+1]
                features['next_length'] = len(segments[i+1])
            
            # Pattern features
            features['starts_with_vowel'] = segment[0].lower() in 'aeiou'
            features['ends_with_vowel'] = segment[-1].lower() in 'aeiou'
            features['consonant_pattern'] = ''.join('C' if c.lower() not in 'aeiou' else 'V' 
                                                  for c in segment)
            
            segment_features.append(features)
        
        X.append(segment_features)
        Y.append(label_segments)
        words.append(word)
    
    return X, Y, words

def surface_labelled_data_preparation_pipeline(word_list: list):
    """
    Generate features for pipeline segment labelling
    :param word_list: List of segmented words
    :return: List of features for each segmented word
    """
    X = []
    
    for word in word_list:
        segments = word.split('-')
        segment_features = []
        
        for i, segment in enumerate(segments):
            features = {}
            
            # Basic segment features
            features['segment'] = segment
            features['length'] = len(segment)
            features['position'] = str(i)
            features['position_pct'] = str(i / len(segments))
            
            # Lexical features
            features['segment.lower()'] = segment.lower()
            features['prefix_3'] = segment[:3] if len(segment) >= 3 else segment
            features['suffix_3'] = segment[-3:] if len(segment) >= 3 else segment
            
            # Character type features
            features['has_vowel'] = any(c.lower() in 'aeiou' for c in segment)
            features['all_consonants'] = all(c.lower() not in 'aeiou' for c in segment)
            features['has_upper'] = any(c.isupper() for c in segment)
            features['is_single_char'] = len(segment) == 1
            
            # Context features
            if i > 0:
                features['prev_segment'] = segments[i-1]
                features['prev_length'] = len(segments[i-1])
            if i < len(segments) - 1:
                features['next_segment'] = segments[i+1]
                features['next_length'] = len(segments[i+1])
            
            # Pattern features
            features['starts_with_vowel'] = segment[0].lower() in 'aeiou'
            features['ends_with_vowel'] = segment[-1].lower() in 'aeiou'
            features['consonant_pattern'] = ''.join('C' if c.lower() not in 'aeiou' else 'V' 
                                                  for c in segment)
            
            segment_features.append(features)
            
        X.append(segment_features)
    
    return X



def save_evaluation_results(model_path, position_scores, bleu_scores, chrf_score):
    # Create results dictionary
    results = {
        'Position Sensitive Metrics': {
            'Precision': position_scores['precision'],
            'Recall': position_scores['recall'],
            'F1': position_scores['f1']
        },
        'BLEU Scores': {
            'Unigram': bleu_scores['unigram'],
            'Bigram': bleu_scores['bigram'],
            'Equal Weights': bleu_scores['equal']
        },
        'chrF Score': chrf_score
    }
    
    # Save to text file
    with open(f'{model_path}/evaluation_results.txt', 'w') as f:
        
        f.write(f"=== Morphological Segmentation Evaluation Results CRF - {MODEL_NAME} ===\n\n")
        
        # Position sensitive scores
        f.write("Position-sensitive scores:\n")
        f.write(f"Precision: {results['Position Sensitive Metrics']['Precision']:.3f}\n")
        f.write(f"Recall: {results['Position Sensitive Metrics']['Recall']:.3f}\n")
        f.write(f"F1: {results['Position Sensitive Metrics']['F1']:.3f}\n\n")
        
        # BLEU scores
        f.write("BLEU Scores:\n")
        f.write(f"Unigram only: {results['BLEU Scores']['Unigram']:.4f}\n")
        f.write(f"Bigram only: {results['BLEU Scores']['Bigram']:.4f}\n")
        f.write(f"Equal weights: {results['BLEU Scores']['Equal Weights']:.4f}\n\n")
        
        # chrF score
        f.write(f"chrF Score: {results['chrF Score']:.4f}\n")

def eval_morph_segments_position(predicted, target):

    predicted= [word.split('-') for word in predicted]
    target = [word.split('-') for word in target]
    
    correct = 0.0
    assert len(predicted)==len(target)
    
    # Iterate through predicted and target words
    for pred, targ in zip(predicted, target):
        # Create enumerated pairs to track position
        pred_with_pos = list(enumerate(pred))
        targ_with_pos = list(enumerate(targ))
        
        # Check matches at same positions
        for pos, p_morph in pred_with_pos:
            # Look for match at same position in target
            if pos < len(targ) and p_morph == targ[pos]:
                correct += 1
    
    predicted_length = sum([len(pred) for pred in predicted])
    target_length = sum([len(targ) for targ in target])
    
    precision = correct/predicted_length
    recall = correct/target_length
    f_score = 2 * (precision * recall)/(precision + recall) if (precision + recall) > 0 else 0
    
    print("Position-sensitive scores:")
    print("P: ", round(precision*100,3),
          "R: ", round(recall*100,3),
          "F1: ", round(f_score*100,3))
    
    # Return scores as dictionary
    return {
        'precision': precision,
        'recall': recall,
        'f1': f_score
    }

def eval_bleu_segment(predicted, targeted):
    reference = [[word.split('-')] for word in targeted]
    candidate = [word.split('-') for word in predicted]
    
    smoothie = SmoothingFunction().method2
    
    bleu_scores = {
        'unigram': corpus_bleu(reference, candidate, 
                              weights=(1, 0, 0, 0), 
                              smoothing_function=smoothie),
        'bigram': corpus_bleu(reference, candidate, 
                             weights=(0, 1, 0, 0), 
                             smoothing_function=smoothie),
        'equal': corpus_bleu(reference, candidate, 
                            weights=(0.5, 0.5, 0, 0), 
                            smoothing_function=smoothie)
    }

    return bleu_scores

def eval_chrF_segment(predicted, targeted):

    target = [" ".join(word.split("-")) for word in targeted]
    predicted = [" ".join(word.split("-")) for word in predicted]
    
    chrf_score = corpus_chrf(target, predicted, min_len=1, max_len=6, beta=3.0)

    return chrf_score


class BaselineCRF:
    """
    Modified Baseline CRF to work with CSV data containing token and segmentation pairs
    """
    def _get_unsegmented_token(self, segmented):
        """
        Generate unsegmented token from segmented form
        """
        return ''.join(segmented.split('-'))

    def _generate_bmes_labels(self, row):
        """
        Generate BMES labels for a token based on its segmentation
        
        :param row: DataFrame row containing segmented form
        :return: String of BMES labels
        """
        segmented = row[MODEL_NAME]
        # Generate the actual token by removing hyphens
        token = self._get_unsegmented_token(segmented)
        segments = segmented.split('-')
        
        # Initialize empty label string
        labels = ''
        current_pos = 0
        
        # Generate labels for each segment
        for segment in segments:
            segment_len = len(segment)
            
            # Skip empty segments
            if segment_len == 0:
                continue
                
            # Single character segment
            if segment_len == 1:
                labels += 'S'
            # Multi-character segment
            else:
                labels += 'B'  # Beginning
                for _ in range(segment_len - 2):
                    labels += 'M'  # Middle
                labels += 'E'  # End
            
            current_pos += segment_len
        
        # Verify label length matches token length
        if len(labels) != len(token):
            print(f"Mismatch for segmentation: {segmented}")
            print(f"Generated labels: {labels}")
            print(f"Unsegmented token: {token}")
            print(f"Label length: {len(labels)}, Token length: {len(token)}\n")
            return None
            
        return labels

    def _preprocess_dataframe(self, df):
        """
        Preprocess the dataframe to ensure data quality
        """
        # Clean up the data
        df = df.copy()
        df[MODEL_NAME] = df[MODEL_NAME].str.strip()
        
        # Generate tokens from segmentations
        df['tokens'] = df[MODEL_NAME].apply(self._get_unsegmented_token)
        
        # Generate BMES labels
        df['bmes_labels'] = df.apply(self._generate_bmes_labels, axis=1)
        
        # Remove rows where label generation failed
        df = df.dropna(subset=['bmes_labels'])
        
        return df

    def __init__(self, file_path: str):
        """
        Initialize the CRF with a CSV file containing tokens and their segmentations
        
        :param file_path: Path to the CSV file containing the data
        """
        # Read the CSV file
        try:
            df = pd.read_csv(file_path, header=None, names=['tokens', MODEL_NAME]).head(200000)
        except Exception as e:
            raise ValueError(f"Error reading CSV file: {str(e)}")
        
        # Preprocess the data
        df = self._preprocess_dataframe(df)
        
        if len(df) == 0:
            raise ValueError("No valid data remains after preprocessing")
            
        print(f"Total valid samples after preprocessing: {len(df)}")

        
        # Split into train (70%), dev (15%), and test (15%)
        train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
        dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
        
        # Convert to dictionary format {token: bmes_labels}
        self.training_data = dict(zip(train_data['tokens'], train_data['bmes_labels']))
        self.dev_data = dict(zip(dev_data['tokens'], dev_data['bmes_labels']))
        self.test_data = dict(zip(test_data['tokens'], test_data['bmes_labels']))
        
        # Store dictionaries in a list for easy access
        self.data_splits = [self.training_data, self.dev_data, self.test_data]
        
        # Store original segmentations for evaluation
        self.training_segments = dict(zip(train_data['tokens'], train_data[MODEL_NAME]))
        self.dev_segments = dict(zip(dev_data['tokens'], dev_data[MODEL_NAME]))
        self.test_segments = dict(zip(test_data['tokens'], test_data[MODEL_NAME]))

        print(f"Data loaded successfully:")
        print(f"Training samples: {len(self.training_data)}")
        print(f"Development samples: {len(self.dev_data)}")
        print(f"Test samples: {len(self.test_data)}")

    def train_surface_model(self, use_grid_search=True):
        X_training, Y_training, _ = surface_segment_data_preparation(self.training_data)
        X_dev, Y_dev, _ = surface_segment_data_preparation(self.dev_data)
        
        if use_grid_search:
            param_grid = {
                'c1': [0.1, 0.2, 0.3, 0.4],
                'c2': [0.1, 0.2, 0.3, 0.4],
                'max_iterations': [100, 300, 500],
                'min_freq': [2, 3, 4]
            }
            
            crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                all_possible_transitions=True
            )
            
            f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')
            
            grid_search = GridSearchCV(
                estimator=crf,
                param_grid=param_grid,
                scoring=f1_scorer,
                cv=2,
                verbose=1,
                n_jobs=-1
            )
            
            print("Running grid search...")
            grid_search.fit(X_training, Y_training)
            print("Best parameters:", grid_search.best_params_)
            print("Best F1 score:", grid_search.best_score_)
            
            return grid_search.best_estimator_
        else:

            print("using the vanilla model")
            
            crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=0.2,
                c2=0.3,
                max_iterations=500,
                all_possible_transitions=True,
                min_freq=3,
                verbose=True
            )
            crf.fit(X_training, Y_training)
            return crf

In [None]:
file_path = '../data/valid_linearizations_2.csv'

# Initialize the CRF
crf = BaselineCRF(file_path)
print("CRF model initialized successfully")

# Train new model
print("Training improved model...")
surface_model = crf.train_surface_model(use_grid_search=False)

# Get test data
test_tokens = list(crf.test_data.keys())
test_segments = list(crf.test_segments.values())

# Make predictions with new model
X_test, Y_test, _ = surface_segment_data_preparation(crf.test_data)
Y_pred = surface_model.predict(X_test)

In [20]:
# Evaluate
predicted, targeted = evaluate_surface_segmentation(Y_pred, Y_test, test_tokens, test_segments)

In [21]:
# P, F, R, BASED
pos_scores = eval_morph_segments_position(predicted, targeted)
pos_scores

bleu_scores = eval_bleu_segment(predicted, targeted)
bleu_scores

chrf_score = eval_chrF_segment(predicted, targeted)
chrf_score

save_evaluation_results(model_path, pos_scores, bleu_scores, chrf_score)

Position-sensitive scores:
P:  78.556 R:  79.15 F1:  78.852


{'precision': 0.785560745397831,
 'recall': 0.7914963604683353,
 'f1': 0.788517382910725}

In [22]:
bleu_scores = eval_bleu_segment(predicted, targeted)
bleu_scores

{'unigram': 0.8470738448111454,
 'bigram': 0.7757422021796317,
 'equal': 0.8106237905357612}

In [23]:
chrf_score = eval_chrF_segment(predicted, targeted)
chrf_score

0.9968397291196388

In [24]:
save_evaluation_results(model_path, pos_scores, bleu_scores, chrf_score)

In [25]:
def save_model(model, filepath='zulu_morphological_segmenter.pkl'):
    """
    Save the trained CRF model
    
    Args:
        model: Trained CRF model
        filepath: Path where to save the model
    """
    try:
        with open(filepath, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model successfully saved to {filepath}")
        print(f"File size: {os.path.getsize(filepath) / (1024*1024):.2f} MB")
    except Exception as e:
        print(f"Error saving model: {str(e)}")

def load_model(filepath='zulu_morphological_segmenter.pkl'):
    """
    Load a saved CRF model
    
    Args:
        filepath: Path to the saved model
    Returns:
        Loaded CRF model
    """
    try:
        with open(filepath, 'rb') as f:
            model = pickle.load(f)
        print(f"Model successfully loaded from {filepath}")
        return model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

# Save the model
print("Saving model...")
save_model(surface_model, f"{model_path}/{MODEL_NAME}.pkl")

Saving model...
Model successfully saved to /workspace/segmentation-models/crf_model/models/segmenter_three/segmenter_three.pkl
File size: 0.78 MB


In [12]:
# # Example usage
# if __name__ == "__main__":
#     # Specify your CSV file path
#     file_path = '../data/unique_sorted_validated_words_newer_v2.csv'
    
#     # Initialize the CRF
#     try:
#         crf = BaselineCRF(file_path)
#         print("CRF model initialized successfully")

#         # Train new model
#         print("Training improved model...")
#         surface_model = crf.train_surface_model(crf.training_data)
        
#         # Get test data
#         test_tokens = list(crf.test_data.keys())
#         test_segments = list(crf.test_segments.values())

#         # Make predictions with new model
#         X_test, Y_test, _ = crf.surface_segment_data_preparation(crf.test_data)
#         Y_pred = surface_model.predict(X_test)
        
#         # Evaluate
#         evaluate_surface_segmentation(Y_pred, Y_test, test_tokens, test_segments)
        
#     except Exception as e:
#         print(f"Error initializing CRF model: {str(e)}")

In [13]:

# For parameter tuning...

# file_path = '../data/valid_linearizations_2.csv'
# model = BaselineCRF(file_path)
# crf = model.train_surface_model(use_grid_search=True)

In [14]:
# def predict_word_segmentation(surface_model, word):
#     """
#     Predict segmentation for a custom word
    
#     Args:
#         surface_model: Trained CRF model
#         word: Word to segment
#     Returns:
#         Segmented version of the word
#     """
#     # Prepare features
#     features = []
#     for i in range(len(word)):
#         char_features = {}
        
#         # Basic character features
#         char_features["char"] = word[i]
#         char_features["lower"] = word[i].lower()
        
#         # Position features
#         char_features["start"] = i == 0
#         char_features["end"] = i == len(word) - 1
#         char_features["position"] = i
#         char_features["word_length"] = len(word)
        
#         # Context window features
#         for j in range(-3, 4):  # -3 to +3 window
#             if 0 <= i + j < len(word):
#                 char_features[f"char_{j}"] = word[i + j]
#                 char_features[f"is_vowel_{j}"] = word[i + j].lower() in 'aeiou'
        
#         # N-gram features
#         for n in range(1, 4):
#             # Previous n-grams
#             if i >= n:
#                 char_features[f"prev_{n}gram"] = word[i-n:i]
#             # Next n-grams
#             if i + n <= len(word):
#                 char_features[f"next_{n}gram"] = word[i:i+n]
        
#         # Character type features
#         char_features["is_vowel"] = word[i].lower() in 'aeiou'
#         char_features["is_consonant"] = word[i].lower() not in 'aeiou'
        
#         # Complex pattern features
#         if i > 0:
#             char_features["prev_is_vowel"] = word[i-1].lower() in 'aeiou'
#             char_features["char_pair"] = word[i-1:i+1]
#         if i < len(word) - 1:
#             char_features["next_is_vowel"] = word[i+1].lower() in 'aeiou'
        
#         # Syllable-like features
#         if i > 0 and i < len(word) - 1:
#             prev_char = word[i-1].lower()
#             curr_char = word[i].lower()
#             next_char = word[i+1].lower()
#             char_features["syllable_pattern"] = (
#                 ("V" if prev_char in 'aeiou' else "C") +
#                 ("V" if curr_char in 'aeiou' else "C") +
#                 ("V" if next_char in 'aeiou' else "C")
#             )
        
#         features.append(char_features)
    
#     # Get predictions
#     predictions = surface_model.predict([features])[0]
    
#     # Convert to segmented form
#     segmented = []
#     for char, label in zip(word, predictions):
#         segmented.append(char)
#         if label in ['E', 'S']:
#             segmented.append('-')
    
#     return ''.join(segmented).rstrip('-')

# # Example usage:
# # Test with some custom words
# test_words = [
#     "ngiyabonga",
#     "uyajabula",
#     "sizohamba",
#     "ngizokuthanda"
# ]

# print("Testing custom words:")
# for word in test_words:
#     segmented = predict_word_segmentation(surface_model, word)
#     print(f"Word: {word}")
#     print(f"Predicted segmentation: {segmented}")
#     print()

# # Interactive testing
# def interactive_testing():
#     print("\nEnter words to segment (type 'quit' to exit):")
#     while True:
#         word = input("\nEnter word: ").strip()
#         if word.lower() == 'quit':
#             break
#         if word:
#             segmented = predict_word_segmentation(surface_model, word)
#             print(f"Predicted segmentation: {segmented}")

# # Run interactive testing
# interactive_testing()

In [15]:



# # Test loading the model

# print("\nTesting model loading...")
# loaded_model = load_model(f"{model_path}/{MODEL_NAME}.pkl")

# if loaded_model:
#     print("\nTesting loaded model with a sample word...")
#     test_word = "ngiyabonga"
#     segmented = predict_word_segmentation(loaded_model, test_word)
#     print(f"Word: {test_word}")
#     print(f"Predicted segmentation: {segmented}")