In [1]:
import csv
import re

#####################################################
# 1. LOAD AFFIXES FROM CSV
#####################################################

def load_affixes_from_csv(filepath):
    affixes = []
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            affix_str = row['affix'].strip()
            affix_type = int(row['type'])
            affixes.append({
                'affix': affix_str,
                'type': affix_type
            })
    return affixes


#####################################################
# 2. CONSONANT MUTATION REVERSAL LOGIC (SKELETON)
#####################################################

# This dict tries to map a *mutated* boundary back to possible original
# prefix+root letter combos. In real practice, you'll need more nuance.
revert_boundary_map = {
    "mb":  [("n", "b")],
    "mp":  [("n", "p")],
    "ng":  [("n", "h")],
    "nk":  [("n", "h")],
    "nd":  [("n", "l")],
    "m":   [("n", "p"), ("n", "b"), ("n", "f"), ("n", "v"), ("n", "m")],
    "n":   [("n", "n")],
    "ndr": [("n", "r")],
    "nts": [("n", "s")],
    "nj":  [("n", "j")],
    # You can add more, especially for hyphen-based changes in compounds
}


#####################################################
# 3. TENSE VARIANTS FOR PREFIXES (SKELETON)
#####################################################

def generate_tense_variants(prefix_base):
    """
    Given a base prefix (e.g. "manka"), generate possible
    tense/polarity variants (e.g. "hanka", "nanka", etc.).
    This is only a demo; you'll want to expand it for
    ho-, no-, hi-, ni-, ha-, na-, etc.
    """
    variants = set([prefix_base])  # always include the original
    
    # If the prefix starts with 'm', then we can replace 'm' with 'h' or 'n'.
    if prefix_base.startswith("m"):
        variants.add("h" + prefix_base[1:])
        variants.add("n" + prefix_base[1:])
    else:
        # If it doesn't start with 'm', we might just prepend 'h' or 'n'.
        variants.add("h" + prefix_base)
        variants.add("n" + prefix_base)
        # Optionally add 'hi'+prefix_base, 'ho'+prefix_base, etc.
        # variants.add("hi" + prefix_base)
        # variants.add("ho" + prefix_base)
        # ...

    return variants


#####################################################
# 4. HELPER FUNCTIONS TO REMOVE EACH TYPE OF AFFIX
#####################################################

def try_remove_prefix(word, prefix):
    """
    Attempt to remove `prefix` from the start of `word`.
    - Also tries to handle boundary mutations if `prefix` ends with 'n'.
    - Returns (was_removed, new_word).
    """
    # 1) Direct match check
    if word.startswith(prefix):
        return True, word[len(prefix):]

    # 2) If direct match fails, try boundary mutation reversal
    #    Example: prefix ends with 'n' => might be n+b -> "mb", n+p -> "mp", etc.
    if prefix.endswith('n'):
        prefix_minus_n = prefix[:-1]
        for mutated_boundary, original_pairs in revert_boundary_map.items():
            # We see if the word starts with prefix_minus_n + mutated_boundary
            candidate = prefix_minus_n + mutated_boundary
            if word.startswith(candidate):
                # Then we guess it was "n"+"b" => "mb" or "n"+"p" => "mp", etc.
                # So we remove the entire chunk from the start of the word
                new_word = word[len(candidate):]
                return True, new_word

    return False, word


def try_remove_suffix(word, suffix):
    """
    Attempt to remove `suffix` from the end of `word`.
    - Similar boundary logic can be used if suffix starts with certain letters.
    - Returns (was_removed, new_word).
    """
    if word.endswith(suffix):
        return True, word[:-len(suffix)]
    
    # If needed, implement boundary logic for suffix
    # (like "mb", "mp" at the end, etc.)
    
    return False, word


def try_remove_circumfix(word, prefix_part, suffix_part):
    """
    Attempt to remove a circumfix: 'prefix_part' at the start, 
    'suffix_part' at the end. Example: "aha-ana" => prefix="aha", suffix="ana".
    - Returns (was_removed, new_word).
    """
    if word.startswith(prefix_part) and word.endswith(suffix_part):
        new_word = word[len(prefix_part):-len(suffix_part)]
        return True, new_word
    return False, word


#####################################################
# 5. THE MAIN RULE-BASED STEM FUNCTION
#####################################################

def rule_based_stem(word, affixes):

    changed = True
    while changed:
        changed = False

        # 1. Try to remove recognized prefixes (type=1).
        #    We'll also generate tense variants for each prefix and try those.
        for aff in affixes:
            if aff['type'] == 1:  # prefix
                base_prefix = aff['affix']
                variants = generate_tense_variants(base_prefix)
                # Try each variant
                for varpref in variants:
                    removed, new_word = try_remove_prefix(word, varpref)
                    if removed:
                        word = new_word
                        changed = True
                        break  # prefix removed, restart
                if changed:
                    break  # prefix removal succeeded, re-check from scratch
        
        if changed:
            continue  # re-check from start

        # 2. Try to remove recognized suffixes (type=2).
        for aff in affixes:
            if aff['type'] == 2:  # suffix
                removed, new_word = try_remove_suffix(word, aff['affix'])
                if removed:
                    word = new_word
                    changed = True
                    break
        if changed:
            continue

        # 3. Try to remove recognized circumfixes (type=3).
        #    e.g. "aha-ana" => prefix_part="aha", suffix_part="ana"
        for aff in affixes:
            if aff['type'] == 3:
                parts = aff['affix'].split('-')
                if len(parts) == 2:
                    prefix_part, suffix_part = parts
                    removed, new_word = try_remove_circumfix(word, prefix_part, suffix_part)
                    if removed:
                        word = new_word
                        changed = True
                        break
        if changed:
            continue
        
        # 4. Try to remove infixes (type=4), e.g. "-al-"
        for aff in affixes:
            if aff['type'] == 4:
                # For a minimal example, just remove the substring (without the dashes).
                infix_str = aff['affix'].strip('-')
                if infix_str in word:
                    word = word.replace(infix_str, "", 1)  # remove first occurrence
                    changed = True
                    break
        if changed:
            continue
        
        # 5. Try to remove special apostrophe-based suffixes (type=5), e.g. "-'ny"
        for aff in affixes:
            if aff['type'] == 5:
                # Example: "-'ny"
                # We can simply see if word ends with "'ny" after ignoring the leading dash.
                suffix_str = aff['affix'][1:]  # remove the leading '-'
                if word.endswith(suffix_str):
                    word = word[:-len(suffix_str)]
                    changed = True
                    break

    return word


#####################################################
# 6. SIMPLE DEMO / TEST
#####################################################

if __name__ == "__main__":
    # 1) Load the affixes from CSV
    affixes = load_affixes_from_csv("malagasy_affixes.csv")
    
    # 2) Some test words
    test_words = [
        "mankabidy",   # "manka" + "bidy" => possibly "bidy"
        "hankabidy",   # tense variant => "bidy"
        "mambabo",     # "man" + "babo" => mutated boundary = "mambabo" => root "babo"
        "abiliana",    # from your ML code example
        "mialana",     # possibly "mi" + "alana" or other combos
        "zaka-tsaka",  # compound/hyphen example (not fully handled here)
    ]
    
    print("Rule-Based Stemming Demo:\n")
    for w in test_words:
        stemmed = rule_based_stem(w, affixes)
        print(f"Word: {w:15s} => Stem: {stemmed}")


Rule-Based Stemming Demo:

Word: mankabidy       => Stem: bidy
Word: hankabidy       => Stem: bidy
Word: mambabo         => Stem: babo
Word: abiliana        => Stem: bi
Word: mialana         => Stem: 
Word: zaka-tsaka      => Stem: -tsaka


In [2]:
import os
import sqlite3
import csv

from sklearn.metrics import f1_score

#####################################################
# 1. LOAD DATA FROM THE DATABASE
#####################################################

def load_derivative_root_pairs(db_path="dictionary.db"):
    """
    Connect to the SQLite database and retrieve all derivative-root pairs.
    Returns a tuple (inflected_words, root_words) as lists of strings.
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT d.derivative, r.root
        FROM derivatives d
        JOIN root_words r ON d.root_id = r.id
    """)
    data = cursor.fetchall()
    conn.close()

    if not data:
        print("No data found in the database.")
        return [], []

    inflected_words, root_words = zip(*data)  # unzip into two lists
    return list(inflected_words), list(root_words)


#####################################################
# 2. LOAD AFFIXES FROM CSV
#####################################################

def load_affixes_from_csv(filepath="affixes.csv"):
    """
    Load a list of affixes from a CSV with columns: "affix","type".
    Example rows:
       affix,type
       manka,1
       ana,2
       aha-ana,3
       -al-,4
       -'ny,5
    Returns a list of dicts like:
       [ {"affix": "manka", "type": 1},
         {"affix": "ana",   "type": 2},
         ... ]
    """
    affixes = []
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            affix_str = row['affix'].strip()
            affix_type = int(row['type'])
            affixes.append({
                'affix': affix_str,
                'type': affix_type
            })
    return affixes


#####################################################
# 3. CONSONANT MUTATION & TENSE VARIANTS (SKELETON)
#####################################################

# A map for "reverse-engineering" mutated boundary segments.
# Key = mutated boundary, Value = list of possible (prefix_char, root_char) combos.
revert_boundary_map = {
    "mb":  [("n", "b")],
    "mp":  [("n", "p")],
    "ng":  [("n", "h")],
    "nk":  [("n", "h")],
    "nd":  [("n", "l")],
    "m":   [("n", "p"), ("n", "b"), ("n", "f"), ("n", "v"), ("n", "m")],
    "n":   [("n", "n")],
    "ndr": [("n", "r")],
    "nts": [("n", "s")],
    "nj":  [("n", "j")],
}

def generate_tense_variants(prefix_base):
    """
    Return possible tense/polarity variants for a given prefix.
    For example, if prefix_base="manka", generate "manka", "hanka", "nanka", etc.
    Expand as needed (e.g. ho, no, hi, ni, ha, na).
    """
    variants = set([prefix_base])
    
    if prefix_base.startswith("m"):
        variants.add("h" + prefix_base[1:])
        variants.add("n" + prefix_base[1:])
    else:
        # Could also add hi-, ho-, ha-, ni-, no-, na- if appropriate
        variants.add("h" + prefix_base)
        variants.add("n" + prefix_base)
    
    return variants


#####################################################
# 4. HELPER FUNCTIONS TO STRIP AFFIXES
#####################################################

def try_remove_prefix(word, prefix):
    """
    Attempt to remove `prefix` from the start of `word`.
    Handles boundary mutation if prefix ends with 'n'.
    Returns: (was_removed, new_word).
    """
    # Check direct match
    if word.startswith(prefix):
        return True, word[len(prefix):]

    # If prefix ends with 'n', see if there's a mutated boundary
    if prefix.endswith('n'):
        prefix_minus_n = prefix[:-1]
        for mutated_boundary, original_pairs in revert_boundary_map.items():
            candidate = prefix_minus_n + mutated_boundary
            if word.startswith(candidate):
                # So we remove candidate from the start
                return True, word[len(candidate):]
    
    # If nothing matches
    return False, word


def try_remove_suffix(word, suffix):
    """
    Attempt to remove `suffix` from the end of `word`.
    (You can expand to handle boundary mutation at the end.)
    Returns: (was_removed, new_word).
    """
    if word.endswith(suffix):
        return True, word[:-len(suffix)]
    return False, word


def try_remove_circumfix(word, prefix_part, suffix_part):
    """
    Attempt to remove a circumfix: prefix_part ... suffix_part
    e.g., "aha-ana" => prefix="aha", suffix="ana".
    Returns: (was_removed, new_word).
    """
    if word.startswith(prefix_part) and word.endswith(suffix_part):
        new_word = word[len(prefix_part):-len(suffix_part)]
        return True, new_word
    return False, word


#####################################################
# 5. MAIN RULE-BASED STEM FUNCTION
#####################################################

def rule_based_stem(word, affixes):
    """
    Naive approach:
    1) Loop until no more changes:
       - Try removing recognized prefixes (with tense variants)
       - Try removing recognized suffixes
       - Try removing recognized circumfixes
       - Try removing recognized infixes
       - Try removing apostrophe-based suffixes
    2) Return final 'word' as stem.
    """
    changed = True
    while changed:
        changed = False

        # 1) Prefixes
        for aff in affixes:
            if aff['type'] == 1:  # prefix
                base_prefix = aff['affix']
                variants = generate_tense_variants(base_prefix)
                for varpref in variants:
                    removed, new_word = try_remove_prefix(word, varpref)
                    if removed:
                        word = new_word
                        changed = True
                        break
                if changed:
                    break
        if changed:
            continue

        # 2) Suffixes
        for aff in affixes:
            if aff['type'] == 2:  # suffix
                removed, new_word = try_remove_suffix(word, aff['affix'])
                if removed:
                    word = new_word
                    changed = True
                    break
        if changed:
            continue

        # 3) Circumfixes
        for aff in affixes:
            if aff['type'] == 3:
                parts = aff['affix'].split('-')
                if len(parts) == 2:
                    prefix_part, suffix_part = parts
                    removed, new_word = try_remove_circumfix(word, prefix_part, suffix_part)
                    if removed:
                        word = new_word
                        changed = True
                        break
        if changed:
            continue

        # 4) Infixes
        for aff in affixes:
            if aff['type'] == 4:
                # e.g. affix might be "-al-"
                infix_str = aff['affix'].strip('-')
                if infix_str in word:
                    # remove first occurrence
                    word = word.replace(infix_str, "", 1)
                    changed = True
                    break
        if changed:
            continue

        # 5) Apostrophe-based suffixes (type=5)
        for aff in affixes:
            if aff['type'] == 5:
                # e.g. "-'ny"
                suffix_str = aff['affix'][1:]  # remove the leading '-'
                if word.endswith(suffix_str):
                    word = word[:-len(suffix_str)]
                    changed = True
                    break

    return word


#####################################################
# 6. EVALUATION (Exact-match F1)
#####################################################

def evaluate_rule_based_exact(inflected_words, root_words, affixes):
    """
    Check how often the rule-based stem equals the true root (exact match).
    Returns float F1 (which for binary exact-match is same as precision=recall=F1).
    """
    y_true = []
    y_pred = []
    for derived, gold_root in zip(inflected_words, root_words):
        guess_root = rule_based_stem(derived, affixes)
        # We'll say "1" if the guess == gold, "0" otherwise
        y_pred.append(1 if guess_root == gold_root else 0)
        y_true.append(1)  # always 1 for gold
    f1 = f1_score(y_true, y_pred)
    return f1


#####################################################
# 7. MAIN PROGRAM
#####################################################

if __name__ == "__main__":
    # 1) Load the derivative-root pairs from the database
    db_path = "dictionary.db"
    inflected_words, root_words = load_derivative_root_pairs(db_path)

    if not inflected_words:
        print("No data to process. Exiting.")
        exit()

    print(f"Loaded {len(inflected_words)} derivative-root pairs from '{db_path}'.")

    # 2) Load the affixes from CSV
    affixes_csv_path = "malagasy_affixes.csv"
    affixes = load_affixes_from_csv(affixes_csv_path)
    print(f"Loaded {len(affixes)} affixes from '{affixes_csv_path}'.")

    # 3) Evaluate the rule-based approach (Exact-match F1)
    f1_exact = evaluate_rule_based_exact(inflected_words, root_words, affixes)
    print(f"Rule-Based Exact-Match F1: {f1_exact:.4f}")

    # 4) Demo on a single sample
    sample_word = "mankabidy"
    sample_stem = rule_based_stem(sample_word, affixes)
    print(f"Example: {sample_word} => {sample_stem}")


Loaded 44118 derivative-root pairs from 'dictionary.db'.
Loaded 200 affixes from 'malagasy_affixes.csv'.
Rule-Based Exact-Match F1: 0.0825
Example: mankabidy => bidy
