<a href="https://colab.research.google.com/github/Vageeswari-kanchiuniv/C/blob/main/sanskrit_word_split_rule_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**üîπ Python Code Example**

In [1]:
import re

# Basic Sandhi rules for splitting
sandhi_rules = [
    # Rule: 'o' before apostrophe (‡§Ω) often from "‡§É + ‡§Ö"
    (r"‡•ã‡§Ω", "‡§É + ‡§Ö"),
    # Rule: '‡•ã' at end can be "‡§É"
    (r"‡•ã ", "‡§É "),
    # Rule: '‡§É' + vowel
    (r"‡§É([‡§Ö‡§Ü‡§á‡§à‡§â‡§ä‡§è‡§ê‡§ì‡§î])", r"‡§É + \1"),
    # Rule: "‡§æ‡§∞‡•ç" before "‡§ú" could be "‡§æ + ‡§Ö‡§∞‡•ç‡§ú"
    (r"‡§æ‡§∞‡•ç‡§ú", "‡§æ + ‡§Ö‡§∞‡•ç‡§ú"),
    # Rule: consonant cluster simplification (simple demo)
    (r"‡§µ‡§¶", " + ‡§µ‡§¶"),
]

def sandhi_split(word):
    result = word
    for pattern, repl in sandhi_rules:
        result = re.sub(pattern, repl, result)
    return result

# Examples
examples = [
    "‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø",   # ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø
    "‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç", # ‡§ó‡•Å‡§∞‡•Å: + ‡§â‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
    "‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç",   # ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç
    "‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç", # ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ + ‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
    "‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶"     # ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶
]

for w in examples:
    print(f"{w}  ‚Üí  {sandhi_split(w)}")


‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø  ‚Üí  ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø
‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç  ‚Üí  ‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç  ‚Üí  ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç
‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç  ‚Üí  ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶  ‚Üí  ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶


**Python Code ‚Äì Sandhi-Vicheda Engine**

In [1]:
import re

# ---------------------------
# Simple Sanskrit Dictionary (expand as needed)
# ---------------------------
dictionary = {
    "‡§∞‡§æ‡§Æ‡§É", "‡§Ö‡§∏‡•ç‡§§‡§ø", "‡§ó‡•Å‡§∞‡•Å‡§É", "‡§â‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç", "‡§∂‡§ø‡§µ‡§É", "‡§Ö‡§π‡§Æ‡•ç",
    "‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ", "‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç", "‡§∏‡§§‡•ç‡§Ø‡§Ç", "‡§µ‡§¶", "‡§≤‡•ã‡§ï‡§É", "‡§ú‡§®‡§É"
}

# ---------------------------
# Sandhi Rules (Reversal Patterns)
# Each rule is (pattern, replacement, description)
# ---------------------------
sandhi_rules = [
    # Visarga Sandhi
    (r"‡•ã‡§Ω", "‡§É + ‡§Ö", "Visarga + vowel (o‡§Ω = ‡§É + ‡§Ö)"),
    (r"‡•ã ", "‡§É ", "Visarga at end"),
    (r"‡§É([‡§Ö‡§Ü‡§á‡§à‡§â‡§ä‡§è‡§ê‡§ì‡§î])", r"‡§É + \1", "Visarga + vowel"),

    # Vowel Sandhi
    (r"‡§Ü‡§Ö", "‡§Ü + ‡§Ö", "Long A before A"),
    (r"‡§è‡§Ö", "‡§è + ‡§Ö", "E before A"),
    (r"‡§ì‡§Ö", "‡§ì + ‡§Ö", "O before A"),
    (r"‡§æ‡§Ω", "‡§æ + ‡§Ö", "ƒÅ + a (vidyƒÅ‡§Ωrjanam)"),

    # Consonant Sandhi (basic demo)
    (r"‡§Ç‡§µ", "‡§Ç + ‡§µ", "AnusvƒÅra before v"),
    (r"‡§Ç‡§Ø", "‡§Ç + ‡§Ø", "AnusvƒÅra before y"),
    (r"‡§Ç‡§∞", "‡§Ç + ‡§∞", "AnusvƒÅra before r"),
]

# ---------------------------
# Apply Sandhi Rules
# ---------------------------
def apply_rules(word):
    """Generate possible splits by applying sandhi rules"""
    splits = []
    for pattern, repl, desc in sandhi_rules:
        if re.search(pattern, word):
            candidate = re.sub(pattern, repl, word)
            splits.append((candidate, desc))
    return splits

# ---------------------------
# Validate against dictionary
# ---------------------------
def validate_split(split_text):
    """Check if split words exist in dictionary"""
    words = [w.strip() for w in split_text.split("+")]
    return all(word in dictionary for word in words)

# ---------------------------
# Main Sandhi-Vicheda function
# ---------------------------
def sandhi_vicheda(word):
    results = []
    candidates = apply_rules(word)

    for cand, rule in candidates:
        if validate_split(cand):
            results.append((cand, rule))

    # If nothing validated, return raw candidates
    if not results:
        return [(cand, rule) for cand, rule in candidates]

    return results

# ---------------------------
# Test Examples
# ---------------------------
examples = [
    "‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø",    # ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø
    "‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç", # ‡§ó‡•Å‡§∞‡•Å‡§É + ‡§â‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
    "‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç",    # ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç
    "‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç", # ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ + ‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
    "‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶"     # ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶
]

for word in examples:
    print(f"\nWord: {word}")
    results = sandhi_vicheda(word)
    if results:
        for res, rule in results:
            print(f"  ‚Üí {res}   (Rule: {rule})")
    else:
        print("  ‚Üí No split found")



Word: ‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø
  ‚Üí ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø   (Rule: Visarga + vowel (o‡§Ω = ‡§É + ‡§Ö))

Word: ‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
  ‚Üí No split found

Word: ‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç
  ‚Üí ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç   (Rule: Visarga + vowel (o‡§Ω = ‡§É + ‡§Ö))

Word: ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
  ‚Üí ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ + ‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç   (Rule: ƒÅ + a (vidyƒÅ‡§Ωrjanam))

Word: ‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶
  ‚Üí ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶   (Rule: AnusvƒÅra before v)


**üîπ Recursive Sandhi-Vicheda Engine (Python)**

In [4]:
import re

# ---------------------------
# Simple Sanskrit Dictionary (expand as needed)
# ---------------------------
dictionary = {
    "‡§∞‡§æ‡§Æ‡§É", "‡§Ö‡§∏‡•ç‡§§‡§ø", "‡§ó‡•Å‡§∞‡•Å‡§É", "‡§â‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç", "‡§â‡§™‡§¶‡•á‡§∂‡§É", "‡§∂‡§ø‡§µ‡§É", "‡§Ö‡§π‡§Æ‡•ç",
    "‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ", "‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç", "‡§∏‡§§‡•ç‡§Ø‡§Ç", "‡§µ‡§¶", "‡§≤‡•ã‡§ï‡§É", "‡§ú‡§®‡§É", "‡§ú‡•ç‡§û‡§æ‡§®‡§Æ‡•ç"
}

# ---------------------------
# Sandhi Rules (Reversal Patterns)
# ---------------------------
sandhi_rules = [
    # Visarga Sandhi
    (r"‡•ã‡§Ω", "‡§É + ‡§Ö"),
    (r"‡•ã", "‡§É"),
    (r"‡§É([‡§Ö‡§Ü‡§á‡§à‡§â‡§ä‡§è‡§ê‡§ì‡§î])", r"‡§É + \1"),

    # Vowel Sandhi
    (r"‡§æ‡§Ω", "‡§æ + ‡§Ö"),
    (r"‡§Ü‡§Ö", "‡§Ü + ‡§Ö"),
    (r"‡§è‡§Ö", "‡§è + ‡§Ö"),
    (r"‡§ì‡§Ö", "‡§ì + ‡§Ö"),

    # Anusvara Sandhi
    (r"‡§Ç([‡§µ‡§Ø‡§∞‡§≤])", r"‡§Ç + \1"),
]

# ---------------------------
# Recursive splitter
# ---------------------------
def recursive_split(word, depth=0):
    """Try to recursively split a word into dictionary words."""
    results = []

    # Base case: if the whole word is in the dictionary
    if word in dictionary:
        return [[word]]

    # Try applying all sandhi rules
    for pattern, repl in sandhi_rules:
        if re.search(pattern, word):
            candidate = re.sub(pattern, repl, word, count=1)

            # Only split if we actually got a "+" in the result
            if "+" not in candidate:
                continue

            parts = [w.strip() for w in candidate.split("+", 1)]
            if len(parts) != 2:
                continue

            left_options = recursive_split(parts[0], depth+1)
            right_options = recursive_split(parts[1], depth+1)

            for left in left_options:
                for right in right_options:
                    results.append(left + right)

    return results

# ---------------------------
# Wrapper function
# ---------------------------
def sandhi_vicheda(word):
    splits = recursive_split(word)
    # Remove duplicates
    unique_splits = []
    for s in splits:
        if s not in unique_splits:
            unique_splits.append(s)
    return unique_splits

# ---------------------------
# Test Examples
# ---------------------------
examples = [
    "‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø",      # ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø
    "‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç",   # ‡§ó‡•Å‡§∞‡•Å‡§É + ‡§â‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
    "‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç",      # ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç
    "‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç", # ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ + ‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
    "‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶",       # ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶
    "‡§≤‡•ã‡§ï‡§ú‡§®‡§É"        # ‡§≤‡•ã‡§ï‡§É + ‡§ú‡§®‡§É
]

for word in examples:
    print(f"\nWord: {word}")
    results = sandhi_vicheda(word)
    if results:
        for r in results:
            print("  ‚Üí", " + ".join(r))
    else:
        print("  ‚Üí No split found")



Word: ‡§∞‡§æ‡§Æ‡•ã‡§Ω‡§∏‡•ç‡§§‡§ø
  ‚Üí ‡§∞‡§æ‡§Æ‡§É + ‡§Ö‡§∏‡•ç‡§§‡§ø

Word: ‡§ó‡•Å‡§∞‡•Å‡§™‡§¶‡•á‡§∂‡§æ‡§§‡•ç
  ‚Üí No split found

Word: ‡§∂‡§ø‡§µ‡•ã‡§Ω‡§π‡§Æ‡•ç
  ‚Üí ‡§∂‡§ø‡§µ‡§É + ‡§Ö‡§π‡§Æ‡•ç

Word: ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§Ω‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç
  ‚Üí ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ + ‡§Ö‡§∞‡•ç‡§ú‡§®‡§Æ‡•ç

Word: ‡§∏‡§§‡•ç‡§Ø‡§Ç‡§µ‡§¶
  ‚Üí ‡§∏‡§§‡•ç‡§Ø‡§Ç + ‡§µ‡§¶

Word: ‡§≤‡•ã‡§ï‡§ú‡§®‡§É
  ‚Üí No split found
