In [None]:
pip install visual-automata

Collecting visual-automata
  Downloading visual_automata-1.1.1-py3-none-any.whl.metadata (14 kB)
Collecting automata-lib (from visual-automata)
  Downloading automata_lib-9.1.2-py3-none-any.whl.metadata (4.9 kB)
Collecting colormath (from visual-automata)
  Downloading colormath-3.0.0.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jupyterlab (from visual-automata)
  Downloading jupyterlab-4.4.5-py3-none-any.whl.metadata (16 kB)
Collecting forbiddenfruit (from visual-automata)
  Downloading forbiddenfruit-0.1.4.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cached_method>=0.1.0 (from automata-lib->visual-automata)
  Downloading cached_method-0.1.0-py3-none-any.whl.metadata (2.9 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->visual-automata)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (

In [None]:
# Import the necessary class from the library
from visual_automata.fa.dfa import VisualDFA

# 1. Build the DFA with the visual_automata library.
# q0 is the start state and q1 the only accepting state. Every state moves to
# q_dead on "other", and q_dead maps both symbols back to itself, so it can
# never be left once entered.
dfa = VisualDFA(
    states={"q0", "q1", "q_dead"},
    input_symbols={"letter", "other"},
    transitions={
        source: {"letter": letter_target, "other": "q_dead"}
        for source, letter_target in (
            ("q0", "q1"),
            ("q1", "q1"),
            ("q_dead", "q_dead"),
        )
    },
    initial_state="q0",
    final_states={"q1"},
)

# 2. Create a reliable simulation function.
# This loop correctly uses our "letter" and "other" categories.
def run_simulation(input_str):
    """
    Step the DFA through input_str one character at a time.

    Each character is first reduced to one of the DFA's two abstract input
    symbols ('letter' for 'a'..'z', 'other' for anything else) and the next
    state is then looked up in dfa.transitions.

    Returns True when the state reached after the last character is in
    dfa.final_states, False otherwise.
    """
    state = dfa.initial_state
    for ch in input_str:
        symbol = 'letter' if 'a' <= ch <= 'z' else 'other'
        state = dfa.transitions[state][symbol]
    return state in dfa.final_states

# 3. Run every sample string through the simulation and report the verdict
print("--- DFA Test Results ---")
test_cases = ["cat", "dog", "a", "zebra", "dog1", "1dog", "DogHouse", "Dog_house", " cats", ""]

for test_str in test_cases:
    result = "Accepted" if run_simulation(test_str) else "Not Accepted"
    print(f"Input: '{test_str}' -> Output: {result}")

# 4. Generate and save the visual diagram from the DFA object.
# NOTE(review): the message below assumes the library writes a PNG by default — confirm.
dfa.show_diagram(filename="dfa_diagram")

print("\n--- Visualization ---")
print("DFA diagram has been saved as 'dfa_diagram.png'")

--- DFA Test Results ---
Input: 'cat' -> Output: Accepted
Input: 'dog' -> Output: Accepted
Input: 'a' -> Output: Accepted
Input: 'zebra' -> Output: Accepted
Input: 'dog1' -> Output: Not Accepted
Input: '1dog' -> Output: Not Accepted
Input: 'DogHouse' -> Output: Not Accepted
Input: 'Dog_house' -> Output: Not Accepted
Input: ' cats' -> Output: Not Accepted
Input: '' -> Output: Not Accepted

--- Visualization ---
DFA diagram has been saved as 'dfa_diagram.png'


In [None]:
import pandas as pd

def analyze_noun(word):
    """Analyzes a noun to find its root form and grammatical features.

    Args:
        word: The noun to analyze. Anything other than an all-lowercase,
            purely alphabetic string is rejected.

    Returns:
        "<root>+N+SG" or "<root>+N+PL", or the literal "Invalid Word" when the
        input is rejected or looks like a malformed plural (e.g. "foxs").

    Known limitations (no lexicon is consulted):
        * Plurals whose stem ends in "se" are cut by the -es rule
          ("houses" -> "hous+N+PL").
        * Singulars ending in a single "s" are read as plurals
          ("bus" -> "bu+N+PL").
    """
    # Validate first: the dictionary lookup below hashes its argument, so an
    # unhashable value (e.g. a list) would raise TypeError instead of being
    # reported as invalid.
    if not isinstance(word, str) or not word.islower() or not word.isalpha():
        return "Invalid Word"

    # --- Irregular nouns take priority over the suffix rules ---
    irregular_nouns = {
        # Plural -> root+N+PL
        "men": "man+N+PL",
        "women": "woman+N+PL",
        "children": "child+N+PL",
        "teeth": "tooth+N+PL",
        "feet": "foot+N+PL",
        "mice": "mouse+N+PL",
        "geese": "goose+N+PL",
        "people": "person+N+PL",
        # Singular forms, listed for completeness
        "man": "man+N+SG",
        "woman": "woman+N+SG",
        "child": "child+N+SG",
        "tooth": "tooth+N+SG",
        "foot": "foot+N+SG",
        "mouse": "mouse+N+SG",
        "goose": "goose+N+SG",
        "person": "person+N+SG",
    }
    if word in irregular_nouns:
        return irregular_nouns[word]

    # --- Regular nouns ---
    # Sibilant stem + "es" (foxes, churches, dresses): strip the "es".
    if word.endswith(('shes', 'ches', 'xes', 'zes', 'ses')):
        return f"{word[:-2]}+N+PL"

    # "...ies" -> "...y" (skies -> sky); the bare string "ies" has no stem.
    if word.endswith('ies') and len(word) > 3:
        return f"{word[:-3]}y+N+PL"

    # A double "s" marks a singular (dress, glass), not a malformed plural,
    # and the single letter "s" has no root left to strip a plural marker from.
    if word.endswith('ss') or word == 's':
        return f"{word}+N+SG"

    if word.endswith('s'):
        # A sibilant stem must take "-es", so a bare "-s" here is malformed.
        if word.endswith(('chs', 'shs', 'xs', 'zs')):
            return "Invalid Word"
        return f"{word[:-1]}+N+PL"

    return f"{word}+N+SG"

def process_file_to_parquet(input_filename, output_filename):
    """Reads nouns from a file and saves their analysis to a compressed Parquet file.

    Args:
        input_filename: Path to a UTF-8 text file with one noun per line.
            Surrounding whitespace is stripped and blank lines are skipped.
        output_filename: Path of the Parquet file to write (snappy-compressed).

    Returns:
        None. Progress and errors are reported with print(); failures are
        reported rather than raised so the notebook keeps running.
    """
    try:
        with open(input_filename, 'r', encoding='utf-8') as f:
            nouns = [stripped for stripped in (line.strip() for line in f) if stripped]

        results = [{"noun": noun, "analysis": analyze_noun(noun)} for noun in nouns]

        # Name the columns explicitly so an input with no nouns still produces
        # a file with the expected two-column schema instead of no columns.
        df = pd.DataFrame(results, columns=["noun", "analysis"])
        df.to_parquet(output_filename, index=False, compression="snappy")
        print(f"✅ Saved {len(df)} nouns to '{output_filename}' (compressed with snappy)")

    except FileNotFoundError:
        print(f"Error: The file '{input_filename}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the analysis: read the nouns in brown_nouns.txt and write the results to noun_analysis.parquet
process_file_to_parquet("brown_nouns.txt", "noun_analysis.parquet")

✅ Saved 202793 nouns to 'noun_analysis.parquet' (compressed with snappy)


In [None]:
import pandas as pd

# Read the data from the Parquet file
df_results = pd.read_parquet("noun_analysis.parquet")

# Print the frame to check the contents; for a frame this large pandas prints a
# truncated view (the first and last rows) rather than every row.
print(df_results)

                  noun            analysis
0        investigation  investigation+N+SG
1              primary        primary+N+SG
2             election       election+N+SG
3             evidence       evidence+N+SG
4       irregularities   irregularity+N+PL
...                ...                 ...
202788           gauge          gauge+N+SG
202789          glance         glance+N+SG
202790          figure         figure+N+SG
202791          boucle         boucle+N+SG
202792           dress        Invalid Word

[202793 rows x 2 columns]


In [None]:
import pandas as pd

def analyze_noun(w):
    """Analyzes a noun to find its root form and grammatical features.

    Args:
        w: The noun to analyze. Anything other than an all-lowercase, purely
            alphabetic string is rejected.

    Returns:
        "<root>+N+SG" or "<root>+N+PL", or the literal "Invalid Word" when the
        input is rejected or looks like a malformed plural (e.g. "foxs").

    Known limitations (no lexicon is consulted):
        * Plurals whose stem ends in "se" are cut by the -es rule
          ("houses" -> "hous+N+PL").
        * Singulars ending in a single "s" are read as plurals
          ("bus" -> "bu+N+PL").
    """
    # Validate first: the dictionary lookup below hashes its argument, so an
    # unhashable value (e.g. a list) would raise TypeError instead of being
    # reported as invalid.
    if not isinstance(w, str) or not w.islower() or not w.isalpha():
        return "Invalid Word"

    # --- Irregular nouns take priority over the suffix rules ---
    irregular_nouns = {
        "men": "man+N+PL", "women": "woman+N+PL", "children": "child+N+PL",
        "teeth": "tooth+N+PL", "feet": "foot+N+PL", "mice": "mouse+N+PL",
        "geese": "goose+N+PL", "people": "person+N+PL", "man": "man+N+SG",
        "woman": "woman+N+SG", "child": "child+N+SG", "tooth": "tooth+N+SG",
        "foot": "foot+N+SG", "mouse": "mouse+N+SG", "goose": "goose+N+SG",
        "person": "person+N+SG",
    }
    if w in irregular_nouns:
        return irregular_nouns[w]

    # --- Regular nouns ---
    # Sibilant stem + "es" (foxes, churches, dresses): strip the "es".
    if w.endswith(('shes', 'ches', 'xes', 'zes', 'ses')):
        return f"{w[:-2]}+N+PL"

    # "...ies" -> "...y" (skies -> sky); the bare string "ies" has no stem.
    if w.endswith('ies') and len(w) > 3:
        return f"{w[:-3]}y+N+PL"

    # A double "s" marks a singular (dress, glass), not a malformed plural,
    # and the single letter "s" has no root left to strip a plural marker from.
    if w.endswith('ss') or w == 's':
        return f"{w}+N+SG"

    if w.endswith('s'):
        # A sibilant stem must take "-es", so a bare "-s" here is malformed.
        if w.endswith(('chs', 'shs', 'xs', 'zs')):
            return "Invalid Word"
        return f"{w[:-1]}+N+PL"

    return f"{w}+N+SG"

def process_file_to_csv(input_fn, output_fn):
    """Reads nouns from a file and saves their analysis to a CSV file.

    Args:
        input_fn: Path to a UTF-8 text file with one noun per line.
            Surrounding whitespace is stripped and blank lines are skipped.
        output_fn: Path of the CSV file to write (no index column).

    Returns:
        None. Progress and errors are reported with print(); failures are
        reported rather than raised so the notebook keeps running.
    """
    try:
        # Only the given input file is read; nothing else is consulted.
        with open(input_fn, 'r', encoding='utf-8') as f:
            nouns = [stripped for stripped in (line.strip() for line in f) if stripped]

        results = [{"noun": noun, "analysis": analyze_noun(noun)} for noun in nouns]

        # Name the columns explicitly so an input with no nouns still writes
        # the "noun,analysis" header; a header-less file cannot be parsed
        # back by pd.read_csv.
        df = pd.DataFrame(results, columns=["noun", "analysis"])
        df.to_csv(output_fn, index=False)

        print(f"✅ Saved {len(df)} nouns to '{output_fn}'")

    except FileNotFoundError:
        print(f"Error: The file '{input_fn}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# --- Run the analysis on the existing brown_nouns.txt and write noun_analysis.csv ---
process_file_to_csv("brown_nouns.txt", "noun_analysis.csv")

✅ Saved 202793 nouns to 'noun_analysis.csv'


In [None]:
# Read the CSV back from the same relative path it was written to above; a
# hard-coded '/content/...' prefix only resolves when the working directory
# happens to be /content (as on Colab).
df = pd.read_csv('noun_analysis.csv')
df

Unnamed: 0,noun,analysis
0,investigation,investigation+N+SG
1,primary,primary+N+SG
2,election,election+N+SG
3,evidence,evidence+N+SG
4,irregularities,irregularity+N+PL
...,...,...
202788,gauge,gauge+N+SG
202789,glance,glance+N+SG
202790,figure,figure+N+SG
202791,boucle,boucle+N+SG


In [None]:
import pandas as pd

"""
-----------------------------------------------------------------------
FORMAL FST DESIGN for Morphological Analysis of Nouns
-----------------------------------------------------------------------

This FST is designed with state-minimization techniques, using a central
'stem' state and branching only for special plural endings.

1.  Input Alphabet (Σ):
    - The set of lowercase English letters {a, b, c, ..., z}

2.  Output Alphabet (Γ):
    - Σ ∪ {+, N, S, G, P, L, ε} (where ε is the empty string)

3.  States (Q):
    - q0: Start state
    - q_stem: Main state for processing the word's root
    - q_s, q_x, q_z, q_sh, q_ch: States for sibilant endings
    - q_c: Intermediate state for detecting 'ch'
    - q_e: Remembers an 'e' has been seen after a sibilant
    - q_ies: Recognizes the '...ies' ending
    - q_final: The single, final accepting state

4.  Start State: q0

5.  Final State (F): {q_final}

6.  State Transition Table (δ):
| Current State | Input | Output    | Next State | Description                                               |
|:--------------|:------|:----------|:-----------|:----------------------------------------------------------|
| q0            | c_any | c_any     | q_stem     | Start processing the word.                                |
| q_stem        | s     | ε         | q_s        | Potential plural 's' or singular ending in 's'.           |
| q_stem        | x, z  | x, z      | q_x, q_z   | Sibilant ending found.                                    |
| q_stem        | c     | c         | q_c        | Potential 'ch' ending.                                    |
| q_stem        | i     | ε         | q_ies      | Potential 'ies' ending (e.g., ski/skies).               |
| q_stem        | c_any | c_any     | q_stem     | (Main Loop) Processing the root of the word.              |
| q_stem        | ε     | +N+SG     | q_final    | Word ends without a special marker (e.g., cat).           |
| q_s           | h     | sh        | q_sh       | 'sh' sibilant detected.                                   |
| q_s           | ε     | s+N+SG    | q_final    | It was a singular noun ending in 's' (e.g., bus).         |
| q_s           | ε     | +N+PL     | q_final    | It was a regular plural (e.g., bags).                     |
| q_c           | h     | h         | q_ch       | 'ch' sibilant detected.                                   |
| q_c           | c_any | c + c_any | q_stem     | False alarm; 'c' not followed by 'h'.                     |
| q_x,q_z,q_sh,q_ch | e   | ε         | q_e        | An 'e' appears after a sibilant (e.g., fox**e**...).      |
| q_x,q_z,q_sh,q_ch | ε   | +N+SG     | q_final    | Singular noun ending in a sibilant (e.g., fox).           |
| q_e           | s     | +N+PL     | q_final    | Plural '...es' confirmed (e.g., foxe**s**).                 |
| q_ies         | es    | y+N+PL    | q_final    | Remaining 'es' read: '...ies' confirmed (e.g., sk**ies**). |
| q_ies         | ε     | i+N+SG    | q_final    | Word was a singular ending in 'i' (e.g., ski).            |
-----------------------------------------------------------------------
"""


def analyze_noun(w):
    """
    Analyzes a noun to find its root form and grammatical features.
    This function is the practical implementation of the FST designed above.

    Args:
        w: The noun to analyze. Anything other than an all-lowercase, purely
            alphabetic string is rejected.

    Returns:
        "<root>+N+SG" or "<root>+N+PL", or the literal "Invalid Word" when the
        input is rejected or looks like a malformed plural (e.g. "foxs").

    Known limitations (no lexicon is consulted):
        * Plurals whose stem ends in "se" are cut by the -es rule
          ("houses" -> "hous+N+PL").
        * Singulars ending in a single "s" are read as plurals
          ("bus" -> "bu+N+PL").
    """
    # Validate first: the dictionary lookup below hashes its argument, so an
    # unhashable value (e.g. a list) would raise TypeError instead of being
    # reported as invalid.
    if not isinstance(w, str) or not w.islower() or not w.isalpha():
        return "Invalid Word"

    # --- Irregular nouns take priority over the suffix rules ---
    irregular_nouns = {
        # Irregular Plurals
        "men": "man+N+PL", "women": "woman+N+PL", "children": "child+N+PL",
        "teeth": "tooth+N+PL", "feet": "foot+N+PL", "mice": "mouse+N+PL",
        "geese": "goose+N+PL", "people": "person+N+PL",
        # Singular forms of irregulars
        "man": "man+N+SG", "woman": "woman+N+SG", "child": "child+N+SG",
        "tooth": "tooth+N+SG", "foot": "foot+N+SG", "mouse": "mouse+N+SG",
        "goose": "goose+N+SG", "person": "person+N+SG",
    }
    if w in irregular_nouns:
        return irregular_nouns[w]

    # --- Regular nouns ---
    # Sibilant stem + "es" (foxes, churches, dresses): strip the "es".
    if w.endswith(('shes', 'ches', 'xes', 'zes', 'ses')):
        return f"{w[:-2]}+N+PL"

    # "...ies" -> "...y" (skies -> sky); the bare string "ies" has no stem.
    if w.endswith('ies') and len(w) > 3:
        return f"{w[:-3]}y+N+PL"

    # A double "s" marks a singular, not a malformed plural. This general rule
    # replaces the hand-listed exceptions (dress, press, boss, mess) and also
    # covers every other such noun (glass, class, business, ...). The single
    # letter "s" has no root left to strip a plural marker from.
    if w.endswith('ss') or w == 's':
        return f"{w}+N+SG"

    if w.endswith('s'):
        # A sibilant stem must take "-es", so a bare "-s" here is malformed.
        if w.endswith(('chs', 'shs', 'xs', 'zs')):
            return "Invalid Word"
        return f"{w[:-1]}+N+PL"

    return f"{w}+N+SG"


def process_file_to_csv(input_fn, output_fn):
    """
    Reads nouns from an input file, analyzes them, and saves the
    results to a CSV file.

    Args:
        input_fn: Path to a UTF-8 text file with one noun per line.
            Surrounding whitespace is stripped and blank lines are skipped.
        output_fn: Path of the CSV file to write (no index column).

    Returns:
        None. Progress and errors are reported with print(); failures are
        reported rather than raised so the notebook keeps running.
    """
    try:
        # Only the given input file is read; nothing else is consulted.
        with open(input_fn, 'r', encoding='utf-8') as f:
            nouns = [stripped for stripped in (line.strip() for line in f) if stripped]

        results = [{"noun": noun, "analysis": analyze_noun(noun)} for noun in nouns]

        # Name the columns explicitly so an input with no nouns still writes
        # the "noun,analysis" header; a header-less file cannot be parsed
        # back by pd.read_csv.
        df = pd.DataFrame(results, columns=["noun", "analysis"])
        df.to_csv(output_fn, index=False)

        print(f"Saved {len(df)} nouns to '{output_fn}'")

    except FileNotFoundError:
        print(f"Error: The file '{input_fn}' was not found.")
        print(f"Please make sure '{input_fn}' is in the same folder as the script.")
    except Exception as e:
        print(f"An error occurred: {e}")


# --- Run the analysis on the existing brown_nouns.txt and write noun_analysis.csv ---
process_file_to_csv("brown_nouns.txt", "noun_analysis.csv")

Saved 202793 nouns to 'noun_analysis.csv'


In [None]:
# Read the CSV back from the same relative path it was written to above; a
# hard-coded '/content/...' prefix only resolves when the working directory
# happens to be /content (as on Colab).
df = pd.read_csv('noun_analysis.csv')
df

Unnamed: 0,noun,analysis
0,investigation,investigation+N+SG
1,primary,primary+N+SG
2,election,election+N+SG
3,evidence,evidence+N+SG
4,irregularities,irregularity+N+PL
...,...,...
202788,gauge,gauge+N+SG
202789,glance,glance+N+SG
202790,figure,figure+N+SG
202791,boucle,boucle+N+SG
