# Morphological Analysis in NLP using Python
This notebook demonstrates how to implement:
- Morphological Parsing
- Finite State Automata (FSA)
- Finite State Transducers (FST)
- Orthographic Rules

Each topic comes with a worked example, and the final section integrates them all into a single Morphological Analyzer.

In [1]:
# Install required package
!pip install nltk




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# # Download required resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [3]:
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

In [4]:
data = "good morning 456 north america welcome to class 12345 www.google.com https://colab.research.google.com/drive/1isvr5XIR5xkumEhc6KwyKnTSBfzxsQeS#scrollTo=pXxFbGS3qGp2"
len(data.split(" "))

11

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kparo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
from nltk.tokenize import word_tokenize
words = word_tokenize(data)
len(words)

15

In [7]:
words

['good',
 'morning',
 '456',
 'north',
 'america',
 'welcome',
 'to',
 'class',
 '12345',
 'www.google.com',
 'https',
 ':',
 '//colab.research.google.com/drive/1isvr5XIR5xkumEhc6KwyKnTSBfzxsQeS',
 '#',
 'scrollTo=pXxFbGS3qGp2']

In [8]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\kparo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [9]:
nltk.pos_tag(words)

[('good', 'JJ'),
 ('morning', 'NN'),
 ('456', 'CD'),
 ('north', 'NN'),
 ('america', 'JJ'),
 ('welcome', 'NN'),
 ('to', 'TO'),
 ('class', 'NN'),
 ('12345', 'CD'),
 ('www.google.com', 'NN'),
 ('https', 'NN'),
 (':', ':'),
 ('//colab.research.google.com/drive/1isvr5XIR5xkumEhc6KwyKnTSBfzxsQeS', 'JJ'),
 ('#', '#'),
 ('scrollTo=pXxFbGS3qGp2', 'NN')]

In [10]:
# Chunking = grouping POS-tagged tokens into phrases (e.g. noun phrases)
statememt1 = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(statememt1)
tagged = nltk.pos_tag(tokens)
tagged

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [None]:
from nltk.chunk import RegexpParser

# Sentence structure: S -> NP VP.
# Chunk-grammar quantifiers: `?` = optional (zero or one occurrence),
# `*` = zero or more occurrences.
# So an NP here is: an optional determiner, any number of adjectives, then a noun.
expression = "NP: {<DT>?<JJ>*<NN>}"

cp = RegexpParser(expression)
result = cp.parse(tagged)  # `tagged` is the POS-tagged sentence from the previous cell
print(result)
result.draw()  # opens the parse tree in a separate GUI window

(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN))


In [None]:
import re

# URLs start with an explicit scheme ("http://", "https://") or with "www.".
# The dots and slashes are matched literally so ordinary words such as
# "http is a protocol" are not mistaken for links. Raw strings keep "\S" and
# "\s" from being treated as (invalid) Python string escapes.
URL_PATTERN = re.compile(r"(?:https?://|www\.)\S+")

urls = URL_PATTERN.findall(data)           # the links found in the text, kept for inspection
data1 = URL_PATTERN.sub("", data)          # text with the links removed
data2 = re.sub(r"[^a-zA-Z]", " ", data1)   # digits/punctuation -> spaces, letters only remain
re.sub(r"\s+", " ", data2)                 # collapse runs of whitespace into single spaces

## ✅ 1. Morphological Parsing

In [13]:
# Function to demonstrate morphological parsing
def morphological_parsing(word):
    """Print the stem and the verb/noun lemmas of ``word``.

    The word is passed through NLTK's ``PorterStemmer`` and through
    ``WordNetLemmatizer`` twice (once with ``pos="v"``, once with
    ``pos="n"``). Each result is printed on its own line; nothing is
    returned.
    """
    wn_lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()

    # Each report line pairs its label with a lazily evaluated analysis, so
    # every result is computed right before it is printed.
    report = (
        ("Stemmed:", lambda: porter.stem(word)),
        ("Lemmatized (verb):", lambda: wn_lemmatizer.lemmatize(word, pos="v")),
        ("Lemmatized (noun):", lambda: wn_lemmatizer.lemmatize(word, pos="n")),
    )

    print(f"Word: {word}")
    for label, analyse in report:
        print(label, analyse())

# Test the function
morphological_parsing("running")

Word: running
Stemmed: run
Lemmatized (verb): run
Lemmatized (noun): running


## ✅ 2. Finite State Automata (FSA)

In [14]:
# Simple FSA using regular expressions
def fsa_plural_recognizer(word):
    """Return True when ``word`` has the surface shape of a regular plural.

    The accepted language is: one or more ASCII letters followed by a final
    "s". This is a purely orthographic check, so singular words that happen
    to end in "s" (e.g. "bus") are accepted as well.

    Args:
        word: The string to test.

    Returns:
        bool: True if the whole string matches the pattern, False otherwise.
    """
    import re  # local import keeps this cell runnable on its own

    # fullmatch() anchors both ends strictly. A "^...$" pattern used with
    # match() would also accept "dogs\n", because "$" matches just before a
    # trailing newline.
    return re.fullmatch(r"[a-zA-Z]+s", word) is not None

# Test FSA
print(fsa_plural_recognizer("dogs"))  # True
print(fsa_plural_recognizer("dog"))   # False

True
False


## ✅ 3. Finite State Transducer (FST)

In [15]:
# FST to separate base word and suffix
def fst_morphological_transducer(word):
    """Split ``word`` into a ``(base, suffix)`` pair for the "-ing" suffix.

    If the word ends in "ing", the suffix is stripped and a doubled final
    letter of the remaining stem is collapsed ("running" -> "run"). Words
    that do not end in "ing" (or that consist of "ing" alone) are returned
    unchanged with an empty suffix.

    Note: the doubling rule is naive and also shortens stems whose double
    letter is part of the base (e.g. "falling" -> "fal").

    Args:
        word: The surface form to analyse.

    Returns:
        tuple[str, str]: ``(base, "ing")`` when the suffix was split off,
        otherwise ``(word, "")``.
    """
    suffix = "ing"
    if not word.endswith(suffix):
        return word, ""

    stem = word[: -len(suffix)]
    if not stem:
        # The word is just "ing": there is no base to split off.
        return word, ""

    # Undo consonant doubling ("runn" -> "run"). The length guard is needed
    # because a one-letter stem (e.g. "sing" -> "s") has no stem[-2] and an
    # unguarded lookup raises IndexError.
    if len(stem) >= 2 and stem[-1] == stem[-2]:
        stem = stem[:-1]
    return stem, suffix

# Test FST
base, suffix = fst_morphological_transducer("running")
print(f"Base: {base}, Suffix: {suffix}")

Base: run, Suffix: ing


## ✅ 4. Orthographic Rules

In [16]:
# Handle spelling changes when affixes are added
def apply_orthographic_rules(word, silent_e_bases=("make", "take", "love")):
    """Undo the spelling changes that adding "-ing" caused in ``word``.

    Rules are tried in this order and only for words ending in "ing":

    1. Silent-e restoration: "making" -> ("make", "ing"), for bases listed
       in ``silent_e_bases``.
    2. Stems ending in "y" keep the "y": "studying" -> ("study", "ing").
    3. Doubled final letter is collapsed: "running" -> ("run", "ing").

    Args:
        word: The surface form to analyse.
        silent_e_bases: Base forms that drop a final "e" before "-ing".
            Defaults to the small demo lexicon ("make", "take", "love").

    Returns:
        tuple[str, str]: ``(base, "ing")`` when a rule applied, otherwise
        ``(word, "")``.
    """
    # Every rule below is about the "-ing" suffix. Without this guard the
    # doubled-letter rule would also fire on unrelated words such as
    # "balloon" (-> "bal" + "ing").
    if not word.endswith("ing"):
        return word, ""

    base = word[:-3]

    # Rule 1: the base lost its silent "e" when "-ing" was attached.
    restored = base + "e"
    if restored in silent_e_bases:
        return restored, "ing"

    # Rule 2: "-ying" words keep the "y" in the base.
    if base.endswith("y"):
        return base, "ing"

    # Rule 3: the final consonant was doubled before "-ing".
    if len(base) >= 2 and base[-1] == base[-2]:
        return base[:-1], "ing"

    # No orthographic rule applies to this word.
    return word, ""

# Test orthographic rules
print(apply_orthographic_rules("making"))
print(apply_orthographic_rules("studying"))
print(apply_orthographic_rules("running"))

('make', 'ing')
('study', 'ing')
('run', 'ing')


## ✅ 5. Integrated Morphological Analyzer

In [17]:
# Combine FST and orthographic rules into one analyzer
def morphological_analyzer(word):
    """Analyse ``word`` into its base form and suffix.

    The orthographic rules are consulted first because they know about
    spelling changes (silent "e", "-ying", doubled letters). When none of
    them fires, the plain FST split is used instead, so that a regular
    form such as "walking" is reported as "walk" + "ing" rather than as
    base "walking" with suffix "ing".

    Args:
        word: The surface form to analyse.

    Returns:
        dict: A mapping with the keys "Original Word", "Base Form" and
        "Suffix".
    """
    base, suffix = apply_orthographic_rules(word)
    if not suffix:
        # An empty suffix means no orthographic rule applied; take both the
        # base and the suffix from the FST so the pair stays consistent.
        base, suffix = fst_morphological_transducer(word)

    return {
        "Original Word": word,
        "Base Form": base,
        "Suffix": suffix
    }

# Test the analyzer
print(morphological_analyzer("making"))
print(morphological_analyzer("studying"))
print(morphological_analyzer("running"))

{'Original Word': 'making', 'Base Form': 'make', 'Suffix': 'ing'}
{'Original Word': 'studying', 'Base Form': 'study', 'Suffix': 'ing'}
{'Original Word': 'running', 'Base Form': 'run', 'Suffix': 'ing'}
