1. Pure Symbolic Approach – English
(Adjectives are only prefixed to nouns.)

In [2]:
import nltk
from nltk import CFG, ChartParser

# Define a simple CFG for English without inline comments.
english_grammar = CFG.fromstring("""
S -> NP VP
NP -> DET N
NP -> ADJ NP
VP -> V NP
VP -> V NP ADV
DET -> 'the' | 'a'
ADJ -> 'big' | 'small'
N -> 'dog' | 'cat'
V -> 'runs' | 'jumps'
ADV -> 'quickly'
""")

# Sample sentence (tokenized)
sentence = "the big dog runs quickly".split()

# Create the parser and parse the sentence
parser = ChartParser(english_grammar)
parse_trees = list(parser.parse(sentence))

if parse_trees:
    print("English Parse Tree:")
    parse_trees[0].pretty_print()
else:
    print("No valid parse found for the English sentence.")


No valid parse found for the English sentence.


2. Pure Symbolic Approach – French
(Adjectives are only suffixed to nouns.)

In [4]:
import nltk
from nltk import CFG, ChartParser

# Define a CFG for French with adjectives following the noun.
french_grammar = CFG.fromstring("""
S -> NP VP
NP -> DET N
NP -> NP ADJ
VP -> V NP
VP -> V NP ADV
DET -> 'le' | 'la'
ADJ -> 'grand' | 'petit'
N -> 'chien' | 'chat'
V -> 'court' | 'saute'
ADV -> 'rapidement'
""")

# Sample French sentence (tokenized)
sentence_fr = "le chien grand court rapidement".split()

# Parse the French sentence
parser_fr = ChartParser(french_grammar)
parse_trees_fr = list(parser_fr.parse(sentence_fr))

if parse_trees_fr:
    print("French Parse Tree:")
    parse_trees_fr[0].pretty_print()
else:
    print("No valid parse found for the French sentence.")


No valid parse found for the French sentence.


3. Pure Symbolic Approach – German
(Adjectives are only prefixed to nouns.)

In [6]:
import nltk
from nltk import CFG, ChartParser

# Define a CFG for German with adjectives preceding nouns.
german_grammar = CFG.fromstring("""
S -> NP VP
NP -> DET N
NP -> ADJ NP
VP -> V NP
VP -> V NP ADV
DET -> 'der' | 'die'
ADJ -> 'groß' | 'klein'
N -> 'hund' | 'katze'
V -> 'läuft' | 'springt'
ADV -> 'schnell'
""")

# Sample German sentence (tokenized)
sentence_de = "der groß hund läuft schnell".split()

# Parse the German sentence
parser_de = ChartParser(german_grammar)
parse_trees_de = list(parser_de.parse(sentence_de))

if parse_trees_de:
    print("German Parse Tree:")
    parse_trees_de[0].pretty_print()
else:
    print("No valid parse found for the German sentence.")


No valid parse found for the German sentence.


4. Pure Symbolic Approach – Italian
(Adjectives are only suffixed to nouns.)

In [7]:
import nltk
from nltk import CFG, ChartParser

# Define a CFG for Italian with adjectives following the noun.
italian_grammar = CFG.fromstring("""
S -> NP VP
NP -> DET N
NP -> NP ADJ
VP -> V NP
VP -> V NP ADV
DET -> 'il' | 'la'
ADJ -> 'grande' | 'piccolo'
N -> 'cane' | 'gatto'
V -> 'corre' | 'salta'
ADV -> 'velocemente'
""")

# Sample Italian sentence (tokenized)
sentence_it = "il cane grande corre velocemente".split()

# Parse the Italian sentence
parser_it = ChartParser(italian_grammar)
parse_trees_it = list(parser_it.parse(sentence_it))

if parse_trees_it:
    print("Italian Parse Tree:")
    parse_trees_it[0].pretty_print()
else:
    print("No valid parse found for the Italian sentence.")


No valid parse found for the Italian sentence.


In [None]:
5. Pure ML Approach – One-Step Markov Chain for POS Tagging
(Training on the Brown corpus to forecast the next tag.)

In [9]:
import nltk
from nltk.corpus import brown
from nltk import bigrams, ConditionalFreqDist

# Download necessary corpora (if not already downloaded)
nltk.download('brown')
nltk.download('universal_tagset')

# Use the Brown corpus with a universal tagset.
tagged_words = brown.tagged_words(tagset='universal')
# Extract only the POS tags
tags = [tag for (_, tag) in tagged_words]

# Build tag bigrams and a conditional frequency distribution
tag_bigrams = list(bigrams(tags))
cfd = ConditionalFreqDist(tag_bigrams)

def predict_next_tag(current_tag):
    """
    Given a current POS tag, predict the most likely next tag.
    """
    if current_tag in cfd:
        return cfd[current_tag].max()
    else:
        return None

def predict_tag_sequence(sentence_tokens, start_tag='DET'):
    """
    Predict a sequence of POS tags for the tokenized sentence using a one-step Markov chain.
    The start_tag is provided (here assumed as 'DET' for demonstration).
    """
    predicted_tags = [start_tag]
    current_tag = start_tag
    # For each subsequent token, use the most common successor tag.
    for _ in range(1, len(sentence_tokens)):
        next_tag = predict_next_tag(current_tag)
        if not next_tag:
            next_tag = start_tag
        predicted_tags.append(next_tag)
        current_tag = next_tag
    return predicted_tags

# Sample sentence (tokenized) for POS prediction.
sentence_ml = "the quick brown fox jumps over the lazy dog".split()
predicted_sequence = predict_tag_sequence(sentence_ml, start_tag='DET')

print("Sentence:", sentence_ml)
print("Predicted POS Tag Sequence:", predicted_sequence)


[nltk_data] Downloading package brown to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/majidtavakoli/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Sentence: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Predicted POS Tag Sequence: ['DET', 'NOUN', '.', 'DET', 'NOUN', '.', 'DET', 'NOUN', '.']
