In [2]:
# Step 1: Import NLTK and define the grammar
import nltk
from nltk import PCFG

# Define the toy grammar with PCFG.fromstring (the vocabulary includes "cat",
# "chased" and "dog"). The bracketed number after each expansion is its
# probability; the alternatives listed for each left-hand side sum to 1.0.
# Every rule is either a pair of non-terminals or a single quoted word, which
# are the only two rule shapes the CYK function below inspects.
grammar = PCFG.fromstring("""
S -> NP VP [1.0]
VP -> V NP [0.7] | VP PP [0.3]
NP -> Det N [0.5] | NP PP [0.2] | 'John' [0.3]
PP -> P NP [1.0]
V -> 'saw' [0.2] | 'ate' [0.4] | 'chased' [0.4]
Det -> 'the' [0.6] | 'a' [0.4]
N -> 'man' [0.3] | 'telescope' [0.3] | 'cat' [0.2] | 'dog' [0.2]
P -> 'with' [0.4] | 'in' [0.6]
""")

# Step 2: Define the CYK parsing function
def cyk_parse_with_probability(grammar, sentence):
    """Return the probability of the most likely parse of ``sentence``.

    Runs a probabilistic CYK chart parse that keeps, for every span and every
    non-terminal, only the highest-probability derivation found (Viterbi
    style), and reports the value stored for the grammar's start symbol over
    the whole sentence.

    Args:
        grammar: Object exposing ``productions()`` and ``start()``; each
            production must expose ``lhs()``, ``rhs()`` and ``prob()``.
            Only two kinds of rule are used: rules whose right-hand side is
            exactly one token equal to a word of the sentence, and rules
            whose right-hand side has exactly two symbols. Any other rule is
            ignored.
        sentence: Sequence of already-tokenised words. No splitting is done
            here, so a plain string is treated as a sequence of characters.

    Returns:
        The best probability with which the start symbol derives the whole
        sentence, or ``0.0`` when no such derivation exists (including when
        the sentence is empty).
    """
    n = len(sentence)
    if n == 0:
        # There is no chart cell to inspect for an empty sentence.
        return 0.0

    productions = grammar.productions()

    # Index binary rules by the names of their two right-hand-side symbols so
    # the inner loop does a dictionary lookup instead of scanning the grammar.
    # Symbols are compared by their string form throughout.
    binary_rules = {}
    for prod in productions:
        rhs = prod.rhs()
        if len(rhs) == 2:
            key = (str(rhs[0]), str(rhs[1]))
            binary_rules.setdefault(key, []).append(
                (str(prod.lhs()), prod.prob())
            )

    # chart[i][j] maps a non-terminal name to the best probability with which
    # it derives words i..j (inclusive). Only cells with i <= j are used.
    chart = [[{} for _ in range(n)] for _ in range(n)]

    # Length-1 spans: lexical rules whose right-hand side is exactly the word.
    for i, word in enumerate(sentence):
        cell = chart[i][i]
        for prod in productions:
            if prod.rhs() == (word,):
                lhs = str(prod.lhs())
                prob = prod.prob()
                if lhs not in cell or prob > cell[lhs]:
                    cell[lhs] = prob

    # Longer spans, shortest first, so both halves of every split are final
    # before they are combined.
    for length in range(2, n + 1):
        for i in range(n - length + 1):
            j = i + length - 1
            cell = chart[i][j]
            for k in range(i, j):  # left half is i..k, right half is k+1..j
                right_cell = chart[k + 1][j]
                for left, left_prob in chart[i][k].items():
                    for right, right_prob in right_cell.items():
                        for lhs, rule_prob in binary_rules.get((left, right), ()):
                            prob = left_prob * right_prob * rule_prob
                            # Strict '>' keeps the first-found derivation on ties.
                            if lhs not in cell or prob > cell[lhs]:
                                cell[lhs] = prob

    # The sentence is accepted only if the start symbol spans all of it.
    return chart[0][n - 1].get(str(grammar.start()), 0.0)

# Step 3: Parse the sentence and report the probability of its best parse
sentence = "the cat chased the dog".split()
probability = cyk_parse_with_probability(grammar, sentence)
# Join the tokens back together so the message shows the sentence as text
# rather than the repr of a Python list.
sentence_text = " ".join(sentence)
# A probability of zero means the parser found no derivation from the start symbol.
if probability > 0:
    print(f'The sentence "{sentence_text}" is grammatically correct with a probability of {probability:.10f}')
else:
    print(f'The sentence "{sentence_text}" is NOT grammatically correct.')

The sentence "['the', 'cat', 'chased', 'the', 'dog']" is grammatically correct with a probability of 0.0010080000
