In [37]:
'''
BI-GRAM MODEL

<Training Corpus>
The Arabian nights are very popular stories. These are the fairy tales of the east. 
The stories of the Arabian Nights are translated in many languages.

<Test Sentence>
The Arabian Nights are the fairy tales of the east.

[copy these sentences into the input fields]
'''

# import necessary modules
from collections import defaultdict, Counter

def preprocess_and_generate_sentences(corpus):
    """Split raw text into lowercase, tokenized sentences for the bigram model.

    Alphabetic characters are kept (lowercased), whitespace separates words,
    and any of ``.?;!`` ends the current sentence. Every other character
    (digits, commas, hyphens, ...) is dropped. Each sentence is returned as a
    list of words prefixed with the start-of-sentence marker ``'<!>'``.

    Args:
        corpus: Text containing zero or more sentences.

    Returns:
        A list of token lists, one per non-empty sentence, for example
        ``[['<!>', 'the', 'east']]``. Text after the last terminator is
        returned as a final sentence rather than being discarded.
    """
    sentences = []
    pending = []  # characters of the sentence currently being read

    def flush():
        # split() without an argument collapses runs of whitespace, so
        # repeated spaces never produce empty-string tokens.
        words = ''.join(pending).split()
        pending.clear()
        # Skip empty sentences such as the ones between the dots of "...".
        if words:
            sentences.append(['<!>'] + words)

    for char in corpus:
        if char.isalpha():
            pending.append(char.lower())
        elif char.isspace():
            # Tabs and newlines separate words just like a plain space.
            pending.append(' ')
        elif char in '.?;!':
            flush()
    # Keep a trailing sentence that has no closing punctuation.
    flush()
    return sentences

# TRAINING

training_corpus = input('Enter sentences/paragraph to TRAIN the Model\n>>> ')

train_sentences = preprocess_and_generate_sentences(training_corpus)
print('\nPREPROCESSED TRAINING INPUT\n\n', train_sentences)

train_size = len(train_sentences)

# Unigram counts: occurrences of every token, the '<!>' start marker included.
word_freqs = defaultdict(int)
for tokens in train_sentences:
    for token in tokens:
        word_freqs[token] += 1

# Bigram counts: next_words_table[w1][w2] is how often w2 directly follows w1.
next_words_table = defaultdict(Counter)
for tokens in train_sentences:
    for current, following in zip(tokens, tokens[1:]):
        next_words_table[current][following] += 1

print('\nNEXT WORDS TABLE WITH FREQUENCIES\n')
for head, followers in next_words_table.items():
    print(f'{word_freqs[head]:2d} times(s) [{head}] -> {dict(followers)}')

# TESTING

test_corpus = input('\nEnter a sentence to TEST the Model\n>>> ')

test_sentence = preprocess_and_generate_sentences(test_corpus)
print('\nPREPROCESSED TEST INPUT\n\n', test_sentence)

# Score each test sentence as the product of its bigram probabilities,
# P(w2|w1) = count(w1 w2) / count(w1), using the counts gathered in training.
for tokens in test_sentence:
    expression, probabilities, ans = [], [], 1
    print(f"\nPROBABILITY({' '.join(tokens)}) =")
    for previous, current in zip(tokens, tokens[1:]):
        expression.append(f'P({current}|{previous})')
        # Use .get() instead of indexing the defaultdicts: indexing would
        # insert unseen test words into the trained tables, and a word that
        # never occurred in training has count 0, which would otherwise raise
        # ZeroDivisionError. Such a bigram gets probability 0.
        previous_count = word_freqs.get(previous, 0)
        pair_count = next_words_table.get(previous, {}).get(current, 0)
        p = pair_count / previous_count if previous_count else 0.0
        probabilities.append(round(p, 2))
        ans *= p

    print('=>', '.'.join(expression))
    print('=>', ' X '.join(map(str, probabilities)), '\n=>', round(ans, 4))

Enter sentences/paragraph to TRAIN the Model
>>> The Arabian nights are very popular stories. These are the fairy tales of the east.  The stories of the Arabian Nights are translated in many languages.

PREPROCESSED TRAINING INPUT

 [['<!>', 'the', 'arabian', 'nights', 'are', 'very', 'popular', 'stories'], ['<!>', 'these', 'are', 'the', 'fairy', 'tales', 'of', 'the', 'east'], ['<!>', 'the', 'stories', 'of', 'the', 'arabian', 'nights', 'are', 'translated', 'in', 'many', 'languages']]

NEXT WORDS TABLE WITH FREQUENCIES

 3 times(s) [<!>] -> {'the': 2, 'these': 1}
 5 times(s) [the] -> {'arabian': 2, 'fairy': 1, 'east': 1, 'stories': 1}
 2 times(s) [arabian] -> {'nights': 2}
 2 times(s) [nights] -> {'are': 2}
 3 times(s) [are] -> {'very': 1, 'the': 1, 'translated': 1}
 1 times(s) [very] -> {'popular': 1}
 1 times(s) [popular] -> {'stories': 1}
 1 times(s) [these] -> {'are': 1}
 1 times(s) [fairy] -> {'tales': 1}
 1 times(s) [tales] -> {'of': 1}
 2 times(s) [of] -> {'the': 2}
 2 times(s) [s