In [1]:
def analyze_text_corpus(corpus):

  words = [word.lower() for word in corpus.split() if word.isalpha()]
  #unigrams
  unigrams = {}
  for word in words:
    unigrams[word] = unigrams.get(word, 0) + 1

  #Bigrams
  bigrams = {}
  for i in range(len(words) - 1):
    first_word = words[i]
    second_word = words[i + 1]
    bigram = (first_word, second_word)
    bigrams[bigram] = bigrams.get(bigram, 0) + 1

  #Trigrams
  trigrams = {}
  for i in range(len(words) - 2):
    first_word = words[i]
    second_word = words[i + 1]
    third_word = words[i + 2]
    trigram = (first_word, second_word, third_word)
    trigrams[trigram] = trigrams.get(trigram, 0) + 1

  #Bigram Probabilities
  bigram_probs = {}
  total_words = len(words) - 1  
  for bigram, count in bigrams.items():
    first_word, second_word = bigram
    first_word_count = unigrams.get(first_word, 0)
    bigram_probs[bigram] = (count + 1) / (first_word_count + len(unigrams))  
  #Next Word Prediction
  def next_word_prediction(bigram):
    if bigram not in bigram_probs:
      return None 

    next_word_candidates = [(word, prob) for word, prob in bigram_probs.items() if word[0] == bigram[1]]
    return max(next_word_candidates, key=lambda x: x[1])[0]

  return {
      "unigrams": unigrams,
      "bigrams": bigrams,
      "trigrams": trigrams,
      "bigram_probs": bigram_probs,
      "next_word_prediction": next_word_prediction,
  }


corpus = "This is a sample text corpus to analyze for n-grams and next word prediction."
results = analyze_text_corpus(corpus)

print("1. Unigrams:")
for word, count in results["unigrams"].items():
  print(f"{word}: {count}")

print("\n2. Bigrams:")
for bigram, count in results["bigrams"].items():
  print(f"{bigram}: {count}")

print("\n3. Trigrams:")
for trigram, count in results["trigrams"].items():
  print(f"{trigram}: {count}")

print("\n4. Bigram Probabilities:")
for bigram, prob in results["bigram_probs"].items():
  print(f"{bigram}: {prob:.4f}") 

bigram = ("this", "is")
next_word = results["next_word_prediction"](bigram)
print("\n5. Next word prediction for '{bigram}': {next_word}")

1. Unigrams:
this: 1
is: 1
a: 1
sample: 1
text: 1
corpus: 1
to: 1
analyze: 1
for: 1
and: 1
next: 1
word: 1

2. Bigrams:
('this', 'is'): 1
('is', 'a'): 1
('a', 'sample'): 1
('sample', 'text'): 1
('text', 'corpus'): 1
('corpus', 'to'): 1
('to', 'analyze'): 1
('analyze', 'for'): 1
('for', 'and'): 1
('and', 'next'): 1
('next', 'word'): 1

3. Trigrams:
('this', 'is', 'a'): 1
('is', 'a', 'sample'): 1
('a', 'sample', 'text'): 1
('sample', 'text', 'corpus'): 1
('text', 'corpus', 'to'): 1
('corpus', 'to', 'analyze'): 1
('to', 'analyze', 'for'): 1
('analyze', 'for', 'and'): 1
('for', 'and', 'next'): 1
('and', 'next', 'word'): 1

4. Bigram Probabilities:
('this', 'is'): 0.1538
('is', 'a'): 0.1538
('a', 'sample'): 0.1538
('sample', 'text'): 0.1538
('text', 'corpus'): 0.1538
('corpus', 'to'): 0.1538
('to', 'analyze'): 0.1538
('analyze', 'for'): 0.1538
('for', 'and'): 0.1538
('and', 'next'): 0.1538
('next', 'word'): 0.1538

5. Next word prediction for '{bigram}': {next_word}
