<a href="https://colab.research.google.com/github/ShovalBenjer/Natural_Language_Proccessing_NLP_Projects/blob/main/Trigram_Bigram_Unigram_English_Corpus_Wikipedia_484k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
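# NOTE: this cell depends on names defined in the code above it: the `re`, `math` and
# `Counter` imports, START_TOKEN / END_TOKEN, `file_path`, and the helpers
# preprocess_text(), build_ngram_counts() and calculate_ngram_probabilities().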

# --- 4. Calculate Sentence Probability (Refined to show Raw Probability) ---
# Floor applied before math.log so a non-positive probability never reaches it.
_MIN_PROB = 1e-10


def _floored_log(prob):
    """Return log(prob), substituting _MIN_PROB when prob is not positive."""
    if prob <= 0:
        prob = _MIN_PROB
    return math.log(prob)


def _chain_log_prob(tokens, order, ngram_probs, prefix_counts, vocab_size, prefix_key):
    """Sum the log probabilities of every `order`-gram in a padded token list.

    An n-gram whose stored probability is missing (or exactly 0.0) falls back
    to 1 / (count(prefix) + vocab_size); `prefix_key` maps the n-gram tuple to
    the key under which its prefix is stored in `prefix_counts`.
    """
    log_prob = 0.0
    for start in range(len(tokens) - order + 1):
        ngram = tuple(tokens[start:start + order])
        prob = ngram_probs.get(ngram, 0.0)
        if prob == 0.0:
            # vocab_size is at least 1 (see sentence_probability), so with
            # non-negative counts this denominator is always positive.
            prob = 1.0 / (prefix_counts.get(prefix_key(ngram), 0) + vocab_size)
        log_prob += _floored_log(prob)
    return log_prob


def sentence_probability(sentence, unigram_probs, bigram_probs, trigram_probs, unigram_counts, bigram_counts, unigram_prefix_counts):
    """
    Score a sentence under the unigram, bigram and trigram models and print a table.

    The sentence is lower-cased and tokenized with a word regex, then padded
    with START_TOKEN / END_TOKEN (two start markers for the trigram model, one
    for the bigram model; the unigram model scores the bare words only).

    Seen n-grams use the probability stored in the corresponding ``*_probs``
    mapping. Unseen n-grams (missing or stored as 0.0) fall back to
    ``1 / (count(prefix) + V)``, where V is the number of distinct unigrams;
    unseen unigrams fall back to ``1 / (total_tokens + V)``. This is the
    "Add-1 style" value for a zero count only -- seen n-grams are NOT
    re-estimated, so the result is not a true Laplace-smoothed distribution.

    Args:
        sentence: Raw sentence text.
        unigram_probs: Mapping from 1-tuples ``(word,)`` to probabilities.
        bigram_probs: Mapping from 2-tuples of tokens to probabilities.
        trigram_probs: Mapping from 3-tuples of tokens to probabilities.
        unigram_counts: Mapping whose length is used as the vocabulary size
            and whose values are summed to get the total token count.
        bigram_counts: Mapping from 2-tuples to counts; supplies the prefix
            count for the trigram fallback.
        unigram_prefix_counts: Mapping from bare token strings (not tuples)
            to counts; supplies the prefix count for the bigram fallback.

    Returns:
        None. Results are printed. A model that cannot be evaluated (empty
        count table) is reported as log probability ``-inf`` and raw
        probability ``0``. Nothing but a notice is printed when the sentence
        is empty or yields no tokens.

    NOTE(review): the bigram path pads with a single START_TOKEN while the
    trigram path uses two. If the training tokens also carry two start markers
    per sentence, P(w1 | START_TOKEN) is diluted by the (START, START) bigram
    -- confirm against the preprocessing code.
    """
    if not sentence or not sentence.strip():
        print("Cannot calculate probability for empty sentence.")
        return

    # Use the same tokenization as in preprocessing
    words = re.findall(r'\b\w+\'?\w*\b', sentence.lower())
    if not words:
        print(f"Sentence '{sentence}' resulted in no tokens after processing.")
        return

    results_log = {'unigram': -float('inf'), 'bigram': -float('inf'), 'trigram': -float('inf')}
    results_raw = {'unigram': 0.0, 'bigram': 0.0, 'trigram': 0.0}

    # Padded token sequences for the context-dependent models.
    tokens_bi = [START_TOKEN] + words + [END_TOKEN]
    tokens_tri = [START_TOKEN, START_TOKEN] + words + [END_TOKEN]

    # vocab_size is forced to >= 1, which keeps every fallback denominator positive.
    vocab_size = len(unigram_counts) if unigram_counts else 1
    total_uni_count = sum(unigram_counts.values()) if unigram_counts else 0

    # --- Unigram (simple independence model) ---
    if total_uni_count > 0:
        min_uni_prob = 1.0 / (total_uni_count + vocab_size)
        log_prob_uni = 0.0
        for word in words:
            log_prob_uni += _floored_log(unigram_probs.get((word,), min_uni_prob))
        results_log['unigram'] = log_prob_uni

    # --- Bigram model: prefix counts are keyed by the bare first token ---
    if unigram_prefix_counts:
        results_log['bigram'] = _chain_log_prob(
            tokens_bi, 2, bigram_probs, unigram_prefix_counts, vocab_size,
            lambda ngram: ngram[0],
        )

    # --- Trigram model: prefix counts are keyed by the leading bigram tuple ---
    if bigram_counts:
        results_log['trigram'] = _chain_log_prob(
            tokens_tri, 3, trigram_probs, bigram_counts, vocab_size,
            lambda ngram: ngram[:-1],
        )

    # Convert back to raw probabilities. A log probability is a sum of logs of
    # values <= 1 (assuming valid probabilities), so it is never positive and
    # math.exp cannot overflow; for very long sentences it silently underflows
    # to 0.0, which is why the log value is the one to compare.
    for model, log_prob in results_log.items():
        if log_prob != -float('inf'):
            results_raw[model] = math.exp(log_prob)

    # --- Print the results ---
    print(f"\n--- Probabilities for sentence: '{sentence}' ---")
    print("  Model      | Log Probability | Raw Probability (Approx.)")
    print("-------------|-----------------|---------------------------")
    print(f"  Unigram    | {results_log['unigram']:15.4f} | {results_raw['unigram']:.6e}")
    print(f"  Bigram     | {results_log['bigram']:15.4f} | {results_raw['bigram']:.6e}")
    print(f"  Trigram    | {results_log['trigram']:15.4f} | {results_raw['trigram']:.6e}")
    print("  (Using basic Add-1 smoothing for unseen n-grams)")
    print("  (Note: Raw probabilities are often extremely small; log probabilities are standard for comparison)")

# --- Main Execution ---
# Builds the unigram/bigram/trigram models from the corpus at `file_path`, prints
# sample counts and probabilities, then scores example and user-entered sentences
# with sentence_probability().

# Pipeline: tokenize the corpus, count 1/2/3-grams, turn the counts into MLE
# probabilities, print samples, then score sentences. The code is deliberately
# kept at module level (not wrapped in a function) so every variable it defines
# remains a global, as before.
print("--- Starting N-gram Model Building ---")

# 1. Preprocess Text
tokens = preprocess_text(file_path)

if tokens:
    # 2. Count N-grams
    print("\n--- Counting N-grams ---")
    unigram_counts = build_ngram_counts(tokens, 1)
    bigram_counts = build_ngram_counts(tokens, 2)
    trigram_counts = build_ngram_counts(tokens, 3)

    if not unigram_counts or not bigram_counts or not trigram_counts:
        print("\nError: Failed to generate n-gram counts. Cannot proceed.")
    else:
        # From here on all three count tables are known to be non-empty, so no
        # further emptiness checks are needed before using them.
        print(f"Total Unigrams (tokens): {sum(unigram_counts.values())}")
        print(f"Unique Unigrams: {len(unigram_counts)}")
        print(f"Total Bigrams: {sum(bigram_counts.values())}")
        print(f"Unique Bigrams: {len(bigram_counts)}")
        print(f"Total Trigrams: {sum(trigram_counts.values())}")
        print(f"Unique Trigrams: {len(trigram_counts)}")

        # --- Display Sample Counts ---
        print("\n--- Sample N-gram Counts ---")
        print("Top 5 Unigrams:", unigram_counts.most_common(5))
        print("Top 5 Bigrams:", bigram_counts.most_common(5))
        print("Top 5 Trigrams:", trigram_counts.most_common(5))

        # 3. Calculate Probabilities (MLE)
        print("\n--- Calculating Probabilities (MLE) ---")

        total_token_count_for_unigrams = sum(unigram_counts.values())
        # Bigram prefix counts are keyed by the bare token string (not a
        # 1-tuple); sentence_probability looks prefixes up the same way.
        unigram_prefix_counts_for_bigrams = Counter(tokens)

        unigram_probs = calculate_ngram_probabilities(unigram_counts, None, total_token_count_for_unigrams, n=1)
        bigram_probs = calculate_ngram_probabilities(bigram_counts, unigram_prefix_counts_for_bigrams, None, n=2)
        trigram_probs = calculate_ngram_probabilities(trigram_counts, bigram_counts, None, n=3)

        if not unigram_probs or not bigram_probs or not trigram_probs:
            print("\nError: Failed to calculate probabilities. Cannot proceed.")
        else:
            # --- Display Sample Probabilities ---
            print("\n--- Sample N-gram Probabilities (MLE) ---")

            # Unigrams
            print("Unigram Probabilities:")
            sample_unigrams = [ug for ug, _ in unigram_counts.most_common(5)]
            for ug in sample_unigrams:
                # .get avoids a KeyError if a probability is somehow missing
                print(f"  P({ug[0]}) = {unigram_probs.get(ug, 0.0):.6f}")

            # Bigrams
            print("\nBigram Probabilities:")
            sample_bigrams = [bg for bg, _ in bigram_counts.most_common(5)]
            for bg in sample_bigrams:
                prefix_uni = bg[0]
                print(f"  P({bg[1]} | {prefix_uni}) = {bigram_probs.get(bg, 0.0):.6f}  (Count({bg})={bigram_counts.get(bg,0)}, Count({prefix_uni})={unigram_prefix_counts_for_bigrams.get(prefix_uni, 0)})")

            # Trigrams
            print("\nTrigram Probabilities:")
            sample_trigrams = [tg for tg, _ in trigram_counts.most_common(5)]
            for tg in sample_trigrams:
                prefix_bi = tg[:-1]
                print(f"  P({tg[2]} | {tg[0]}, {tg[1]}) = {trigram_probs.get(tg, 0.0):.6f}  (Count({tg})={trigram_counts.get(tg,0)}, Count({prefix_bi})={bigram_counts.get(prefix_bi, 0)})")

            print("\n--- Model Building Complete ---")

            # The model tables every sentence_probability call needs, in the
            # positional order of its signature after `sentence`.
            ngram_model_args = (
                unigram_probs,
                bigram_probs,
                trigram_probs,
                unigram_counts,
                bigram_counts,
                unigram_prefix_counts_for_bigrams,
            )

            # --- Calculate Probability for Example Sentences ---
            print("\n--- Calculating Probabilities for Example Sentences ---")
            test_sentence_1 = "the government last july called the energy sector" # Likely exists
            test_sentence_2 = "this is a completely random sentence" # Likely has unseen n-grams
            sentence_probability(test_sentence_1, *ngram_model_args)
            sentence_probability(test_sentence_2, *ngram_model_args)

            # --- USER INPUT LOOP ---
            print("\n--- Calculate Probability for User Input ---")
            while True:
                try:
                    user_sentence = input("Enter a sentence to calculate its probability (or press Enter to quit): ")
                    if not user_sentence.strip():
                        print("Exiting.")
                        break
                    # sentence_probability prints its own results
                    sentence_probability(user_sentence, *ngram_model_args)
                except EOFError:
                    print("\nExiting due to EOF.")
                    break
                except KeyboardInterrupt:
                    print("\nExiting due to user interrupt.")
                    break
                except Exception as e:
                    # Best-effort: report the problem and keep prompting.
                    print(f"\nAn error occurred during input/calculation: {e}")

# --- Fallback if issues occurred earlier ---
else:
    print("\nCould not process the file or build models. Input loop skipped.")

--- Starting N-gram Model Building ---
Processing 112 lines...
Found 3281 non-empty sentences.
Total tokens (including boundary markers): 75779
Sample tokens: ['<s>', '<s>', 'the', 'government', 'last', 'july', 'called', 'the', 'energy', 'sector', 'debt', 'situation', 'a', 'state', 'of', 'emergency', '</s>', '<s>', '<s>', 'this', 'was', 'during', 'the', 'midyear', 'budget'] ... ['your', 'hands', 'with', 'soap', 'and', 'water', 'cover', 'your', 'mouth', 'when', 'you', 'cough', 'or', 'sneeze', 'stay', 'home', 'from', 'work', 'until', 'you', 've', 'gone', 'one', 'day', '</s>']

--- Counting N-grams ---
Total Unigrams (tokens): 75779
Unique Unigrams: 9692
Total Bigrams: 75778
Unique Bigrams: 42652
Total Trigrams: 75777
Unique Trigrams: 60325

--- Sample N-gram Counts ---
Top 5 Unigrams: [(('<s>',), 6562), (('the',), 4050), (('</s>',), 3281), (('to',), 1877), (('of',), 1775)]
Top 5 Bigrams: [(('<s>', '<s>'), 3281), (('</s>', '<s>'), 3280), (('of', 'the'), 442), (('<s>', 'the'), 418), (('in'