In [None]:
import re

# Read the whole corpus file into memory as one string
with open('Pride.txt', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

# Preprocess the text
def preprocess_text(text):
    """Normalise raw text for tokenization.

    Keeps ASCII letters, digits, whitespace, hyphens and apostrophes (so words
    such as "can't" and "mother-in-law" survive), drops every other character,
    lower-cases the result and collapses each whitespace run to a single space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Step 1: discard anything outside the allowed character set
    allowed_only = re.sub(r'[^a-zA-Z0-9\s\-\']', '', text)
    # Step 2: case-fold
    lowered = allowed_only.lower()
    # Step 3: squeeze runs of whitespace (including newlines) into one space
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

# Run the cleaning step, then show only the first 500 characters for inspection
cleaned_text = preprocess_text(text)
preview_chars = cleaned_text[:500]
print(preview_chars)


pride and prejudice by jane austen ebd e-booksdirectorycom chapter 1 it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered the rightful property of some one or other of their daughters my dear mr bennet said his lady to him one day have you heard


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

nltk.download('punkt')  # Ensure you have the required tokenizer data

# Split the RAW text into sentences first and clean each sentence afterwards.
# preprocess_text() strips '.', '!' and '?', so running sent_tokenize() on
# cleaned_text gives the tokenizer no sentence boundaries to work with and the
# whole book comes back as a single "sentence".
sentences = []
for raw_sentence in sent_tokenize(text):
    cleaned_sentence = preprocess_text(raw_sentence)
    if cleaned_sentence:  # skip sentences that were nothing but punctuation
        sentences.append(cleaned_sentence)

# Tokenize each cleaned sentence into words
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
print(tokenized_sentences[:3])  # Print the first 3 tokenized sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['pride', 'and', 'prejudice', 'by', 'jane', 'austen', 'ebd', 'e-booksdirectorycom', 'chapter', '1', 'it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife', 'however', 'little', 'known', 'the', 'feelings', 'or', 'views', 'of', 'such', 'a', 'man', 'may', 'be', 'on', 'his', 'first', 'entering', 'a', 'neighbourhood', 'this', 'truth', 'is', 'so', 'well', 'fixed', 'in', 'the', 'minds', 'of', 'the', 'surrounding', 'families', 'that', 'he', 'is', 'considered', 'the', 'rightful', 'property', 'of', 'some', 'one', 'or', 'other', 'of', 'their', 'daughters', 'my', 'dear', 'mr', 'bennet', 'said', 'his', 'lady', 'to', 'him', 'one', 'day', 'have', 'you', 'heard', 'that', 'netherfield', 'park', 'is', 'let', 'at', 'last', 'mr', 'bennet', 'replied', 'that', 'he', 'had', 'not', 'but', 'it', 'is', 'returned', 'she', 'for', 'mrs', 'long', 'has', 'just', 'been', 'here', 'and', 'she

In [None]:
from collections import defaultdict, Counter

def generate_ngrams(tokenized_sentences, n):
    """Collect every contiguous n-token window from each sentence.

    Windows never span two sentences; a sentence shorter than ``n`` tokens
    contributes nothing.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.
        n: Window length.

    Returns:
        A list of n-grams (tuples of tokens) in corpus order, duplicates kept.
    """
    return [
        tuple(tokens[start:start + n])
        for tokens in tokenized_sentences
        for start in range(len(tokens) - n + 1)
    ]

# Generate bigrams.
# NOTE: the model-building loop below unpacks every n-gram as a (w1, w2) pair,
# so n must stay at 2 here; other values of n need a different model structure.
n = 2
bigrams = generate_ngrams(tokenized_sentences, n)

# Calculate the frequency of each bigram
bigram_freq = Counter(bigrams)

# Group the counts by first word: bigram_model[w1][w2] = count(w1, w2).
# defaultdict(dict) creates the inner dict on first access, so no membership
# check is needed. It is kept as a defaultdict (not a plain dict) so that code
# indexing the model with a word that never starts a bigram gets an empty dict.
bigram_model = defaultdict(dict)
for (w1, w2), freq in bigram_freq.items():
    bigram_model[w1][w2] = freq

# Normalize each row of counts into conditional probabilities P(w2 | w1)
for followers in bigram_model.values():
    total_count = sum(followers.values())
    for w2 in followers:
        followers[w2] /= total_count

# Show a small preview instead of dumping the entire model into the output
for w1 in list(bigram_model)[:3]:
    top_followers = sorted(bigram_model[w1].items(), key=lambda item: item[1], reverse=True)[:5]
    print(f"{w1}: {top_followers}")


defaultdict(<class 'dict'>, {'pride': {'and': 0.23404255319148937, 'said': 0.02127659574468085, 'often': 0.02127659574468085, 'if': 0.02127659574468085, 'observed': 0.02127659574468085, 'are': 0.02127659574468085, 'relates': 0.02127659574468085, 'yes': 0.02127659574468085, '--': 0.0425531914893617, 'will': 0.02127659574468085, 'you': 0.02127659574468085, 'of': 0.0425531914893617, 'had': 0.0425531914893617, 'can': 0.02127659574468085, 'as': 0.02127659574468085, 'which': 0.02127659574468085, 'never': 0.02127659574468085, 'for': 0.02127659574468085, 'was': 0.02127659574468085, 'elizabeth': 0.02127659574468085, 'beyond': 0.02127659574468085, 'she': 0.0425531914893617, 'his': 0.0425531914893617, 'been': 0.02127659574468085, 'in': 0.0425531914893617, 'or': 0.02127659574468085, 'pride': 0.02127659574468085, 'he': 0.0425531914893617, 'exciting': 0.02127659574468085, 'must': 0.02127659574468085}, 'and': {'prejudice': 0.0002800336040324839, 'she': 0.0313637636516382, 'i': 0.034724166900028004, '

In [None]:
def predict_next_word(word, ngram_model, top_k=3):
    """Suggest the most probable words to follow ``word``.

    Args:
        word: The context word to look up.
        ngram_model: Mapping ``word -> {next_word: probability}``.
        top_k: Maximum number of suggestions to return.

    Returns:
        Up to ``top_k`` next words ordered from most to least probable, or an
        empty list when ``word`` is not a key of the model.
    """
    # Guard clause: unknown context word means no suggestions
    if word not in ngram_model:
        return []

    ranked = sorted(ngram_model[word].items(), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked[:top_k]]

# Demo: look up the most likely followers of one sample word
input_word = 'you'
predictions = predict_next_word(input_word, ngram_model=bigram_model)
print(f"Predictions for '{input_word}': {predictions}")


Predictions for 'you': ['are', 'have', 'will']


In [None]:
from collections import defaultdict
import random

# NOTE(review): placeholder only. Nothing in this cell reads or writes this
# table, and the emission-probability cell further down re-creates it (with
# float defaults) before filling it in.
emission_probabilities = defaultdict(lambda: defaultdict(int))
# Define edit distance functions to generate possible spelling errors
def generate_deletions(word):
    """Return every string obtained by removing exactly one character of ``word``.

    Results are ordered by the position of the removed character; duplicates
    are kept (e.g. deleting either letter of "aa" yields "a" twice).
    """
    shortened = []
    for drop_at in range(len(word)):
        shortened.append(word[:drop_at] + word[drop_at + 1:])
    return shortened

def generate_insertions(word):
    """Return every string obtained by inserting one lowercase letter into ``word``.

    All ``len(word) + 1`` gap positions are tried, each with the 26 letters
    a-z, ordered by position first and by letter second.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    lengthened = []
    for gap in range(len(word) + 1):
        head, tail = word[:gap], word[gap:]
        for letter in letters:
            lengthened.append(head + letter + tail)
    return lengthened

def generate_substitutions(word):
    """Return every string obtained by replacing one character with a different letter.

    Each position is replaced by every lowercase letter a-z other than the
    character already there, ordered by position first and by letter second.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    replaced = []
    for pos in range(len(word)):
        current = word[pos]
        head, tail = word[:pos], word[pos + 1:]
        for letter in letters:
            if letter == current:
                continue  # substituting a letter with itself is not an edit
            replaced.append(head + letter + tail)
    return replaced

def generate_transpositions(word):
    """Return every string obtained by swapping two adjacent characters of ``word``.

    Results are ordered by the position of the left character of the swapped
    pair. Swapping two equal neighbours reproduces the word itself.
    """
    swapped = []
    for left in range(len(word) - 1):
        right = left + 1
        swapped.append(word[:left] + word[right] + word[left] + word[right + 1:])
    return swapped

def generate_possible_misspellings(word):
    """Return the set of all strings one edit away from ``word``.

    The four edit types are single-character deletion, insertion,
    substitution and adjacent transposition.
    """
    edit_generators = (
        generate_deletions,
        generate_insertions,
        generate_substitutions,
        generate_transpositions,
    )
    candidates = []
    for make_edits in edit_generators:
        candidates.extend(make_edits(word))
    # De-duplicate: different edits can produce the same string
    return set(candidates)

# Demo: show every single-edit variant generated for one word
example_misspellings = generate_possible_misspellings("example")
print(example_misspellings)


{'xexample', 'exampleq', 'examplu', 'exafmple', 'exaple', 'exzample', 'examplae', 'vexample', 'examkple', 'exadple', 'exarmple', 'exazmple', 'examvple', 'examplr', 'exampue', 'examplef', 'examlle', 'examplne', 'exabple', 'eeample', 'exaople', 'rexample', 'exampbe', 'exampte', 'exapple', 'exwample', 'examplze', 'exrmple', 'exnmple', 'examwle', 'exaimple', 'eoample', 'exampre', 'examrle', 'exomple', 'ecxample', 'exampole', 'examplh', 'exampnle', 'uxample', 'exampple', 'exaxple', 'examele', 'zxample', 'examgle', 'exampde', 'examule', 'aexample', 'exabmple', 'texample', 'xample', 'erample', 'exampe', 'exameple', 'examplqe', 'examplwe', 'exsmple', 'exampleh', 'examphe', 'exumple', 'examole', 'exaomple', 'exaample', 'examplec', 'exahple', 'examiple', 'exampleu', 'exvample', 'examplem', 'ebample', 'exampae', 'fxample', 'extmple', 'expmple', 'exadmple', 'yexample', 'examplb', 'examplke', 'eyample', 'examplp', 'kexample', 'exagmple', 'epxample', 'examcle', 'exvmple', 'exajple', 'eximple', 'exkm

In [None]:
# Transition probabilities (using the bigram model)
transition_probabilities = bigram_model

# Emission probabilities: emission_probabilities[true_word][observed_spelling]
emission_probabilities = defaultdict(lambda: defaultdict(float))

# Distinct words in first-occurrence order. Every candidate misspelling of a
# word ends up with the same probability (1 / number of candidates) however
# often the word occurs, so generating the candidates once per distinct word
# yields the same table as doing it for every token, at a fraction of the cost.
vocabulary = dict.fromkeys(
    word for sentence in tokenized_sentences for word in sentence
)

# NOTE(review): the candidates are single-edit variants only, so a correctly
# spelled observation usually has no probability under its own word - confirm
# this is intended before decoding text that contains correct words.
for word in vocabulary:
    misspellings = generate_possible_misspellings(word)
    if not misspellings:
        continue  # nothing to emit; leave the word without an entry
    uniform_prob = 1 / len(misspellings)
    for misspelled_word in misspellings:
        emission_probabilities[word][misspelled_word] = uniform_prob

# Print a subset of the emission probabilities (first 5 true words and their misspellings)
print("Subset of emission probabilities (first 5 words and their misspellings):")
for true_word, misspellings in list(emission_probabilities.items())[:5]:  # Limit to 5 words
    print(f"{true_word}: {dict(list(misspellings.items())[:5])}")  # Print up to 5 misspellings per word


Subset of emission probabilities (first 5 words and their misspellings):
pride: {'pribe': 0.0035087719298245615, 'pridne': 0.0035087719298245615, 'prido': 0.0035087719298245615, 'prides': 0.0035087719298245615, 'prkide': 0.0035087719298245615}
and: {'anbd': 0.0055248618784530384, 'anh': 0.0055248618784530384, 'ant': 0.0055248618784530384, 'anod': 0.0055248618784530384, 'anwd': 0.0055248618784530384}
prejudice: {'qprejudice': 0.002028397565922921, 'prejuvice': 0.002028397565922921, 'prejudfice': 0.002028397565922921, 'perjudice': 0.002028397565922921, 'prcjudice': 0.002028397565922921}
by: {'byd': 0.007751937984496124, 'byo': 0.007751937984496124, 'bq': 0.007751937984496124, 'lby': 0.007751937984496124, 'eby': 0.007751937984496124}
jane: {'janel': 0.004291845493562232, 'tane': 0.004291845493562232, 'sane': 0.004291845493562232, 'jacne': 0.004291845493562232, 'jarne': 0.004291845493562232}


In [None]:
import numpy as np

def viterbi(observed_sequence, states, transition_probs, emission_probs, start_prob=1.0):
    """Decode the most probable sequence of true words for a sequence of observed words.

    Args:
        observed_sequence: Sequence of observed (possibly misspelled) words.
        states: List of candidate true words (assumed to contain no duplicates).
        transition_probs: Mapping ``prev_word -> {next_word: probability}``.
            Plain dicts and defaultdicts are both accepted; the mapping is
            only read, never modified.
        emission_probs: Mapping ``true_word -> {observed_word: probability}``.
        start_prob: Weight applied to every state at the first position.

    Returns:
        A list with one true word per observation. An empty observation
        sequence gives ``[]``. If no state sequence has a non-zero
        probability (for example an observation that no state can emit), the
        observations are returned unchanged instead of arbitrary vocabulary
        words.
    """
    n = len(observed_sequence)
    if n == 0:
        # Nothing observed, nothing to decode (indexing position 0 would fail).
        return []

    num_states = len(states)
    dp = np.zeros((num_states, n))  # dp[s, t]: best path probability ending in state s at step t
    path = np.zeros((num_states, n), dtype=int)  # path[s, t]: predecessor of s on that best path

    def emitting_states(observation):
        """List (state index, emission probability) for states able to emit ``observation``."""
        candidates = []
        for idx, state in enumerate(states):
            # .get() keeps this working for plain dicts and avoids inserting
            # empty entries into defaultdict-based models.
            prob = emission_probs.get(state, {}).get(observation, 0)
            if prob > 0:
                candidates.append((idx, prob))
        return candidates

    # Base case: only states that can emit the first observation get any mass
    for idx, emit_prob in emitting_states(observed_sequence[0]):
        dp[idx, 0] = start_prob * emit_prob

    # Recursion. States with zero probability can never win the max, so only
    # live predecessors and emitting successors are visited; the surviving
    # products are computed exactly as in the full double loop.
    for t in range(1, n):
        live_prev = [
            (int(i), dp[i, t - 1], transition_probs.get(states[i], {}))
            for i in np.flatnonzero(dp[:, t - 1] > 0)
        ]
        if not live_prev:
            break  # every path has died; the remaining columns stay at zero
        for idx, emit_prob in emitting_states(observed_sequence[t]):
            state = states[idx]
            max_prob, max_state = 0, 0
            for prev_idx, prev_prob, successors in live_prev:
                prob = prev_prob * successors.get(state, 0) * emit_prob
                if prob > max_prob:  # strict: the earliest predecessor wins ties
                    max_prob, max_state = prob, prev_idx
            dp[idx, t] = max_prob
            path[idx, t] = max_state

    # Termination
    final_column = dp[:, n - 1]
    if num_states == 0 or final_column.max() <= 0:
        # No viable path: leave the input as it is rather than inventing words.
        return list(observed_sequence)
    best_final_prob = final_column.max()
    # Among equally probable final states the highest index is chosen
    last_state = int(np.flatnonzero(final_column == best_final_prob)[-1])

    # Backtrack to recover the best path
    best_path = [states[last_state]]
    for t in range(n - 1, 0, -1):
        last_state = path[last_state, t]
        best_path.append(states[last_state])

    best_path.reverse()
    return best_path

# Demo: decode a single misspelled word against the full vocabulary
states = list(emission_probabilities)  # every true word that has an emission row
observed_sequence = ['studiaus']
corrected_sequence = viterbi(
    observed_sequence,
    states,
    transition_probabilities,
    emission_probabilities,
)
print("Corrected Sequence:", corrected_sequence)


Corrected Sequence: ['studious']


In [None]:
import pickle

# emission_probabilities is a nested defaultdict; copy it into plain dicts
# before pickling (presumably because its lambda default factory cannot be
# pickled - verify if the conversion is ever removed).
emission_probabilities_dict = {}
for true_word, observed_probs in emission_probabilities.items():
    emission_probabilities_dict[true_word] = dict(observed_probs)

# Persist the bigram (transition) model
with open('bigram_model.pkl', 'wb') as bigram_file:
    pickle.dump(bigram_model, bigram_file)

# Persist the emission table (plain-dict copy)
with open('emission_probabilities.pkl', 'wb') as emission_file:
    pickle.dump(emission_probabilities_dict, emission_file)

print("Models saved successfully!")


Models saved successfully!
