Using n-gram model to predict the next possibility of word or sequence of characters based on probability estimation
Step 1: Generating random 5000 phrases (the corrected ones).

1.   Generating random 5000 phrases (the corrected ones).
2.   Creating a n-gram model (bi or tri) for correct phrases.
3.   Correct the noisy phrases using n-gram model.
4.   Evaluate the accuracy of the model based on dataset.


In [62]:
import random

# Domain-specific data
quantities = [str(i) for i in range(1, 11)]
units = ["kg", "g"]
vegetables = ["tomato", "potato", "onion", "carrot", "cabbage"]
prices = [f"{i} Rs" for i in range(10, 501, 10)]  # Prices from 10 Rs to 500 Rs

# Generate 5000 correct phrases
def generate_correct_phrases(n=5000):
    correct_phrases = []
    for _ in range(n):
        quantity = random.choice(quantities)
        unit = random.choice(units)
        vegetable = random.choice(vegetables)
        price = random.choice(prices)
        correct_phrases.append(f"{quantity} {unit} {vegetable} {price}")
    return correct_phrases

correct_phrases = generate_correct_phrases()


In [63]:
# Introduce noise into phrases
def introduce_noise(phrase):
    noise_types = ["typo", "extra", "missing_space"]
    noise_type = random.choice(noise_types)
    words = phrase.split()

    if noise_type == "typo":
        idx = random.randint(0, len(words) - 1)  # Pick a random word index
        word = words[idx]
        char_idx = random.randint(0, len(word) - 1)
        typo_char = chr(random.randint(97, 122))  # Random lowercase letter
        word = word[:char_idx] + typo_char + word[char_idx + 1:]
        words[idx] = word  # Replace the word directly by index

    elif noise_type == "extra":
        idx = random.randint(0, len(words) - 1)  # Pick a random word index
        word = words[idx]
        extra_char = chr(random.randint(97, 122))
        position = random.randint(0, len(word))
        word = word[:position] + extra_char + word[position:]
        words[idx] = word  # Replace the word directly by index

    elif noise_type == "missing_space" and len(words) > 1:
        idx = random.randint(0, len(words) - 2)  # Pick a random pair of words
        words[idx] = words[idx] + words[idx + 1]
        del words[idx + 1]

    return " ".join(words)

# Create noisy dataset
noisy_phrases = [introduce_noise(phrase) for phrase in correct_phrases]



In [64]:
import re
from collections import Counter

# Function to generate n-grams from a word
def generate_ngrams(word, n=2):
    word = ' ' * (n - 1) + word + ' ' * (n - 1)  # Padding to handle n-grams at the edges of words
    return [word[i:i + n] for i in range(len(word) - n + 1)]

# Function to get the best correction based on n-grams
def correct_word_ngram(word, dictionary, n=2):
    word_ngrams = Counter(generate_ngrams(word, n))  # Get n-grams for the noisy word
    best_match = None
    best_overlap = 0

    for candidate in dictionary:
        candidate_ngrams = Counter(generate_ngrams(candidate, n))  # Get n-grams for the candidate (correct word)

        # Calculate overlap of n-grams between the noisy word and candidate word
        overlap = sum((word_ngrams & candidate_ngrams).values())

        # Track the word with the most overlap
        if overlap > best_overlap:
            best_overlap = overlap
            best_match = candidate

    return best_match if best_match else word

# Correct a phrase
def correct_phrase_ngram(phrase, dictionary, n=2):
    return ' '.join(correct_word_ngram(word, dictionary, n) for word in phrase.split())

# Example dictionary of correct words (could be generated from correct phrases dataset)
dictionary = set(word for phrase in correct_phrases for word in phrase.split())


# Correct all noisy phrases using n-grams
corrected_phrases = [correct_phrase_ngram(phrase, dictionary) for phrase in noisy_phrases]

# Output the corrected phrases
print("Corrected Phrases:")
for corrected_phrase in corrected_phrases:
    print(corrected_phrase)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
8 kg tomato 240 Rs
6 g onion Rs
4 kg cabbage 30 Rs
1 g potato 460 Rs
7 potato 180 Rs
5 g carrot 60 Rs
8 g cabbage 380 Rs
130 onion 200 Rs
10 g cabbage 340 Rs
7 g potato Rs
340 g potato 80 Rs
u g potato 430 Rs
340 onion 180 Rs
8 kg tomato 370 Rs
4 g potato 340 Rs
8 g carrot 340 Rs
130 kg potato 470 Rs
4 g carrot 10 Rs
1 kg potato 340
10 kg onion 70
1 tomato 330 Rs
a g carrot 410 Rs
70 kg potato 220 Rs
8 g carrot 90 Rs
4 kg potato Rs
1 kg tomato 250 Rs
6 kg onion Rs
9 kg potato Rs
6 g tomato 270 Rs
8 g cabbage 300 Rs
1 g tomato Rs
j kg carrot 320 Rs
7 kg potato 400 Rs
9 v carrot 400 Rs
carrot kg potato 180 Rs
7 kg onion 170 Rs
v kg onion 190 Rs
10 g carrot 360 Rs
4 kg tomato 450 Rs
5 g cabbage 260 Rs
10 u onion 400 Rs
6 g tomato 90 Rs
4 g cabbage 350 Rs
7 kg potato 460 Rs
7 kg tomato 150 Rs
6 kg cabbage 390 Rs
1 kg cabbage Rs
5 cabbage 80 Rs
9 g onion 60 Rs
5 kg cabbage 330 Rs
7 kg onion 450 Rs
1 g potato Rs
5 potato 120 Rs

In [65]:
# Evaluate model performance
# Evaluate model performance
def evaluate_model(correct_phrases, noisy_phrases, corrected_phrases):
    total_words = 0
    total_correct_words = 0
    total_corrected_words = 0

    for correct, noisy, corrected in zip(correct_phrases, noisy_phrases, corrected_phrases):
        correct_words_list = correct.split()
        corrected_words_list = corrected.split()

        total_words += len(correct_words_list)
        total_correct_words += sum(1 for cw, cw_corr in zip(correct_words_list, corrected_words_list) if cw == cw_corr)
        total_corrected_words += len(corrected_words_list)

    # Metrics
    accuracy = total_correct_words / total_words * 100
    precision = total_correct_words / (total_corrected_words or 1) * 100
    recall = total_correct_words / total_words * 100
    f1_score = 2 * (precision * recall) / (precision + recall or 1)

    return accuracy, precision, recall, f1_score



# Measure performance
import time
start_time = time.time()
accuracy, precision, recall, f1_score = evaluate_model(correct_phrases, noisy_phrases, corrected_phrases)
end_time = time.time()

# Output results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1-Score: {f1_score:.2f}%")
print(f"Runtime: {end_time - start_time:.4f} seconds")


Accuracy: 75.58
Precision: 80.98%
Recall: 75.58%
F1-Score: 78.19%
Runtime: 0.0106 seconds
