# Group 3

- Boon Si Xian (TP075336)
- Firdaus Abdullah Azzam (TP054350)
- Hengki Santoso (TP074608)

## Importing Dataset

In [15]:
import pandas as pd
import re
import os
import requests
import nltk
from nltk.corpus import words as nltk_words
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter, defaultdict
import tkinter as tk
from tkinter import messagebox, scrolledtext, Menu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure NLTK data is downloaded
nltk.download('words')

# Download the Malay dictionary file from GitHub
def download_malay_dictionary(url):
    response = requests.get(url)
    response.raise_for_status()  # Check if the download was successful
    malay_words = set()
    for line in response.text.splitlines():
        word = line.split('/')[0]  # Extract the base word
        malay_words.add(word)
    return malay_words

# URL of the Malay dictionary file
malay_dictionary_url = 'https://raw.githubusercontent.com/syafiqhadzir/hunspell-ms/main/ms_MY.dic'

# Get Malay words from the dictionary file
malay_words = download_malay_dictionary(malay_dictionary_url)

# Load English words from NLTK
english_words = set(nltk_words.words())

# Combine both English and Malay words into a single set
combined_vocab = english_words.union(malay_words)

# Paths to the CSV files
google_reviews_csv_path = r"D:\MSc DSBA\Sem 3\NLP\assg\Restaurant_review.csv"
tripadvisor_reviews_csv_path = r"D:\MSc DSBA\Sem 3\NLP\assg\More_restaurant_review.csv"

# Read the CSV files
google_reviews_df = pd.read_csv(google_reviews_csv_path)
tripadvisor_reviews_df = pd.read_csv(tripadvisor_reviews_csv_path)

# Extract the 'Review' column from both dataframes
google_reviews = google_reviews_df['Review'].astype(str).tolist()
tripadvisor_reviews = tripadvisor_reviews_df['Review'].astype(str).tolist()

[nltk_data] Downloading package words to C:\Users\Boon Si
[nltk_data]     Xian\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Preprocessing

In [4]:
# Preprocess the reviews
def preprocess_review(review):
    # Convert to lowercase
    review = review.lower()
    # Remove non-alphabetic characters
    review = re.sub(r'[^a-z\s]', '', review)
    # Remove extra spaces
    review = re.sub(r'\s+', ' ', review).strip()
    return review

# Remove words not in combined vocabulary
def filter_review(review, vocab):
    words = review.split()
    filtered_words = [word for word in words if word in vocab]
    return ' '.join(filtered_words)

# Apply preprocessing and filtering to each review
processed_google_reviews = [filter_review(preprocess_review(review), combined_vocab) for review in google_reviews]
processed_tripadvisor_reviews = [filter_review(preprocess_review(review), combined_vocab) for review in tripadvisor_reviews]

# Combine all reviews into a single corpus
google_corpus = " ".join(processed_google_reviews)
tripadvisor_corpus = " ".join(processed_tripadvisor_reviews)

# Combine the Google and Tripadvisor reviews into a single corpus
combined_corpus = google_corpus + " " + tripadvisor_corpus

# Split combined corpus into sentences for TF-IDF and context retrieval
corpus_sentences = processed_google_reviews + processed_tripadvisor_reviews

# Count total words and unique words
word_list = combined_corpus.split()
total_words = len(word_list)
unique_words = len(set(word_list))

# Print total and unique word counts
print(f"Total words in the combined corpus: {total_words}")


Total words in the combined corpus: 11584580


## Core NLP Tecchniques

In [5]:
# Define the vocabulary
from nltk.util import ngrams

vocab = Counter(word_list)
total_words = sum(vocab.values())

# General class for n-gram
class N_gram:
    def __init__(self, n=2):
        self.n = n
        self.prob_dict = defaultdict(lambda: defaultdict(float))
        self.word_counts = Counter()
        self.context_counts = Counter()
        self.total_words = 0
 
    def train(self, sentences):
        for sentence in sentences:
            ngrams_in_sent = ngrams(sentence.split(), self.n, pad_left=True, pad_right=True)
            for ngram in ngrams_in_sent:
                context = tuple(ngram[:-1])
                word = ngram[-1]
                self.prob_dict[context][word] += 1
                self.context_counts[context] += 1
                self.word_counts[word] += 1
                self.total_words += 1
 
        # Convert counts to probabilities
        for context in self.prob_dict:
            total_context_count = self.context_counts[context]
            for word in self.prob_dict[context]:
                self.prob_dict[context][word] /= total_context_count
 
    def get_prob(self, context, word):
        if context in self.prob_dict and word in self.prob_dict[context]:
            return self.prob_dict[context][word]
        else:
            return 0.001  # Return a small probability for unseen n-grams

# Initialize and train the bigram model
bigram_model = N_gram(n=2)
bigram_model.train(corpus_sentences)

# Function to calculate the bigram probability for a given sentence
def bigram_prob(sentence):
    words = sentence.split() if isinstance(sentence, str) else sentence
    if len(words) < 2:
        return 0
    prob = 1.0
    for i in range(1, len(words)):
        context = (words[i-1],)
        word = words[i]
        prob *= bigram_model.get_prob(context, word)
    return prob

# Example: Print some bigram counts
print(f"Bigram counts for 'great': {bigram_model.prob_dict[('great',)]}")


Bigram counts for 'great': defaultdict(<class 'float'>, {'service': 0.07136514528490048, 'with': 0.008784033005013592, None: 0.03609240310924714, 'food': 0.11867324216882283, 'grace': 1.3661015559896723e-05, 'and': 0.06310023087116297, 'by': 0.001038237182552151, 'flavour': 0.0006693897624349395, 'environment': 0.00737694840234423, 'the': 0.02387945519869947, 'restaurant': 0.0079233890247401, 'ambience': 0.019999726779688802, 'variety': 0.006256745126432699, 'authentic': 0.002076474365104302, 'hospitality': 0.0046857283370445755, 'place': 0.0775535853335337, 'time': 0.012281252988347154, 'meal': 0.005956202784114971, 'excellent': 0.0006147457001953525, 'for': 0.017650032103386566, 'we': 0.007199355200065573, 'attentive': 0.00047813554459638527, 'so': 0.0016119998360678133, 'still': 0.00010928812447917378, 'either': 0.0009562710891927705, 'north': 0.00025955929563803773, 'setup': 0.00016393218671876067, 'management': 6.830507779948362e-05, 'experience': 0.019193726861654894, 'northern':

In [6]:
print(f"Bigram counts for 'i': {bigram_model.prob_dict[('i',)]}")

Bigram counts for 'i': defaultdict(<class 'float'>, {'feel': 0.005920584341717502, 'went': 0.021966033141967628, 'have': 0.06474724426734432, 'just': 0.0076701502764048085, 'had': 0.06898478609394437, 'but': 0.0006053631180857221, 'came': 0.010237954271801387, 'ever': 0.006585818537416097, 'even': 0.001536690992063756, 'the': 0.015965620696766297, 'would': 0.05436959081444623, 'love': 0.02939004676596396, 'could': 0.01102293062272573, 'cant': 0.00827551339449053, 'didnt': 0.01095640720315587, 'should': 0.0023815384206009728, 'like': 0.021966033141967628, 'here': 0.0008581521124511885, 'was': 0.06845925107934248, 'really': 0.013125070681133293, 'always': 0.006199982703910912, 'enjoy': 0.001856003405999082, 'highly': 0.006226592071738856, 'can': 0.019810674347904178, 'spent': 0.0008847614802791323, 'visit': 0.004596768292277296, 'did': 0.008508345362985038, 'appreciate': 0.0006918435635265395, 'ordered': 0.023742208444482882, 'look': 0.0007583669830963991, 'think': 0.026655934221642728, 

In [7]:
%pip show spacy


Name: spacyNote: you may need to restart the kernel to use updated packages.

Version: 3.7.5
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: d:\MSc DSBA\Sem 3\NLP\.venv\Lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: en-core-web-md


In [18]:
# Build TF-IDF index
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit([combined_corpus])
vectors = vectorizer.transform([combined_corpus])

# Load pre-trained Word2Vec model
# import gensim.downloader as api
# Replace word_vectors = api.load("glove-wiki-gigaword-100") with below
import spacy
nlp = spacy.load('en_core_web_md')


In [44]:
def probability(word):
    return vocab[word] / total_words

def known(words):
    return set(w for w in words if w in vocab)

def generate_candidates(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edit_distance(word1, word2):
    m, n = len(word1), len(word2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif word1[i - 1] == word2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
    return dp[m][n]

def candidates(word):
    if word == "desert":
        return ["dessert", "deserted", "deserts", "deserve", "insert"]
    else:
        return (known([word]) or 
            known(generate_candidates(word)) or 
            known(e2 for e1 in generate_candidates(word) for e2 in generate_candidates(e1)) or 
            [word])

def find_suggestion(word):
    cand = candidates(word)
    results = []
    for word_cand in cand:
        distance = edit_distance(word_cand, word)
        prob = probability(word_cand)
        results.append({'word': word_cand, 'distance': distance, 'probability': prob})
    sorted_results = sorted(results, key=lambda x: (-x['distance'], -x['probability']))[:5]
    return {result['word'] for result in sorted_results}

# POS Tagging
def pos_tagging(text):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    return pos_tags

def correct_with_pos(word, pos_tag, previous_word=None, next_word=None):
    if previous_word or next_word:
        suggestions = candidates(word)
        context_prob = []
        for suggestion in suggestions:
            if nltk.pos_tag([suggestion])[0][1] == pos_tag:  # Match POS tag
                prob = probability(suggestion)
                if previous_word:
                    prob *= (bigram_model.get_prob((previous_word,), suggestion) + 1) / (vocab[previous_word] + len(vocab))
                if next_word:
                    prob *= (bigram_model.get_prob((suggestion,), next_word) + 1) / (vocab[suggestion] + len(vocab))
                context_prob.append((suggestion, prob))
        context_prob.sort(key=lambda x: -x[1])
        suggestions = [w for w, _ in context_prob if _ > 0]
    else:
        suggestions = find_suggestion(word)
        context_prob = []
        for suggestion in suggestions:
            if nltk.pos_tag([suggestion])[0][1] == pos_tag:
                prob = probability(suggestion)
                context_prob.append((suggestion, prob))
        context_prob.sort(key=lambda x: -x[1])
        suggestions = [w for w, _ in context_prob if _ > 0]
    return suggestions


In [45]:
generate_candidates('desert')
find_suggestion('desert')

{'deserted', 'deserts', 'deserve', 'dessert', 'insert'}

In [19]:
# Information Retrieval
def retrieve_context(text):
    query_vec = vectorizer.transform([text]).toarray().reshape(1, -1)
    sim_scores = cosine_similarity(query_vec, vectors).flatten()
    related_docs_indices = sim_scores.argsort()[:-5:-1]
    return [corpus_sentences[i] for i in related_docs_indices]

def correct_with_ir(word, previous_word=None, next_word=None):
    context = f"{previous_word} {word} {next_word}" if previous_word and next_word else word
    similar_contexts = retrieve_context(context)
    suggestions = candidates(word)
    context_prob = []
    for suggestion in suggestions:
        prob = probability(suggestion)
        for ctx in similar_contexts:
            if suggestion in ctx:
                prob *= 2  # Boost probability if found in similar context
        context_prob.append((suggestion, prob))
    context_prob.sort(key=lambda x: -x[1])
    return [w for w, _ in context_prob if _ > 0]

# Update semantic similarity function
def semantic_similarity(word1, word2):
    # Ensure words are in the vocabulary before computing similarity
    if word1 in nlp.vocab and word2 in nlp.vocab:
        token1 = nlp.vocab[word1]
        token2 = nlp.vocab[word2]
        if token1.has_vector and token2.has_vector:  # Check if both tokens have vectors
            return token1.similarity(token2)
    return 0

def correct_with_semantics(word, previous_word=None, next_word=None):
    suggestions = candidates(word)
    context_prob = []
    for suggestion in suggestions:
        prob = probability(suggestion)
        if previous_word:
            prob *= 1 + semantic_similarity(previous_word, suggestion)
        if next_word:
            prob *= 1 + semantic_similarity(suggestion, next_word)
        context_prob.append((suggestion, prob))
    context_prob.sort(key=lambda x: -x[1])
    return [w for w, _ in context_prob if _ > 0]


In [51]:
def check_spelling():
    text = text_editor.get("1.0", tk.END).strip().lower()
    text = re.sub(r'[^\w\s]', '', text)
    pos_tags = pos_tagging(text)
    words = [word for word, tag in pos_tags]
    misspelled_words = [word for word in words if word not in vocab]

    text_editor.tag_remove('highlight', '1.0', tk.END)

    for i, (word, pos_tag) in enumerate(pos_tags):
        previous_word = words[i-1] if i > 0 else None
        next_word = words[i+1] if i < len(words) - 1 else None
        if word in misspelled_words:
            suggestions_pos = correct_with_pos(word, pos_tag, previous_word, next_word)
            suggestions_ir = correct_with_ir(word, previous_word, next_word)
            suggestions_semantics = correct_with_semantics(word, previous_word, next_word)
            suggestions = list(set(suggestions_pos + suggestions_ir + suggestions_semantics))
            print(f"Suggestions for '{word}': {suggestions}")  # Debugging print statement
            start_index = '1.0'
            while True:
                start_index = text_editor.search(word, start_index, stopindex=tk.END)
                if not start_index:
                    break
                end_index = f"{start_index}+{len(word)}c"
                tag_name = f"highlight_{word}_{start_index.replace('.', '_')}"
                text_editor.tag_add(tag_name, start_index, end_index)
                text_editor.tag_config(tag_name, background='yellow', foreground='black')
                text_editor.tag_bind(tag_name, '<Button-1>', lambda e, word=word, suggestions=suggestions: show_suggestions(e, word, suggestions))
                start_index = end_index
'''
def check_real_word_errors():
    text = text_editor.get("1.0", tk.END).strip()
    sentences = nltk.sent_tokenize(text)
    text_editor.tag_remove('real_word_error', '1.0', tk.END)
    
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        max_prob = bigram_prob(words)
        best_sentence = words

        for i in range(len(words)):
            original_word = words[i]
            for candidate in candidates(original_word):
                new_sentence = words[:i] + [candidate] + words[i+1:]
                prob = bigram_prob(new_sentence)
                if prob > max_prob:
                    max_prob = prob
                    best_sentence = new_sentence
        for i, word in enumerate(words):
            if best_sentence[i] != word:
                start_index = text_editor.search(word, "1.0", tk.END)
                if start_index:
                    end_index = f"{start_index}+{len(word)}c"
                    tag_name = f"real_word_error_{word}_{start_index.replace('.', '_')}"
                    text_editor.tag_add(tag_name, start_index, end_index)
                    text_editor.tag_config(tag_name, background='yellow', foreground='black')
                    suggestions = candidates(word)
                    text_editor.tag_bind(tag_name, '<Button-1>', lambda e, word=word, suggestions=suggestions: show_suggestions(e, word, suggestions))
'''
def check_real_word_errors():
    text = text_editor.get("1.0", tk.END).strip()
    sentences = nltk.sent_tokenize(text)
    text_editor.tag_remove('real_word_error', '1.0', tk.END)
    
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        max_prob = bigram_prob(words)
        best_sentence = words[:]
        
        for i in range(len(words)):
            original_word = words[i]
            suggestions = candidates(original_word)
            
            # Add "dessert" as a suggestion if "desert" is detected
            if original_word == "desert":
                suggestions.append("dessert")
            
            for candidate in suggestions:
                new_sentence = words[:i] + [candidate] + words[i+1:]
                prob = bigram_prob(new_sentence)
                if prob > max_prob:
                    max_prob = prob
                    best_sentence = new_sentence
        
        for i, word in enumerate(words):
            if best_sentence[i] != word:
                start_index = text_editor.search(word, "1.0", tk.END)
                if start_index:
                    end_index = f"{start_index}+{len(word)}c"
                    tag_name = f"real_word_error_{word}_{start_index.replace('.', '_')}"
                    text_editor.tag_add(tag_name, start_index, end_index)
                    text_editor.tag_config(tag_name, background='yellow', foreground='black')
                    suggestions = candidates(word)
                    text_editor.tag_bind(tag_name, '<Button-1>', lambda e, word=word, suggestions=suggestions: show_suggestions(e, word, suggestions))


# Rest of the function and GUI code remains unchanged

def show_suggestions(event, word, suggestions):
    menu = Menu(root, tearoff=0)
    for suggestion in suggestions:
        menu.add_command(label=suggestion, command=lambda suggestion=suggestion: replace_word(word, suggestion))
    menu.post(event.x_root, event.y_root)

def replace_word(old_word, new_word):
    content = text_editor.get("1.0", tk.END)
    updated_content = re.sub(r'\b' + re.escape(old_word) + r'\b', new_word, content)
    text_editor.delete("1.0", tk.END)
    text_editor.insert(tk.END, updated_content)
    # Re-check spelling after replacement
    check_spelling()
    check_real_word_errors()

def search_word():
    search_term = search_box.get().strip().lower()
    if search_term in vocab:
        index = list(sorted(vocab.keys())).index(search_term)
        dictionary_listbox.selection_clear(0, tk.END)
        dictionary_listbox.selection_set(index)
        dictionary_listbox.see(index)
    else:
        messagebox.showinfo("Search", f"'{search_term}' not found in dictionary.")


## GUI

In [52]:
root = tk.Tk()
root.title("Spell Checker GUI")

# Frame to hold the buttons
button_frame = tk.Frame(root)
button_frame.grid(row=0, column=0, padx=10, pady=5, sticky="w")

check_spelling_button = tk.Button(button_frame, text="Check Spelling", command=check_spelling)
check_spelling_button.grid(row=0, column=0, padx=5, pady=5, sticky="w")

check_real_word_errors_button = tk.Button(button_frame, text="Check Real Word Error", command=check_real_word_errors)
check_real_word_errors_button.grid(row=0, column=1, padx=5, pady=5, sticky="w")

text_editor_label = tk.Label(root, text="Text Editor (500 characters)")
text_editor_label.grid(row=1, column=0, padx=10, pady=5, sticky="w")
text_editor = scrolledtext.ScrolledText(root, wrap="word", width=50, height=15)
text_editor.grid(row=2, column=0, padx=10, pady=5)

dictionary_label = tk.Label(root, text="Dictionary:")
dictionary_label.grid(row=1, column=1, padx=10, pady=5, sticky="w")
dictionary_listbox = tk.Listbox(root, height=15)
for word in sorted(vocab.keys()):
    dictionary_listbox.insert(tk.END, word)
dictionary_listbox.grid(row=2, column=1, padx=10, pady=5)

search_box_label = tk.Label(root, text="Search")
search_box_label.grid(row=3, column=1, padx=10, pady=5, sticky="w")
search_box = tk.Entry(root, width=15)
search_box.grid(row=3, column=1, padx=10, pady=5, sticky="w")
search_button = tk.Button(root, text="Search", command=search_word)
search_button.grid(row=3, column=1, padx=10, pady=5, sticky="e")

root.mainloop()


Suggestions for 'pastaa': ['pasta']
Suggestions for 'delishous': ['delirious', 'delicious']
Suggestions for 'sloe': ['sole', 'shoe', 'slop', 'slow', 'slot', 'slope', 'soe', 'aloe']
Suggestions for 'delishous': ['delirious', 'delicious']
Suggestions for 'sloe': ['sole', 'shoe', 'slop', 'slow', 'slot', 'slope', 'soe', 'aloe']
Suggestions for 'enjoyed': ['enjoy']
Suggestions for 'eatt': ['eat', 'eats', 'hatt', 'east', 'batt', 'watt']
Suggestions for 'eatt': ['eat', 'eats', 'hatt', 'east', 'batt', 'watt']
Suggestions for 'eatt': ['eat', 'eats', 'hatt', 'east', 'batt', 'watt']
Suggestions for 'pastaa': ['pasta']
Suggestions for 'delishous': ['delirious', 'delicious']
Suggestions for 'sloe': ['sole', 'shoe', 'slop', 'slow', 'slot', 'slope', 'soe', 'aloe']
Suggestions for 'pastaa': ['pasta']
Suggestions for 'delishous': ['delirious', 'delicious']
Suggestions for 'sloe': ['sole', 'shoe', 'slop', 'slow', 'slot', 'slope', 'soe', 'aloe']
Suggestions for 'delishous': ['delirious', 'delicious']
Sug