<a href="https://colab.research.google.com/github/QueeneDelmarva/Thesis/blob/main/Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Thesis Code**

## Importing Libraries

In [29]:
# Connect to Gdrive
from google.colab import drive

# Connect to Gsheets
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Preprocessing
import re
import nltk

# Edit Distance
import Levenshtein
import difflib

# Generate Misspellings
import random
import string

# GUI
import tkinter as tk
import tkinter.font as tkFont

import string
from google.colab import auth
from google.auth import default

ModuleNotFoundError: ignored

In [4]:
# !pip install python-Levenshtein


## Connecting to Gdrive

In [5]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


## Connecting to GSheets

In [6]:
# Define the scope and create the credentials using the file in your Google Drive
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('/content/drive/MyDrive/skripsi-394804-a98ea9715aa4.json', scope)

# Authenticate with gspread
client = gspread.authorize(creds)

# Open the Google Sheet by its title or URL
spreadsheet_key = '1RlZ8-XwwEueuCAq4nXGwq3ZmPDE-_0gP3PsroKeNFGo'
sheet = client.open_by_key(spreadsheet_key).sheet1

# Open the train_data Google Sheet by its title or URL for writing
train_data_sheet = client.open_by_key(spreadsheet_key).worksheet('train_data')
test_data_sheet = client.open_by_key(spreadsheet_key).worksheet('test_data')

# ## Read the data from the Gsheets
# # 1. Read individual cells
# cell_value = sheet.cell(3, 5).value
# print(cell_value)

# # 2. Read all values from the first worksheet
# data = sheet.get_all_values()

# # Print the data
# for row in data:
#     print(row)

## Preprocessing Data

In [7]:
# preprocess_dictonary = ['Noted', 'and', 'thanks', 'Please', 'as', 'follows', 'Copied']

def preprocess(word):
    # Separate words based on non-alphabetic characters
    words = re.findall(r'\b(?:[a-zA-Z0-9]+)\b', word)

    # Convert words to lowercase
    lowercase_words = [w.lower() for w in words]

    return lowercase_words

    # Add preprocess dictonary to the list of preprocessed words
    # preprocessed_words = lowercase_words + [g for g in preprocess_dictonary if g in word.lower()]

    # return preprocessed_words

### Radix Trie Node

In [8]:
# Defines a class representing a single node in the radix trie, holding information about its children and whether it marks the end of a word.
class RadixTrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

### Hashmap-based Trie Node

In [9]:
# Defines a class representing a single node in the radix trie, holding information about its children and whether it marks the end of a word.
class HashmapTrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

### Radix Trie Function

In [10]:
# Defines a class that implements a radix trie data structure, allowing efficient storage and retrieval of a set of strings with common prefixes.
class RadixTrie:
    def __init__(self):
        self.root = RadixTrieNode()

    # Adds a word to the Radix Trie by creating or traversing the appropriate nodes for each character in the word.
    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = RadixTrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    # Finds all words in the Radix Trie that have the given prefix, returning them as a list of results.
    def search(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []  # Prefix not found, return an empty list
            node = node.children[char]
        return self.collect_words(node, prefix)

   # Recursively explores the Radix Trie's child nodes and appends the complete words to the results list when reaching the end of a word.
    def collect_words(self, node, current_prefix):
        results = []
        if node.is_end_of_word:
            results.append(current_prefix)
        for char, child in node.children.items():
            word = current_prefix + char
            results.extend(self.collect_words(child, word))
        return results

    def get_all_words(self):
        return self.collect_words(self.root, "")

    # def get_all_terms(self):
    #     # Collect all terms stored in the trie
    #     all_terms = []

    #     def traverse(node, prefix):
    #         if node.is_end_of_word:
    #             all_terms.append(prefix)
    #         for char, child in node.children.items():
    #             traverse(child, prefix + char)

    #     traverse(self.root, "")

    #     return all_terms

### Hashmap-based Trie

In [11]:
# Define a class represents a trie data structure that uses a hashmap to store children nodes, allowing efficient storage and retrieval of a set of strings with common prefixes.
class HashmapTrie:
    def __init__(self):
        self.root = HashmapTrieNode()

# Adds a word to the trie by traversing and creating nodes for each character in the word and marking the last node as the end of the word.
    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = HashmapTrieNode()
            node = node.children[char]
        node.is_end_of_word = True

# Checks if a given prefix exists in the trie by traversing the nodes based on the characters of the prefix and returns whether the last node reached marks the end of a word.
    def search(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []  # Prefix not found, return an empty list
            node = node.children[char]
        return self.collect_words(node, prefix)

# Recursively traverses its child nodes and appends the complete words to the results list when reaching the end of a word.
    def collect_words(self, node, current_prefix):
        results = []
        if node.is_end_of_word:
            results.append(current_prefix)
        for char, child in node.children.items():
            word = current_prefix + char
            results.extend(self.collect_words(child, word))
        return results

# Returns all words stored in the trie by collecting words
    def get_all_words(self):
        return self.collect_words(self.root, "")

### Autocorrect with Levenshtein Edit Distance

In [12]:
class Autocorrect:
    def __init__(self, trie):
        self.trie = trie

    def correct(self, word, threshold=0.9):  # Adjust threshold as needed
        word = word.lower()
        suggestions = []

        # Traverse the trie to find similar words
        similar_words = self.trie.search_similar(word)

        # Calculate Jaro-Winkler distance and filter based on threshold
        for similar_word in similar_words:
            similarity = distance.get_jaro_distance(word, similar_word)
            if similarity >= threshold:
                suggestions.append(similar_word)

        if suggestions:
            return suggestions[0]  # Return the first suggestion
        else:
            return None

### Generate Misspellings

In [13]:
class MisspelledWordGenerator:
    def __init__(self, threshold=0.2):
        self.threshold = threshold

    def generate(self, word):
        misspelled_word = []
        for char in word:
            if random.random() < self.threshold:
                misspelled_word.append(random.choice(string.ascii_lowercase))
            else:
                misspelled_word.append(char)
        return ''.join(misspelled_word)

### Weight Parameters

In [35]:
class WeightEvaluator:
    def __init__(self, trie_structure):
        self.trie_structure = trie_structure

    def evaluate_weight_combinations(self, test_data, prefix_weights, suffix_weights, similarity_weights):
        best_score = float('-inf')
        best_weights = (0, 0, 0)  # Initialize with default weights
        results = []

        for w1 in prefix_weights:
            for w2 in suffix_weights:
                for w3 in similarity_weights:
                    evaluator = EvaluationPerformance(self.trie_structure, (w1, w2, w3))
                    evaluation_results = evaluator.evaluate(test_data)
                    total_score = sum(overall_score for _, _, overall_score in evaluation_results)

                    if total_score > best_score:
                        best_score = total_score
                        best_weights = (w1, w2, w3)
                        results = evaluation_results

        return best_weights, results

### Performance Evaluation

In [53]:
class EvaluationPerformance:
    def __init__(self, trie_structure, weight_settings):
        self.trie_structure = trie_structure
        self.w1, self.w2, self.w3 = weight_settings

    def calculate_prefix_score(self, word, dictionary):
        # Calculate and return the prefix score for the word and dictionary
        prefix_score = 0
        for term in dictionary:
            if term.startswith(word):
                prefix_score += len(word)
        return prefix_score

    def calculate_suffix_score(self, word, dictionary):
        # Calculate and return the suffix score for the word and dictionary
        suffix_score = 0
        for term in dictionary:
            reversed_term = term[::-1]
            reversed_word = word[::-1]
            if reversed_term.startswith(reversed_word):
                suffix_score += len(reversed_word)
        return suffix_score

    def calculate_similarity_score(self, word, term):
        # Calculate and return the similarity score for the word and term
        lcs = self.LongestCommonSubsequence(word, term)
        similarity_score = (2 * lcs) / (len(word) + len(term))
        return similarity_score

    def LongestCommonSubsequence(self, text1, text2):
        m, n = len(text1), len(text2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if text1[i - 1] == text2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]

    def evaluate(self, test_data):
        results = []
        for query, correct_term in test_data:
            dictionary = self.trie_structure.get_all_words()  # Use get_all_words() to get dictionary terms
            prefix_score = self.calculate_prefix_score(query, dictionary)
            suffix_score = self.calculate_suffix_score(query, dictionary)
            similarity_score = self.calculate_similarity_score(query, dictionary)

            # Calculate the overall score based on weights w1, w2, and w3
            overall_score = (self.w1 * prefix_score) + (self.w2 * suffix_score) + (self.w3 * similarity_score)

            results.append((query, correct_term, overall_score))

        return results

In [78]:
#2
class EvaluationPerformance:
    def __init__(self, weight_settings):
        self.w1, self.w2, self.w3 = weight_settings
        self.radix_trie = RadixTrie()
        self.hashmap_trie = HashmapTrie()

    def calculate_prefix_score(self, word, dictionary):
        # Calculate and return the prefix score for the word and dictionary
        prefix_score = 0
        for term in dictionary:
            if term.startswith(word):
                prefix_score += len(word)
        return prefix_score

    def calculate_suffix_score(self, word, dictionary):
        # Calculate and return the suffix score for the word and dictionary
        suffix_score = 0
        for term in dictionary:
            reversed_term = term[::-1]
            reversed_word = word[::-1]
            if reversed_term.startswith(reversed_word):
                suffix_score += len(reversed_word)
        return suffix_score

    def calculate_similarity_score(self, word, term):
        # Calculate and return the similarity score for the word and term
        lcs = len(LongestCommonSubsequence(word, term))
        similarity_score = (2 * lcs) / (len(word) + len(term))
        return similarity_score

    def evaluate(self, test_data, weight_settings):
        results = []
        for query, correct_term in test_data:
            prefix_score = self.calculate_prefix_score(query, self.trie_structure.get_all_words())
            suffix_score = self.calculate_suffix_score(query, self.trie_structure.get_all_words())
            similarity_score = self.calculate_similarity_score(query, correct_term)

            w1, w2, w3 = weight_settings

            # Calculate the overall score based on weights w1, w2, and w3
            overall_score = (w1 * prefix_score) + (w2 * suffix_score) + (w3 * similarity_score)

            results.append((query, correct_term, overall_score))

        return results

In [82]:
def main():
    # Define your test data here
    test_data = [
        ("groand", "ground"),
        ("dayt", "date"),
        ("tomorow", "tomorrow"),
        ("request", "requtst"),
        ("cnnficmeq", "confirmed"),
        # ... add more test cases
    ]

    # Create instances of the trie structures and the autocorrect class
    radix_trie = RadixTrie()
    hashmap_trie = HashmapTrie()

    # Insert words into trie structures here if needed

    # Evaluation settings
    prefix_weights = [0.15, 0.20, 0.25, 0.30, 0.35]
    suffix_weights = [0.15, 0.20, 0.25, 0.30, 0.35]
    similarity_weights = [1.0, 1.1, 1.2]  # Include variations for similarity weight

    print("Weight Study Results:")
    print("{:<10} | {:<10} | {:<10} | {:<10} | {:<10}".format("w1", "w2", "w3", "Accuracy", "MMR"))
    print("-" * 70)

    for w1 in prefix_weights:
      for w2 in suffix_weights:
        for w3 in similarity_weights:
            weight_settings = (w1, w2, w3)  # Create a tuple with the weight settings
            evaluator = EvaluationPerformance(radix_trie, hashmap_trie, weight_settings)  # Pass trie structures and weight settings
            evaluation_results = evaluator.evaluate(test_data)

            accuracy_values = [result[2] for result in evaluation_results]  # Use index 2 for Accuracy
            mmr_values = [result[3] for result in evaluation_results]  # Use index 3 for MMR

            average_accuracy = sum(accuracy_values) / len(accuracy_values)
            average_mmr = sum(mmr_values) / len(mmr_values)

            print("{:<10} | {:<10} | {:<10} | {:<10.3f} | {:<10.3f}".format(w1, w2, w3, average_accuracy, average_mmr))


if __name__ == "__main__":
    main()


Weight Study Results:
w1         | w2         | w3         | Accuracy   | MMR       
----------------------------------------------------------------------


TypeError: ignored

In [59]:
# def main():
#     # Create instances of the trie structures and the autocorrect class
#     radix_trie = RadixTrie()
#     hashmap_trie = HashmapTrie()

#     # Evaluation settings (prefix_weight, suffix_weight, similarity_weight)
#     weight_settings_list = [
#         (0.15, 0.20, 1.1),
#         (0.20, 0.20, 1.1),
#         (0.25, 0.20, 1.1),
#         # ... add more weight settings
#     ]

#     print("Weight Study Results:")
#     print("{:<10} | {:<10} | {:<10}".format("w1", "Accuracy", "MMR"))
#     print("-" * 30)

#     test_data = [
#         ("groand", "ground"),
#         ("dayt", "date"),
#         ("tomorow", "tomorrow"),
#         # ... add more test data
#     ]

#     for weight_settings in weight_settings_list:
#         evaluator = EvaluationPerformance(radix_trie, weight_settings)
#         evaluation_results = evaluator.evaluate(test_data)

#         accuracy_values = [result[2] for result in evaluation_results]
#         mmr_values = [result[2] for result in evaluation_results]  # Use index 2 for MMR

#         average_accuracy = sum(accuracy_values) / len(accuracy_values)
#         average_mmr = sum(mmr_values) / len(mmr_values)

#         print("{:<10} | {:<10.3f} | {:<10.3f}".format(weight_settings[0], average_accuracy, average_mmr))

# if __name__ == "__main__":
#     main()

Weight Study Results:
w1         | Accuracy   | MMR       
------------------------------
0.15       | 0.000      | 0.000     
0.2        | 0.000      | 0.000     
0.25       | 0.000      | 0.000     


### **Main Function**

In [34]:
# def main():
#     # Read words from the train_data worksheet
#     train_words = train_data_sheet.col_values(1)

#     # Create instances of the trie structures and the autocorrect class
#     radix_trie = RadixTrie()
#     hashmap_trie = HashmapTrie()
#     radix_autocorrect = Autocorrect(radix_trie)
#     hashmap_autocorrect = Autocorrect(hashmap_trie)

#     # Evaluation settings (prefix_weight, suffix_weight, similarity_weight)
#     weight_settings = (0.25, 0.20, 1.1)

#     # Create an instance of the EvaluationPerformance class
#     evaluator = EvaluationPerformance(radix_trie, weight_settings)

#     # Test data for evaluation
#     test_data = [
#         ("PgeprjcessedgData", "ProcessedData"),
#         ("dtae", "date"),
#         ("contknt", "contact"),
#         # ... add more test cases
#     ]

#     # Evaluate and print results
#     evaluation_results = evaluator.evaluate(test_data)
#     for query, correct_term, overall_score in evaluation_results:
#         print(f"Query: {query}")
#         print(f"Correct Term: {correct_term}")
#         print(f"Overall Score: {overall_score}")
#         print("---")

# if __name__ == "__main__":
#     main()

Query: PgeprjcessedgData
Correct Term: ProcessedData
Overall Score: 0.0
---
Query: date
Correct Term: date
Overall Score: 0.0
---
Query: from
Correct Term: from
Overall Score: 0.0
---
Query: to
Correct Term: to
Overall Score: 0.0
---
Query: oysjeet
Correct Term: outset
Overall Score: 0.0
---
Query: contknt
Correct Term: contact
Overall Score: 0.0
---
Query: 0w
Correct Term: ow
Overall Score: 0.0
---
Query: 02
Correct Term: 02
Overall Score: 0.0
---
Query: 2023
Correct Term: 2023
Overall Score: 0.0
---
Query: raymznd
Correct Term: raymond
Overall Score: 0.0
---
Query: koh
Correct Term: koh
Overall Score: 0.0
---
Query: acrb
Correct Term: carb
Overall Score: 0.0
---


In [None]:
    # # Create an instance of the MisspelledWordGenerator class
    # generator = MisspelledWordGenerator()

    # # Collect preprocessed words for batch writes
    # radix_batch_data = []
    # hashmap_batch_data = []


    # Read all values from the original_data worksheet
    # data = sheet.get_all_values()

  # # Generate and save misspelled words to the test_data worksheet
  #   for word in train_words:
  #     misspelled_word = generator.generate(word)
  #     test_data_sheet.append_row([misspelled_word])

  #   print("Misspelled words generated and saved to test_data worksheet.")


    # # Preprocess
    # for row in data:
    #     for word in row:
    #         preprocessed_words = preprocess(word)
    #         for preprocessed_word in preprocessed_words:
    #             if preprocessed_word:  # Skip empty words after preprocessing
    #                 radix_trie.insert(preprocessed_word)
    #                 hashmap_trie.insert(preprocessed_word)

    #                 hashmap_trie.insert(preprocessed_word)
    #                 hashmap_batch_data.append([preprocessed_word])

    #                 # Use batch writes to save preprocessed words in one API call for each trie structure
    #                 # train_data_sheet.append_rows(radix_batch_data)
    #                 # train_data_sheet.append_rows(hashmap_batch_data)

    # # User input for autocorrection
    # while True:
    #     user_input = input("Enter a word (or type 'exit' to quit): ")
    #     if user_input.lower() == 'exit':
    #         break

    #     # Fetch correct words from the train_data sheet
    #     correct_words = train_data_sheet.col_values(1)  # Assuming correct words are in the first column

    #     # Autocorrect the user input using Levenshtein distance
    #     closest_match = None
    #     closest_distance = float('inf')
    #     for correct_word in correct_words:
    #         distance = Levenshtein.distance(user_input, correct_word)
    #         if distance < closest_distance:
    #             closest_distance = distance
    #             closest_match = correct_word

    #     # Display the closest match and append it to the train_data sheet
    #     print(f"Closest Match: {closest_match}")
    #     if closest_match != user_input:
    #         train_data_sheet.append_row([f"User Input: {user_input}, Closest Match: {closest_match}"])