<a href="https://colab.research.google.com/github/QueeneDelmarva/Thesis/blob/main/Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Thesis Code**

## Importing Libraries

In [3]:
# Connect to Gdrive
from google.colab import drive

# Connect to Gsheets
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Preprocessing
import re
import nltk

# Edit Distance
import Levenshtein

# GUI
import tkinter as tk
import tkinter.font as tkFont

import string
from google.colab import auth
from google.auth import default

## Connecting to Gdrive

In [4]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


## Connecting to GSheets

In [5]:
# Define the scope and create the credentials using the file in your Google Drive
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('/content/drive/MyDrive/skripsi-394804-a98ea9715aa4.json', scope)

# Authenticate with gspread
client = gspread.authorize(creds)

# Open the Google Sheet by its title or URL
spreadsheet_key = '1RlZ8-XwwEueuCAq4nXGwq3ZmPDE-_0gP3PsroKeNFGo'
sheet = client.open_by_key(spreadsheet_key).sheet1

# Open the train_data Google Sheet by its title or URL for writing
train_data_sheet = client.open_by_key(spreadsheet_key).worksheet('train_data')

# ## Read the data from the Gsheets
# # 1. Read individual cells
# cell_value = sheet.cell(3, 5).value
# print(cell_value)

# # 2. Read all values from the first worksheet
# data = sheet.get_all_values()

# # Print the data
# for row in data:
#     print(row)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing Data

In [7]:
# preprocess_dictonary = ['Noted', 'and', 'thanks', 'Please', 'as', 'follows', 'Copied']

def preprocess(word):
    # Separate words based on non-alphabetic characters
    words = re.findall(r'\b(?:[a-zA-Z0-9]+)\b', word)

    # Convert words to lowercase
    lowercase_words = [w.lower() for w in words]

    return lowercase_words

    # Add preprocess dictonary to the list of preprocessed words
    # preprocessed_words = lowercase_words + [g for g in preprocess_dictonary if g in word.lower()]

    # return preprocessed_words

### Radix Trie Node

In [8]:
# Defines a class representing a single node in the radix trie, holding information about its children and whether it marks the end of a word.
class RadixTrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

### Hashmap-based Trie Node

In [9]:
# Defines a class representing a single node in the radix trie, holding information about its children and whether it marks the end of a word.
class HashmapTrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

### Radix Trie Function

In [10]:
# Defines a class that implements a radix trie data structure, allowing efficient storage and retrieval of a set of strings with common prefixes.
class RadixTrie:
    def __init__(self):
        self.root = RadixTrieNode()

    # Adds a word to the Radix Trie by creating or traversing the appropriate nodes for each character in the word.
    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = RadixTrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    # Finds all words in the Radix Trie that have the given prefix, returning them as a list of results.
    def search(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []  # Prefix not found, return an empty list
            node = node.children[char]
        return self.collect_words(node, prefix)

   # Recursively explores the Radix Trie's child nodes and appends the complete words to the results list when reaching the end of a word.
    def collect_words(self, node, current_prefix):
        results = []
        if node.is_end_of_word:
            results.append(current_prefix)
        for char, child in node.children.items():
            word = current_prefix + char
            results.extend(self.collect_words(child, word))
        return results

    def get_all_words(self):
        return self.collect_words(self.root, "")

### Hashmap-based Trie

In [11]:
# Define a class represents a trie data structure that uses a hashmap to store children nodes, allowing efficient storage and retrieval of a set of strings with common prefixes.
class HashmapTrie:
    def __init__(self):
        self.root = HashmapTrieNode()

# Adds a word to the trie by traversing and creating nodes for each character in the word and marking the last node as the end of the word.
    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = HashmapTrieNode()
            node = node.children[char]
        node.is_end_of_word = True

# Checks if a given prefix exists in the trie by traversing the nodes based on the characters of the prefix and returns whether the last node reached marks the end of a word.
    def search(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []  # Prefix not found, return an empty list
            node = node.children[char]
        return self.collect_words(node, prefix)

# Recursively traverses its child nodes and appends the complete words to the results list when reaching the end of a word.
    def collect_words(self, node, current_prefix):
        results = []
        if node.is_end_of_word:
            results.append(current_prefix)
        for char, child in node.children.items():
            word = current_prefix + char
            results.extend(self.collect_words(child, word))
        return results

# Returns all words stored in the trie by collecting words
    def get_all_words(self):
        return self.collect_words(self.root, "")

### Autocorrect with Levenshtein Edit Distance

In [27]:
class Autocorrect:
    def __init__(self, trie, threshold=2):
        self.trie = trie
        self.threshold = threshold

    def correct(self, word):
        suggestions = []
        self._correct_recursive(self.trie.root, "", word, self.threshold, suggestions)
        if suggestions:
            return suggestions[0]
        return None

    def _correct_recursive(self, node, prefix, target, distance_threshold, suggestions):
      if node.is_end_of_word:
        distance = self._calculate_distance(prefix, target)
        if distance <= distance_threshold:
          suggestions.append(prefix)

      if not target or target[0] not in node.children:
        return

      child = node.children[target[0]]
      self._correct_recursive(child, prefix + target[0], target[1:], distance_threshold, suggestions)
      self._correct_recursive(child, prefix, target[1:], distance_threshold, suggestions)
      self._correct_recursive(child, prefix + target[0] + target[1:], target[2:], distance_threshold, suggestions)

    def _calculate_distance(self, source, target):
        m, n = len(source), len(target)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(m + 1):
            dp[i][0] = i

        for j in range(n + 1):
            dp[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if source[i - 1] == target[j - 1]:
                    cost = 0
                else:
                    cost = 1
                dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

        return dp[m][n]


### **Main Function**

In [24]:
def main():
    # Read all values from the original_data worksheet
    data = sheet.get_all_values()


    # Create instances of the trie structures and the autocorrect class
    radix_trie = RadixTrie()
    hashmap_trie = HashmapTrie()
    radix_autocorrect = Autocorrect(radix_trie, threshold=3)
    hashmap_autocorrect = Autocorrect(hashmap_trie, threshold=3)

    # Collect preprocessed words for batch writes
    radix_batch_data = []
    hashmap_batch_data = []

    # Take user input
    user_input = input("Enter a word: ")

    # Autocorrect the user input
    corrected_word_radix = radix_autocorrect.correct(user_input)
    corrected_word_hashmap = hashmap_autocorrect.correct(user_input)

    # Display autocorrected results
    print(f"Radix Autocorrect: {corrected_word_radix}")
    print(f"Hashmap Autocorrect: {corrected_word_hashmap}")


    # # Loop
    # for row in data:
    #     for word in row:
    #         preprocessed_words = preprocess(word)
    #         for preprocessed_word in preprocessed_words:
    #             if preprocessed_word:  # Skip empty words after preprocessing
    #                 radix_trie.insert(preprocessed_word)
    #                 hashmap_trie.insert(preprocessed_word)

    #                 hashmap_trie.insert(preprocessed_word)
    #                 hashmap_batch_data.append([preprocessed_word])

    # # Use batch writes to save preprocessed words in one API call for each trie structure
    # # train_data_sheet.append_rows(radix_batch_data)
    # # train_data_sheet.append_rows(hashmap_batch_data)

    #                 # Use the Autocorrect instances
    #                 corrected_word_radix = radix_autocorrect.correct(preprocessed_word)
    #                 corrected_word_hashmap = hashmap_autocorrect.correct(preprocessed_word)

    #                 if corrected_word_radix:
    #                     train_data_sheet.append_row([f"Radix Autocorrect: {corrected_word_radix}"])

    #                 if corrected_word_hashmap:
    #                     train_data_sheet.append_row([f"Hashmap Autocorrect: {corrected_word_hashmap}"])

if __name__ == "__main__":
    main()

Enter a word: Stoop
Radix Autocorrect: None
Hashmap Autocorrect: None
