<a href="https://colab.research.google.com/github/QueeneDelmarva/Thesis/blob/main/Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Thesis Code**

## Importing Libraries

In [1]:
# Connect to Gdrive
from google.colab import drive

# Connect to Gsheets
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Preprocessing
import re
import nltk

import string
from google.colab import auth
from google.auth import default

## Connecting to Gdrive

In [2]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# (such as the service-account key file read in the GSheets cell below)
# are reachable from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Connecting to GSheets

In [3]:
# Location of the service-account key file on the mounted Google Drive.
# NOTE(review): consider supplying this path through configuration (e.g. an
# environment variable) rather than keeping it inline in the notebook.
SERVICE_ACCOUNT_KEY_PATH = '/content/drive/MyDrive/skripsi-394804-a98ea9715aa4.json'

# OAuth scopes requested for the service account, then the credentials
# themselves, loaded from the key file above.
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(SERVICE_ACCOUNT_KEY_PATH, scope)

# Authenticate with gspread
client = gspread.authorize(creds)

# Open the spreadsheet once, by its key, and reuse the handle for every
# worksheet. (Previously the spreadsheet was opened twice, costing an extra
# API round-trip for the same document.)
spreadsheet_key = '1RlZ8-XwwEueuCAq4nXGwq3ZmPDE-_0gP3PsroKeNFGo'
spreadsheet = client.open_by_key(spreadsheet_key)

# First worksheet: the source data that gets read.
sheet = spreadsheet.sheet1

# 'train_data' worksheet: the destination the results are written to.
train_data_sheet = spreadsheet.worksheet('train_data')

# Example reads, kept for reference:
# # 1. Read an individual cell
# cell_value = sheet.cell(3, 5).value
# print(cell_value)
#
# # 2. Read all values from the first worksheet and print them
# data = sheet.get_all_values()
# for row in data:
#     print(row)

## Preprocessing Data

In [4]:
# preprocess_dictonary = ['Noted', 'and', 'thanks', 'Please', 'as', 'follows', 'Copied']

def preprocess(word):
    """Split a text into lowercase alphanumeric tokens.

    Args:
        word: The raw text to tokenize (a string).

    Returns:
        A list of lowercased tokens, in order of appearance. A token is a
        run of ASCII letters/digits that is delimited by word boundaries on
        both sides; runs glued to an underscore are therefore not matched.
    """
    # Whole runs of ASCII letters and digits, bounded by \b on each side.
    token_pattern = r'\b(?:[a-zA-Z0-9]+)\b'

    # A keyword-dictionary augmentation step (see the commented-out
    # `preprocess_dictonary` list above this function) was prototyped here
    # but is disabled; only the regex tokens are returned.
    return [token.lower() for token in re.findall(token_pattern, word)]

## **Radix Trie**
---
Classes:
*   Node (`RadixTrieNode`, `HashmapTrieNode`)
*   Trie (`RadixTrie`, `HashmapTrie`)

### Radix Trie Node

In [5]:
# Defines a class representing a single node in the radix trie, holding information about its children and whether it marks the end of a word.
class RadixTrieNode:
    """One node of the trie built by ``RadixTrie``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = dict()
        # True when the path from the root to this node spells a stored word.
        self.is_end_of_word = False

### Hashmap-based Trie Node

In [6]:
# Defines a class representing a single node in the hashmap-based trie, holding information about its children and whether it marks the end of a word.
class HashmapTrieNode:
    """One node of the trie built by ``HashmapTrie``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Hashmap from the next character to the corresponding child node.
        self.children = dict()
        # Set to True once a stored word terminates at this node.
        self.is_end_of_word = False

### Radix Trie Function

In [7]:
# Defines a class that implements a radix trie data structure, allowing efficient storage and retrieval of a set of strings with common prefixes.
class RadixTrie:
    """Prefix tree over strings with insertion and prefix lookup.

    NOTE(review): every edge carries exactly one character (see ``insert``),
    so structurally this is a plain character trie; a radix trie would
    additionally merge chains of single-child nodes into one edge.
    """

    def __init__(self):
        self.root = RadixTrieNode()

    def insert(self, word):
        """Store ``word``, creating one node per character where missing.

        Args:
            word: The string to add. Inserting an empty string marks the
                root itself as the end of a word.
        """
        node = self.root
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = RadixTrieNode()
                node.children[char] = child
            node = child
        node.is_end_of_word = True

    def search(self, prefix):
        """Return every stored word that starts with ``prefix``.

        Args:
            prefix: The leading characters to match; ``""`` matches all words.

        Returns:
            A list of matching words, or an empty list when no stored word
            has this prefix.
        """
        node = self.root
        for char in prefix:
            node = node.children.get(char)
            if node is None:
                return []  # Prefix not found
        return self.collect_words(node, prefix)

    def collect_words(self, node, current_prefix):
        """Gather all words stored in the subtree rooted at ``node``.

        The traversal is iterative (explicit stack) rather than recursive so
        that very long stored words cannot exceed Python's recursion limit.

        Args:
            node: Subtree root to start from.
            current_prefix: The string spelled by the path leading to ``node``.

        Returns:
            A list of words in depth-first pre-order, visiting children in
            the order they were inserted.
        """
        results = []
        pending = [(node, current_prefix)]
        while pending:
            current, word = pending.pop()
            if current.is_end_of_word:
                results.append(word)
            # Push children reversed so the first-inserted child is popped
            # (and therefore visited) first, keeping pre-order output.
            for char, child in reversed(list(current.children.items())):
                pending.append((child, word + char))
        return results

    def get_all_words(self):
        """Return every word stored in the trie."""
        return self.collect_words(self.root, "")

### Hashmap-based Trie

In [8]:
# Defines a class that represents a trie data structure using a hashmap to store child nodes, allowing efficient storage and retrieval of a set of strings with common prefixes.
class HashmapTrie:
    """Prefix tree whose nodes keep their children in a hashmap (dict).

    Supports inserting words and listing all stored words that share a
    given prefix.
    """

    def __init__(self):
        # Must be HashmapTrieNode: the previously referenced name `TrieNode`
        # is not defined anywhere in the notebook and raised NameError.
        self.root = HashmapTrieNode()

    def insert(self, word):
        """Store ``word``, creating one node per character where missing.

        Args:
            word: The string to add. Inserting an empty string marks the
                root itself as the end of a word.
        """
        node = self.root
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = HashmapTrieNode()
                node.children[char] = child
            node = child
        node.is_end_of_word = True

    def search(self, prefix):
        """Return every stored word that starts with ``prefix``.

        Args:
            prefix: The leading characters to match; ``""`` matches all words.

        Returns:
            A list of matching words, or an empty list when no stored word
            has this prefix.
        """
        node = self.root
        for char in prefix:
            node = node.children.get(char)
            if node is None:
                return []  # Prefix not found
        return self.collect_words(node, prefix)

    def collect_words(self, node, current_prefix):
        """Gather all words stored in the subtree rooted at ``node``.

        The traversal is iterative (explicit stack) rather than recursive so
        that very long stored words cannot exceed Python's recursion limit.

        Args:
            node: Subtree root to start from.
            current_prefix: The string spelled by the path leading to ``node``.

        Returns:
            A list of words in depth-first pre-order, visiting children in
            the order they were inserted.
        """
        results = []
        pending = [(node, current_prefix)]
        while pending:
            current, word = pending.pop()
            if current.is_end_of_word:
                results.append(word)
            # Push children reversed so the first-inserted child is popped
            # (and therefore visited) first, keeping pre-order output.
            for char, child in reversed(list(current.children.items())):
                pending.append((child, word + char))
        return results

    def get_all_words(self):
        """Return every word stored in the trie."""
        return self.collect_words(self.root, "")

### **Main Function**

In [11]:
def _extract_words(data):
    """Flatten sheet rows into the ordered list of preprocessed tokens."""
    return [
        token
        for row in data
        for cell_text in row
        for token in preprocess(cell_text)
        if token  # Skip empty words after preprocessing
    ]


def main(sheet, output_sheet=None):
    """Build both tries from ``sheet`` and write tokens and search results out.

    Steps:
      1. Read every cell of ``sheet`` and tokenize it with ``preprocess``.
      2. Insert every token into a ``RadixTrie`` and a ``HashmapTrie``.
      3. Append the tokens (one per row) to the output worksheet.
      4. For each search word, query both tries and write the concatenated
         results into its own column, starting at column B.

    All writes are batched. The earlier version issued one API request per
    token (and did so in two separate loops, so each token was appended
    twice), which is a likely cause of the ``APIError`` seen when running
    the notebook. It also never persisted the search results: it set
    ``.value`` on freshly fetched cell objects and then wrote back an
    unrelated, unmodified ``B:B`` range.

    Args:
        sheet: Worksheet to read from; must provide ``get_all_values()``.
        output_sheet: Worksheet to write to; must provide ``append_rows()``
            and ``update_cells()``. Defaults to the notebook-level
            ``train_data_sheet``.

    Raises:
        Whatever the gspread calls raise (e.g. API errors) is propagated.
    """
    if output_sheet is None:
        output_sheet = train_data_sheet

    # Read all values from the original_data worksheet
    data = sheet.get_all_values()
    words = _extract_words(data)

    # Build both tries in a single pass over the tokens.
    radix_trie = RadixTrie()
    hashmap_trie = HashmapTrie()
    for word in words:
        radix_trie.insert(word)
        hashmap_trie.insert(word)

    # Save the preprocessed words to the output sheet in one request.
    if words:
        output_sheet.append_rows([[word] for word in words])

    # Define the words you want to search for ("" matches every stored word).
    search_words = [""]

    # Results for the first search word go to column 2 (B), the next to C, ...
    first_result_column = 2

    cells_to_write = []
    for offset, search_word in enumerate(search_words):
        # Perform search operations on both trie structures
        radix_results = radix_trie.search(search_word)
        hashmap_results = hashmap_trie.search(search_word)

        # Combine the results from both trie structures
        all_results = radix_results + hashmap_results

        column_index = first_result_column + offset
        cells_to_write.extend(
            gspread.Cell(row_index, column_index, result)
            for row_index, result in enumerate(all_results, start=1)
        )

    # Persist all result cells with a single batched update.
    # NOTE(review): if the results extend past the worksheet's current grid
    # size, the sheet may need to be resized first — confirm against the
    # gspread/Sheets API behaviour for the deployed version.
    if cells_to_write:
        output_sheet.update_cells(cells_to_write)


if __name__ == "__main__":
    main(sheet)

APIError: ignored