In [20]:
import numpy as np
import pandas as pd
import re
import inflect
from cleantext import clean
import json
import ast
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random
# Download the 'stopwords', 'punkt' and 'wordnet' NLTK data packages.
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
def load_data_labels(data_path, labels_path):
    """Load two parallel line-oriented files of Python literals.

    Every non-blank line of each file is parsed with ``ast.literal_eval``
    (which only accepts literals, so no arbitrary code is executed).

    Args:
        data_path: Path of the file holding one literal per line for the data.
        labels_path: Path of the file holding one literal per line for the labels.

    Returns:
        A ``(data, labels)`` tuple of lists with one parsed entry per
        non-blank line of the respective file.

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    def _read_literals(path):
        # Blank lines (typically a trailing newline at the end of the file)
        # are skipped: ast.literal_eval('') would raise SyntaxError.
        # The encoding is pinned so the result does not depend on the
        # platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            return [ast.literal_eval(text) for text in stripped_lines if text]

    data = _read_literals(data_path)
    labels = _read_literals(labels_path)
    return data, labels

In [22]:
# Load the training sentences and their label sequences, then preview the first five of each.
data, labels = load_data_labels(
    'training_data_processed.txt',
    'train_order_category_labels.txt',
)
print(data[:5], labels[:5], sep='\n')

[['can', 'i', 'have', 'one', 'large', 'bbq', 'pull', 'pork'], ['large', 'pie', 'with', 'green', 'pepper', 'and', 'with', 'extra', 'peperonni'], ['i', 'like', 'one', 'large', 'vegetarian', 'pizza'], ['party', 'size', 'stuff', 'crust', 'pie', 'with', 'american', 'cheese', 'and', 'with', 'mushroom'], ['can', 'i', 'have', 'one', 'personal', 'size', 'artichoke']]
[[2, 2, 2, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 0, 0, 0, 0]]


In [23]:
# Load the dev sentences and their label sequences, then preview the first five of each.
dev_data, dev_labels = load_data_labels(
    'dev_data_processed.txt',
    'dev_order_category_labels.txt',
)
print(dev_data[:5], dev_labels[:5], sep='\n')

[['i', 'want', 'to', 'order', 'two', 'medium', 'pizza', 'with', 'sausage', 'and', 'black', 'olive', 'and', 'two', 'medium', 'pizza', 'with', 'pepperoni', 'and', 'extra', 'cheese', 'and', 'three', 'large', 'pizza', 'with', 'pepperoni', 'and', 'sausage'], ['five', 'medium', 'pizza', 'with', 'tomato', 'and', 'ham'], ['i', 'need', 'to', 'order', 'one', 'large', 'vegetarian', 'pizza', 'with', 'extra', 'banana', 'pepper'], ['i', 'like', 'to', 'order', 'one', 'large', 'onion', 'and', 'pepper', 'pizza'], ['i', 'll', 'have', 'one', 'pie', 'along', 'with', 'pesto', 'and', 'ham', 'but', 'avoid', 'olive']]
[[2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0], [2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [24]:
# Read the vocabulary file: one entry per line, surrounding whitespace
# stripped, blank lines dropped.
with open('vocab.txt', 'r') as f:
    lines = list(f)
vocab = list(filter(None, map(str.strip, lines)))
print(vocab[:5])

['put', 'hello', 'dough', 'fifteen', 'no']


In [28]:
#! Randomly perturb the tokens of `data`, each with a small probability.
import string
# Chance that any individual token is passed through modify_token.
mod_probability = 0.05
def modify_token(token, rng=None):
    """Return ``token`` with one to three random single-character edits applied.

    Each edit is picked uniformly between removing one character and
    inserting one random lowercase ASCII letter at a random position.
    A removal is skipped (that edit becomes a no-op) when the token has at
    most one character, so removal never produces an empty string.

    Args:
        token: The string to perturb.
        rng: Optional randomness source exposing ``randint`` and ``choice``,
            e.g. a seeded ``random.Random`` for reproducible runs. Defaults
            to the global ``random`` module, which is what this function
            always used before the parameter existed.

    Returns:
        The perturbed string.
    """
    if rng is None:
        rng = random

    num_mods = rng.randint(1, 3)
    for _ in range(num_mods):
        operation = rng.choice(['remove', 'add'])

        if operation == 'remove':
            # Only shrink tokens that still have more than one character.
            if len(token) > 1:
                idx_to_remove = rng.randint(0, len(token) - 1)
                token = token[:idx_to_remove] + token[idx_to_remove + 1:]
        else:
            random_letter = rng.choice(string.ascii_lowercase)
            # randint is inclusive, so the letter may also be appended at the end.
            idx_to_add = rng.randint(0, len(token))
            token = token[:idx_to_add] + random_letter + token[idx_to_add:]

    return token

# Walk over every token of every sentence and, with probability
# mod_probability, replace it in place by a perturbed version.
for sentence in data:
    for position, token in enumerate(sentence):
        if random.random() < mod_probability:
            sentence[position] = modify_token(token)
print(data[:5])


[['can', 'ijk', 'hav', 'one', 'large', 'bbq', 'pull', 'pork'], ['large', 'pie', 'with', 'green', 'pepper', 'and', 'with', 'extra', 'peperonni'], ['i', 'like', 'nen', 'large', 'vegetarian', 'piza'], ['party', 'size', 'stuff', 'crust', 'spie', 'waiteh', 'wnamercan', 'cheese', 'and', 'with', 'mushroom'], ['can', 'i', 'have', 'one', 'personal', 'size', 'artichoke']]


In [29]:
def damerau_levenshtein_distance(word1, word2):
    """Edit distance between two strings with adjacent swaps counted as one edit.

    Allowed unit-cost edits are deletion, insertion, substitution and the
    transposition of two adjacent characters. The transposition step only
    looks back at the cell two rows and two columns away, i.e. this is the
    restricted ("optimal string alignment") form of the distance.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        The integer number of edits needed to turn ``word1`` into ``word2``.
    """
    rows, cols = len(word1) + 1, len(word2) + 1

    # table[r][c] holds the distance between word1[:r] and word2[:c].
    # Against an empty prefix the distance is simply the other prefix's length.
    table = [[0] * cols for _ in range(rows)]
    table[0] = list(range(cols))
    for r in range(rows):
        table[r][0] = r

    for r, char1 in enumerate(word1, start=1):
        for c, char2 in enumerate(word2, start=1):
            deletion = table[r - 1][c] + 1
            insertion = table[r][c - 1] + 1
            substitution = table[r - 1][c - 1] + (0 if char1 == char2 else 1)
            best = min(deletion, insertion, substitution)

            # Adjacent transposition: the last two characters of both
            # prefixes are the same pair in swapped order.
            if r > 1 and c > 1 and char1 == word2[c - 2] and word1[r - 2] == char2:
                best = min(best, table[r - 2][c - 2] + 1)

            table[r][c] = best

    return table[-1][-1]
def find_closest_match(input_word, vocab):
    """Find the vocabulary entry with the smallest edit distance to a word.

    Distances are computed with ``damerau_levenshtein_distance``. When several
    entries share the smallest distance, the one that appears first in
    ``vocab`` is returned.

    Args:
        input_word: The word to look up.
        vocab: Iterable of candidate words.

    Returns:
        A ``(closest_word, min_distance)`` tuple. For an empty ``vocab`` this
        is ``(None, float('inf'))``.
    """
    min_distance = float('inf')
    closest_word = None

    for word in vocab:
        distance = damerau_levenshtein_distance(input_word, word)
        if distance < min_distance:
            min_distance = distance
            closest_word = word
            if distance == 0:
                # Exact match: no later entry can be strictly closer, so the
                # rest of the vocabulary does not need to be scanned.
                break

    return closest_word, min_distance

In [32]:
# Replace each token of the first five sentences by its closest vocabulary
# entry when that entry is within MAX_CORRECTION_DISTANCE edits.
# Slicing (instead of indexing range(5)) keeps this cell working when `data`
# holds fewer than five sentences, matching the print below.
# NOTE(review): a threshold of 3 is large compared to short tokens; the
# output of this cell shows e.g. 'piza' -> 'pica' and 'nen' -> 'new'.
# Consider a tighter or length-dependent threshold.
MAX_CORRECTION_DISTANCE = 3
for sentence in data[:5]:
    for position, token in enumerate(sentence):
        closest_word, min_distance = find_closest_match(token, vocab)
        if min_distance <= MAX_CORRECTION_DISTANCE:
            sentence[position] = closest_word
print(data[:5])

[['can', 'it', 'have', 'one', 'large', 'bbq', 'pull', 'pork'], ['large', 'pie', 'with', 'green', 'pepper', 'and', 'with', 'extra', 'peperonni'], ['i', 'like', 'new', 'large', 'vegetarian', 'pica'], ['party', 'size', 'stuff', 'crust', 'spice', 'water', 'american', 'cheese', 'and', 'with', 'mushroom'], ['can', 'i', 'have', 'one', 'personal', 'size', 'artichoke']]
