## Dependencies

In [69]:
!pip install clean-text



In [70]:
import numpy as np
import pandas as pd
import re
from cleantext import clean
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Parsing JSON File

In [71]:
#! load data given file path and number of line to load
def load_data(file_path,num_lines):
    data = []
    with open(file_path, 'r') as file: 
        for _ in range(num_lines):
            data.append(json.loads(file.readline())) 
    return data

In [72]:
data=load_data('PIZZA_train.json',10000)
print(data)

[{'train.SRC': 'can i have a party - sized pie without any bean', 'train.EXR': '(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZE ) (NOT (TOPPING BEANS ) ) ) )', 'train.TOP': '(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pie without any (NOT (TOPPING bean ) ) ) )', 'train.TOP-DECOUPLED': '(ORDER (PIZZAORDER (NUMBER a ) (SIZE party - sized ) (NOT (TOPPING bean ) ) ) )'}, {'train.SRC': "i'd like one lunch - sized pizza without any caramelized red onions", 'train.EXR': '(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LUNCH_SIZE ) (NOT (TOPPING CARAMELIZED_ONIONS ) ) ) )', 'train.TOP': "(ORDER i'd like (PIZZAORDER (NUMBER one ) (SIZE lunch - sized ) pizza without any (NOT (TOPPING caramelized red onions ) ) ) )", 'train.TOP-DECOUPLED': '(ORDER (PIZZAORDER (NUMBER one ) (SIZE lunch - sized ) (NOT (TOPPING caramelized red onions ) ) ) )'}, {'train.SRC': 'can i have a large bbq pulled pork', 'train.EXR': '(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) ) )', 'tra

In [73]:
#! split json data
def get_training_data(data):
    training_data = []
    training_exr=[]
    training_top=[]
    training_top_dec=[]
    for item in data:
        training_data.append(item['train.SRC'])
        training_exr.append(item['train.EXR'])
        training_top.append(item['train.TOP'])
        training_top_dec.append(item['train.TOP-DECOUPLED'])
    return training_data,training_exr,training_top,training_top_dec
training_data,training_exr,training_top,training_top_dec=get_training_data(data)
print(training_data[:2])
print(training_exr[:2])
print(training_top[:2])
print(training_top_dec[:2])

['can i have a party - sized pie without any bean', "i'd like one lunch - sized pizza without any caramelized red onions"]
['(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZE ) (NOT (TOPPING BEANS ) ) ) )', '(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LUNCH_SIZE ) (NOT (TOPPING CARAMELIZED_ONIONS ) ) ) )']
['(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pie without any (NOT (TOPPING bean ) ) ) )', "(ORDER i'd like (PIZZAORDER (NUMBER one ) (SIZE lunch - sized ) pizza without any (NOT (TOPPING caramelized red onions ) ) ) )"]
['(ORDER (PIZZAORDER (NUMBER a ) (SIZE party - sized ) (NOT (TOPPING bean ) ) ) )', '(ORDER (PIZZAORDER (NUMBER one ) (SIZE lunch - sized ) (NOT (TOPPING caramelized red onions ) ) ) )']


In [74]:
#! a function to handle negations
def handle_negations(text):
    #! negations are based on training set
    negations_pattern = r"\b(?:no|not|without)\s+.*?\b(?=(?:[^\w\s]|$))"
    # print(re.findall(negations_pattern, text))
    text = re.sub(negations_pattern, lambda x: ' '.join([f'not_{word}' for word in x.group(0).split()]), text)
    return text

In [75]:
lemmatizer = WordNetLemmatizer()
#! stopwords list (adding not_)
stopwords = set(stopwords.words('english'))
stopwords.add('like')
not_stopwords = ['not_' + word for word in stopwords]
stopwords.update(not_stopwords)
stopwords.discard('a')
stopwords.discard('an')
stopwords.discard('not')
stopwords.discard('no')
stopwords.discard('can')
stopwords.discard('not_a')
stopwords.discard('not_an')
stopwords.discard('not_can')
stopwords.discard('not_no')

In [76]:
#! tokenize training data and remove stop words
def preprocess_training_data(training_data, stopwords):
    # training_data = [handle_negations(order) for order in training_data]
    training_data = [word_tokenize(order) for order in training_data]
    training_data = [[word.lower() for word in order if word.lower() not in stopwords] for order in training_data]
    training_data = [clean(order, no_line_breaks=True, no_punct=True, no_currency_symbols=True) for order in training_data]
    #! remove d letter most probably garbage
    training_data = [re.sub(r'\bd\s+', '', order) for order in training_data]
    #! remove "can" at the beginning of the sentence
    training_data=[re.sub(r'^can\s+', '', order) for order in training_data]
    training_data = [word_tokenize(order) for order in training_data]
    training_data = [[lemmatizer.lemmatize(word) for word in order] for order in training_data]
    return training_data
training_data = preprocess_training_data(training_data, stopwords)
print(training_data)

[['a', 'party', 'sized', 'pie', 'without', 'bean'], ['one', 'lunch', 'sized', 'pizza', 'without', 'caramelized', 'red', 'onion'], ['a', 'large', 'bbq', 'pulled', 'pork'], ['one', 'high', 'rise', 'dough', 'pie', 'american', 'cheese', 'a', 'lot', 'meatball'], ['a', 'party', 'sized', 'high', 'rise', 'dough', 'pie', 'a', 'lot', 'banana', 'pepper', 'pecorino', 'cheese'], ['four', 'two', 'liter', 'ice', 'tea', 'can'], ['large', 'pie', 'green', 'pepper', 'extra', 'peperonni'], ['a', 'large', 'vegetarian', 'pizza'], ['party', 'size', 'stuffed', 'crust', 'pie', 'american', 'cheese', 'mushroom'], ['one', 'personal', 'sized', 'artichoke'], ['pie', 'banana', 'pepper', 'peppperonis', 'extra', 'low', 'fat', 'cheese'], ['want', 'one', 'regular', 'pizza', 'without', 'fried', 'onion'], ['want', 'a', 'stuffed', 'crust', 'pizza', 'american', 'cheese', 'a', 'little', 'bit', 'peperonni'], ['one', 'party', 'sized', 'high', 'rise', 'dough', 'pizza', 'american', 'cheese', 'a', 'lot', 'peperonni'], ['pie', 'gr

In [77]:
#! save processed training dataset
with open('training_data_processed.txt', 'w') as f:
    for item in training_data:
        f.write("%s\n" % item)

In [78]:
#! a utility function for extra parentheses ) removal 
#! handles COMPLEX_TOPPING, NOT,... parenthesis cases
def remove_unmatched_parentheses(input_string):
    result = list(input_string)  # Convert to list for mutability

    # First pass: identify unmatched closing parentheses
    for i, char in enumerate(result):
        if char == ')' and i+2 < len(result) and result[i+2] != '(':
            result[i] = ''  # Mark unmatched ')' for removal
    return ''.join(result)

In [79]:
#! get PIZZAORDER, DRINKORDER, NONE Labels 0=>PIZZAORDER, 1=>DRINKORDER, 2=>NONE

def get_order_category_labels(training_top, training_data, stopwords):
    order_category_labels = []
    for i, item in enumerate(training_top):
        order_category_labels.append([2] * len(training_data[i]))
        unwanted_keywords = r"\b(ORDER|SIZE|STYLE|TOPPING|COMPLEX_TOPPING|QUANTITY|NOT|NUMBER|DRINKTYPE|CONTAINERTYPE|VOLUME)\b"
        cleaned_string = re.sub('\('+unwanted_keywords, "", item)
        cleaned_string = [word for word in cleaned_string.split() if word.lower() not in stopwords]
        cleaned_string = ' '.join(cleaned_string)
        cleaned_string = remove_unmatched_parentheses(cleaned_string)
        order_regex = r"\((?:PIZZAORDER|DRINKORDER).*?\)"
        extracted_orders = re.findall(order_regex, cleaned_string)
        k = 0
        for order in extracted_orders:
            order = re.sub(r"[\(\)]", "", order)
            order=clean(order, no_line_breaks=True, no_punct=True, no_currency_symbols=True)
            tokens = word_tokenize(order)
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
            j = 0
            if 'pizzaorder' in tokens:
                tokens.remove('pizzaorder')
                for word in training_data[i]:
                    if j == len(tokens):
                        break
                    if word == tokens[j]:
                        order_category_labels[i][k] = 0
                        j += 1
                    k += 1
            elif 'drinkorder' in tokens:
                tokens.remove('drinkorder')
                for word in training_data[i]:
                    if j == len(tokens):
                        break
                    if word == tokens[j]:
                        order_category_labels[i][k] = 1
                        j += 1
                    k += 1
    return order_category_labels

order_category_labels = get_order_category_labels(training_top, training_data, stopwords)
print(order_category_labels)
    

[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 

In [80]:
for labels in order_category_labels:
    with open('order_category_labels.txt', 'a') as f:
        f.write("%s\n" % labels)