## Dependencies

In [57]:
!pip install clean-text




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
import numpy as np
import pandas as pd
import re
from cleantext import clean
import json
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
datatype = 'train' 

path = (
    'PIZZA_train.json' if datatype == 'train' else
    'PIZZA_dev.json' if datatype == 'dev' else
    'PIZZA_test.json' if datatype == 'test' else
    'unknown.json'  # Optional fallback
)

lines = (
    10000 if datatype == 'train' else
    348 if datatype == 'dev' else
    1357 if datatype == 'test' else
    0  
)

print(f"Path: {path}")
print(f"Lines: {lines}")


Path: PIZZA_train.json
Lines: 10000


## Parsing JSON File

In [60]:
#! load data given file path and number of line to load
def load_data(file_path,num_lines):
    data = []
    with open(file_path, 'r') as file: 
        for _ in range(num_lines):
            data.append(json.loads(file.readline())) 
    return data




def load_random_data(file_path, num_lines):
    data = []
    with open(file_path, 'r') as file:
        file.seek(0, 2)  # Move to the end of the file to get its size
        file_size = file.tell()
        
        for _ in range(num_lines):
            while True:
                # Pick a random position in the file
                random_pos = random.randint(0, file_size - 1)
                file.seek(random_pos)
                
                # Read to the end of the current line to avoid partial lines
                file.readline()
                line = file.readline()  # Read the next line (complete line)
                
                if line:  # Ensure it's not an empty line
                    try:
                        data.append(json.loads(line.strip()))
                        break  # Exit the loop for this line
                    except json.JSONDecodeError:
                        continue  # Skip if it's not valid JSON
    return data


In [61]:
data=load_random_data(path,lines) if datatype=='train' else load_data(path,lines)
print(data[:5])

[{'train.SRC': "i'd like a pizza with carrot ricotta and green olive with thin crust", 'train.EXR': '(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING CARROTS ) (TOPPING RICOTTA_CHEESE ) (TOPPING GREEN_OLIVES ) (STYLE THIN_CRUST ) ) )', 'train.TOP': "(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING carrot ) (TOPPING ricotta ) and (TOPPING green olive ) with (STYLE thin crust ) ) )", 'train.TOP-DECOUPLED': '(ORDER (PIZZAORDER (NUMBER a ) (TOPPING carrot ) (TOPPING ricotta ) (TOPPING green olive ) (STYLE thin crust ) ) )'}, {'train.SRC': "i'd like three pizzas no american cheese and one sprite and three san pellegrinos", 'train.EXR': '(ORDER (DRINKORDER (NUMBER 3 ) (DRINKTYPE PELLEGRINO_SPARKLING_WATER ) ) (DRINKORDER (NUMBER 1 ) (DRINKTYPE SPRITE ) ) (PIZZAORDER (NUMBER 3 ) (NOT (TOPPING AMERICAN_CHEESE ) ) ) )', 'train.TOP': "(ORDER i'd like (PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and (DRINKORDER (NUMBER one ) (DRINKTYPE sprite ) ) and (DRINKORDER (NU

In [62]:
#! split json data
def getextensions(datatype):
    if datatype=='train':
        return 'train.SRC','train.EXR','train.TOP','train.TOP-DECOUPLED'
    elif datatype=='dev':
        return 'dev.SRC','dev.EXR','dev.TOP'
    else:
        return 'test.SRC','test.EXR','test.TOP'
def get_training_data(data,datatype='train'):
    values=getextensions(datatype)
    training_data = []
    training_exr=[]
    training_top=[]
    training_top_dec=[]
    for item in data:
        training_data.append(item[values[0]])  
        training_exr.append(item[values[1]])  
        training_top.append(item[values[2]])
        if datatype=='train':
            training_top_dec.append(item[values[3]])  
    return training_data,training_exr,training_top,training_top_dec
training_data,training_exr,training_top,training_top_dec=get_training_data(data,datatype)   
print(training_data[:2])
print(training_exr[:2])
print(training_top[:2])
print(training_top_dec[:2])


["i'd like a pizza with carrot ricotta and green olive with thin crust", "i'd like three pizzas no american cheese and one sprite and three san pellegrinos"]
['(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING CARROTS ) (TOPPING RICOTTA_CHEESE ) (TOPPING GREEN_OLIVES ) (STYLE THIN_CRUST ) ) )', '(ORDER (DRINKORDER (NUMBER 3 ) (DRINKTYPE PELLEGRINO_SPARKLING_WATER ) ) (DRINKORDER (NUMBER 1 ) (DRINKTYPE SPRITE ) ) (PIZZAORDER (NUMBER 3 ) (NOT (TOPPING AMERICAN_CHEESE ) ) ) )']
["(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING carrot ) (TOPPING ricotta ) and (TOPPING green olive ) with (STYLE thin crust ) ) )", "(ORDER i'd like (PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and (DRINKORDER (NUMBER one ) (DRINKTYPE sprite ) ) and (DRINKORDER (NUMBER three ) (DRINKTYPE san pellegrinos ) ) )"]
['(ORDER (PIZZAORDER (NUMBER a ) (TOPPING carrot ) (TOPPING ricotta ) (TOPPING green olive ) (STYLE thin crust ) ) )', '(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPING ame

In [63]:
#! a function to handle negations
def handle_negations(text):
    #! negations are based on training set
    negations_pattern = r"\b(?:no|not|without)\s+.*?\b(?=(?:[^\w\s]|$))"
    # print(re.findall(negations_pattern, text))
    text = re.sub(negations_pattern, lambda x: ' '.join([f'not_{word}' for word in x.group(0).split()]), text)
    return text

In [64]:
#!# lemmatize words with all possible pos tags
def lem_word(word):
    possible_pos = [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]
    lemmatizer = WordNetLemmatizer()
    for pos in possible_pos:
        word=lemmatizer.lemmatize(word,pos)
    return word


In [65]:
lemmatizer = WordNetLemmatizer()
#! stopwords list (adding not_)
stopwords = set(stopwords.words('english'))
stopwords.add('like')
not_stopwords = ['not_' + word for word in stopwords]
stopwords.update(not_stopwords)
# stopwords.discard('a')
# stopwords.discard('an')
# stopwords.discard('not')
# stopwords.discard('no')
# stopwords.discard('can')
# stopwords.discard('not_a')
# stopwords.discard('not_an')
# stopwords.discard('not_can')
# stopwords.discard('not_no')
stopwords=set()
# stopwords.add('and')
# stopwords.add('also')

In [66]:
#! tokenize training data and remove stop words
def preprocess_training_data(training_data, stopwords):
    # training_data = [handle_negations(order) for order in training_data]
    training_data = [word_tokenize(order) for order in training_data]
    training_data = [[word.lower() for word in order if word.lower() not in stopwords] for order in training_data]
    training_data = [clean(order, no_line_breaks=True, no_punct=True, no_currency_symbols=True) for order in training_data]
    print(training_data)
    #! remove d letter most probably garbage
    training_data = [re.sub(r'\bd\s+', '', order) for order in training_data]
    #! remove "can" at the beginning of the sentence
    # training_data=[re.sub(r'^can\s+', '', order) for order in training_data]
    training_data = [word_tokenize(order) for order in training_data]
    # training_data = [[lemmatizer.lemmatize(word) for word in order] for order in training_data]
    training_data = [[lem_word(word) for word in order] for order in training_data]
    return training_data
training_data = preprocess_training_data(training_data, stopwords)
print(training_data[:5])

['i d like a pizza with carrot ricotta and green olive with thin crust', 'i d like three pizzas no american cheese and one sprite and three san pellegrinos', 'a sprite and five three liter coke zeros and three 200 ml waters', 'four seven ups and a medium ginger ale and five 12 fluid ounce 7 ups', 'i d like a pizza with caramelized red onion arugula and lettuce', 'four pizzas with balsamic glaze and i d like three party size pies with pecorino cheese and roasted red pepper', 'a large sprite and a ginger ale and two san pellegrinos', 'four ice tea bottles and a threeliter diet ice tea', 'a sprite and a dr pepper and one 500ml pellegrino', 'i d like ten pizza with american cheese and just one pizza with barbecue pulled pork', 'i d like eleven pizza with pea and six fantas', 'two pizzas with not many barbecue chicken', 'i need four pizzas with oregano and without any ricotta', 'a bottle of ice tea and five 500 milliliter coke zeros and two 16 ounce diet ice teas', 'one large sprite and a g

In [67]:
#! save processed training dataset
path = (
    'training_data_processed.txt' if datatype == 'train' else
    'dev_data_processed.txt' if datatype == 'dev' else
    'dev_data_processed.txt' if datatype == 'test' else
    'unknown_data_processed.txt'  # Optional fallback
)

with open(path, 'a') as f: #### dev_data_processed.txt
    for item in training_data:
        f.write("%s\n" % item)

In [68]:
#! a utility function for extra parentheses ) removal 
#! handles COMPLEX_TOPPING, NOT,... parenthesis cases
def remove_unmatched_parentheses(input_string):
    result = list(input_string)  # Convert to list for mutability
    last_bracket_index=-1
    for i, char in enumerate(result):
        if char == ')' and i+2 < len(result):
            result[i] = ''  
            last_bracket_index=i
        elif char == '(':
            if last_bracket_index!=-1:
                result[last_bracket_index] = ')'
                last_bracket_index=-1
        elif char == ')' and i+2 >= len(result):
            result[i] = ''
            result[last_bracket_index] = ')'
    return ''.join(result)

In [69]:
#! get PIZZAORDER, DRINKORDER, NONE Labels 0=>PIZZAORDER, 1=>DRINKORDER, 2=>NONE

def get_order_category_labels(training_top, training_data, stopwords):
    order_category_labels = []
    for i, item in enumerate(training_top):
        order_category_labels.append([2] * len(training_data[i]))
        unwanted_keywords = r"\b(ORDER|SIZE|STYLE|TOPPING|COMPLEX_TOPPING|QUANTITY|NOT|NUMBER|DRINKTYPE|CONTAINERTYPE|VOLUME)\b"
        cleaned_string = re.sub('\('+unwanted_keywords, "", item)
        cleaned_string = [word for word in cleaned_string.split() if word.lower() not in stopwords]
        cleaned_string = ' '.join(cleaned_string)
        cleaned_string = remove_unmatched_parentheses(cleaned_string)
        order_regex = r"\((?:PIZZAORDER|DRINKORDER).*?\)"
        extracted_orders = re.findall(order_regex, cleaned_string)
        k = 0
        for order in extracted_orders:
            order = re.sub(r"[\(\)]", "", order)
            order=word_tokenize(order) #! fix id and don't bugs
            order=clean(order, no_line_breaks=True, no_punct=True, no_currency_symbols=True)
            order=re.sub(r'\bd\s+', '', order) #! for d removal
            tokens = word_tokenize(order)
            # tokens = [lemmatizer.lemmatize(word) for word in tokens]
            tokens = [lem_word(word) for word in tokens]
            j = 0
            to_index_train=training_data[i][k:]
            if 'pizzaorder' in tokens:
                tokens.remove('pizzaorder')
                for word in to_index_train:
                    if j == len(tokens):
                        break
                    if word == tokens[j]:
                        order_category_labels[i][k] = 0
                        j += 1
                    k += 1
            elif 'drinkorder' in tokens:
                tokens.remove('drinkorder')
                for word in to_index_train:
                    if j == len(tokens):
                        break
                    if word == tokens[j]:
                        order_category_labels[i][k] = 1
                        j += 1
                    k += 1
    return order_category_labels

order_category_labels = get_order_category_labels(training_top, training_data, stopwords)
print(order_category_labels[:5])
    

[[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 0, 0, 0, 0, 0, 2, 1, 1, 2, 1, 1, 1], [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], [1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1], [2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [70]:
path = (
    'train_order_category_labels.txt' if datatype == 'train' else
    'dev_order_category_labels.txt' if datatype == 'dev' else
    'dev_order_category_labels.txt' if datatype == 'test' else
    'unknown_order_category_labels.txt'  # Optional fallback
)
for labels in order_category_labels:
    with open(path, 'a') as f: # dev
        f.write("%s\n" % labels)