## Dependencies

In [1]:
!pip install gensim




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import tensorflow as tf
# List the GPUs visible to TensorFlow; in this notebook the result is only referenced by the
# commented-out memory-growth call below.
physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# Print the installed TensorFlow version.
print(tf.__version__)


2.18.0


In [3]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Masking, Bidirectional, Multiply, Lambda, Concatenate
from tensorflow.keras import Model
import gensim as gs
import ast
import numpy as np
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import re
import copy
from tensorflow.keras.callbacks import ReduceLROnPlateau
from Spelling_correction import find_closest_match

In [4]:
def load_data_labels(data_path, labels_path):
    """Load two files that each hold one Python literal per line.

    Args:
        data_path: Path of the file with the data literals.
        labels_path: Path of the file with the label literals.

    Returns:
        A ``(data, labels)`` tuple of lists, one parsed literal per non-blank line.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    def _read_literals(path):
        # Whitespace-only lines (typically a trailing newline at the end of the
        # file) are skipped: ast.literal_eval('') would raise a SyntaxError.
        with open(path, 'r') as f:
            stripped_lines = (line.strip() for line in f)
            return [ast.literal_eval(text) for text in stripped_lines if text]

    data = _read_literals(data_path)
    labels = _read_literals(labels_path)
    return data, labels


def load_data(file_path):
    """Read ``file_path`` and return its lines with surrounding whitespace removed.

    Blank lines are kept as empty strings, so the result has one entry per line.
    """
    with open(file_path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [5]:
# Training sentences and their labels, read from the order-details files.
data, labels = load_data_labels('train_data_order_details.txt', 'train_labels_order_details.txt')
# Only the labels of the order-category files are kept (as Xmodel1); the data half is discarded.
# NOTE(review): 'trian_data_order_category.txt' looks like a typo of 'train_...' — confirm the file name on disk.
_,Xmodel1=load_data_labels('trian_data_order_category.txt','train_labels_order_category.txt')

In [6]:
# dev_data, dev_labels = load_data_labels('dev_data_processed.txt', 'dev_order_category_labels.txt')
# Dev sentences and their labels, read from the order-details files.
dev_data, dev_labels = load_data_labels('dev_data_order_details.txt', 'dev_labels_order_details.txt')
# Only the second file's contents are kept (as devXmodel1); they are later fed to the model as the category input.
_,devXmodel1=load_data_labels('dev_data_order_category.txt','dev_model1.txt')
# Sanity check: number of dev category sequences and a sample of the first five.
print(len(devXmodel1))
print(devXmodel1[:5])

847
[[2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0], [2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [7]:
# Load word2vec-format binary vectors from a local file as gensim KeyedVectors.
pretrained_model = gs.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
#! replace words that are unknown to the model with the 'unk' token
def process_sentence(sentence, model):
    """Overwrite, in place, every word of ``sentence`` that is not in ``model``.

    Args:
        sentence: Mutable sequence of words; it is modified and also returned.
        model: Any container supporting ``word in model``.

    Returns:
        The same ``sentence`` object, with unknown words replaced by ``'unk'``.
    """
    for position in range(len(sentence)):
        if sentence[position] in model:
            continue
        sentence[position] = 'unk'
    return sentence

# Replace words unknown to the pretrained vectors with 'unk' (each sentence list is edited in place).
data = [process_sentence(sentence, pretrained_model) for sentence in data]
print(data[:5])

# Build the vocabulary as a SORTED list instead of a bare set. Iteration order of a set of
# strings changes between interpreter runs (string hash randomization), so indices derived
# from a set would not be reproducible across kernel restarts and would not line up with a
# model that was saved in an earlier session. A sorted list still supports len(), `in`
# (linear scan) and enumerate(), which is how `vocab` is used by the rest of the notebook.
vocab = sorted({word for sentence in data for word in sentence})
#! get word index for each word in vocab
word2idx = {word: idx for idx, word in enumerate(vocab)}

[['i', 'like', 'one', 'pizza', 'with', 'red', 'onion', 'fry', 'onion', 'unk', 'mozarella', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'anchovy', 'caramelize', 'red', 'onion', 'unk', 'roast', 'green', 'pepper', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'applewood', 'bacon', 'grill', 'pineapple', 'unk', 'shrimp', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'pesto', 'sauce', 'roast', 'pepper', 'unk', 'unk', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'unk', 'spicy', 'red', 'sauce', 'unk', 'mushroom', 'without', 'thin', 'crust']]


In [9]:
# Width of each token embedding vector (passed as output_dim of the token Embedding layer).
embedding_dim=300
# Number of distinct tokens in the training vocabulary.
input_dim=len(vocab)
# Number of label classes predicted per token (label_mapping defines ids 0-10).
output_dim=11
# maxlen handed to pad_sequences for every token / category / label sequence.
max_length=100

In [10]:
# Snapshot the dev tokens before out-of-vocabulary words are overwritten below.
dev_data_copy = copy.deepcopy(dev_data)

# Overwrite, in place, every dev token that is absent from the training vocabulary.
for sentence in dev_data:
    for position in range(len(sentence)):
        if sentence[position] not in vocab:
            sentence[position] = 'unk'

# Token indices, post-padded with -1.
X_d = pad_sequences(
    [[word2idx[token] for token in sentence] for sentence in dev_data],
    maxlen=max_length,
    padding='post',
    value=-1,
)
# Per-token category ids, post-padded with 2 ("NEITHER").
X_categories_d = pad_sequences(
    [list(sentence_categories) for sentence_categories in devXmodel1],
    maxlen=max_length,
    padding='post',
    value=2,
)
# Gold labels, post-padded with 0.
Y_d = pad_sequences(dev_labels, maxlen=max_length, padding='post', value=0)

In [11]:
#! build the embedding matrix
def get_embeddings_matrix(model, vocab):
    """Stack the vector of every vocabulary word into one matrix.

    Args:
        model: Object exposing ``vector_size``, ``word in model`` and ``model[word]``.
        vocab: Sized iterable of words; row ``i`` belongs to the ``i``-th word yielded.

    Returns:
        A ``(len(vocab), model.vector_size)`` array; words missing from ``model``
        keep an all-zero row.
    """
    matrix = np.zeros((len(vocab), model.vector_size))
    for row, token in enumerate(vocab):
        if token not in model:
            continue
        matrix[row] = model[token]
    return matrix
# Rows follow the iteration order of `vocab`, the same enumeration that word2idx is built from.
embedding_matrix = get_embeddings_matrix(pretrained_model, vocab)

In [12]:
#! map every training token to its vocabulary index, then pad all sequences to max_length
X = pad_sequences(
    [[word2idx.get(token, word2idx['unk']) for token in sentence] for sentence in data],
    maxlen=max_length,
    padding='post',
    value=-1,
)
# Per-token category ids, post-padded with 2 ("NEITHER").
X_categories = pad_sequences(
    [list(sentence_categories) for sentence_categories in Xmodel1],
    maxlen=max_length,
    padding='post',
    value=2,
)
# Gold labels, post-padded with 0.
Y = pad_sequences(labels, maxlen=max_length, padding='post', value=0)

In [13]:


# Register the custom function
@tf.keras.utils.register_keras_serializable()
def create_category_mask(categories):
    """Look up a 0/1 row of length 11 for every category id in ``categories``.

    Row 0 (PIZZA) has ones at label ids 0-7, row 1 (DRINK) has ones at ids 0-2
    and 8-10, and row 2 (NEITHER) is all zeros. Ids equal to -1 are remapped
    to 2 before the lookup.

    NOTE(review): the model multiplies the logits by this mask before the
    softmax, so a masked class ends up with logit 0 rather than probability 0 —
    confirm that this is the intended form of masking.
    """
    category_to_mask = tf.constant([
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],  # PIZZA
        [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1],  # DRINK
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]   # NEITHER
    ], dtype=tf.float32)

    # Treat -1 like category 2 (NEITHER) so the gather below never sees a negative index.
    categories = tf.where(categories == -1, 2, categories)
    # Replace each category id with its row of the table above.
    mask = tf.gather(category_to_mask, categories)
    return mask

# Register the output shape function
@tf.keras.utils.register_keras_serializable()
def create_category_mask_output_shape(input_shape):
    """Return the Lambda layer's output shape: the first two input dims plus the global ``output_dim``."""
    return (input_shape[0], input_shape[1], output_dim)

# Define token input and category input
input_tokens = Input(shape=(max_length,), dtype='int32', name='tokens')
input_categories = Input(shape=(max_length,), dtype='int32', name='categories')

# -1 is the value the token sequences were padded with.
# NOTE(review): confirm how Masking followed by Embedding treats the -1 padding ids in this Keras version.
x = Masking(mask_value=-1)(input_tokens)
# Token embedding initialised from the pretrained embedding_matrix and left trainable.
x = Embedding(input_dim=len(vocab), output_dim=embedding_dim, 
              weights=[embedding_matrix], trainable=True)(x)

# Learned 8-dimensional embedding of the 3 category ids.
category_embedding = Embedding(input_dim=3, output_dim=8, trainable=True)(input_categories)

# Join token and category embeddings along the feature axis.
x = Concatenate(axis=-1)([x, category_embedding])

# Bidirectional LSTM (64 units per direction) that returns one output per position.
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = Dropout(0.6)(x)

# Per-position class scores; no activation here because the softmax is applied after masking.
logits = Dense(output_dim, activation=None)(x)

# Multiply the logits element-wise by the 0/1 mask looked up from the category ids.
mask = Lambda(create_category_mask, output_shape=create_category_mask_output_shape)(input_categories)
masked_logits = Multiply()([logits, mask])

# Softmax over the class axis.
output = tf.keras.activations.softmax(masked_logits, axis=-1)

model = Model(inputs=[input_tokens, input_categories], outputs=output)

# Integer class targets, hence the sparse categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Multiply the learning rate by 0.5 when val_loss has not improved for 5 epochs (floor 1e-9).
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-9,
    verbose=1
)
model.fit(
    [X, X_categories],  # Input data: tokens and categories
    Y,                  # Target labels
    validation_data=([X_d, X_categories_d], Y_d),  # Validation data
    epochs=25,  # Number of epochs
    batch_size=512,  # Batch size
    callbacks=[lr_scheduler]  # Learning rate scheduler
)


Epoch 1/25




[1m 10/100[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m44s[0m 494ms/step - accuracy: 0.9209 - loss: 2.3705

KeyboardInterrupt: 

In [25]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [14]:
# model.save('Order_details_model.h5')
# Load a previously saved model from disk; it is the model used for the dev-set predictions.
# NOTE(review): the commented-out save above targets a '.h5' file while this loads a '.keras'
# file — confirm which artifact is current and that it matches this notebook's word2idx.
loaded_model = tf.keras.models.load_model('Order_details_model.keras')



## Evaluate model on training data

In [17]:
# Predict with the in-notebook `model` (not `loaded_model`) on the padded training inputs.
preds_train = model.predict([X, X_categories])  
preds_train = np.argmax(preds_train, axis=-1)  # Get the class with the highest probability


[1m1594/1594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step


In [18]:
# Sentence-level accuracy: a training sentence counts as correct only when every
# non-padding position (token id != -1) is predicted correctly.
count = sum(
    1
    for row in range(len(X))
    if np.all(preds_train[row][X[row] != -1] == Y[row][X[row] != -1])
)
print(f"Accuracy on training data: {count/len(X)}")

Accuracy on training data: 0.9836274509803922


## Evaluate model on dev data

In [15]:
# Predict with the model loaded from disk and keep the most probable class per position.
preds_dev = loaded_model.predict([X_d,X_categories_d])
preds_dev = np.argmax(preds_dev, axis=-1)

count = 0                   # sentences predicted entirely correctly
last_index_error_count = 0  # sentences whose ONLY wrong position is the last one

for i in range(len(dev_data)):
    original_length = len(dev_data[i])

    # Drop the padded tail so that only real token positions are compared.
    pred_seq = preds_dev[i][:original_length]
    true_seq = dev_labels[i]

    mismatches = pred_seq != true_seq
    if not mismatches.any():
        # Check if the sequence is entirely correct
        count += 1
    elif mismatches.sum() == 1 and mismatches[-1]:
        # This counter was previously printed without ever being updated.
        last_index_error_count += 1
print(f"Accuracy on dev_data: {count / len(dev_data):.4f}")
print(f"Sequences with only last index error: {last_index_error_count}")
print(preds_dev[:3])
# Trim every prediction to its sentence length and convert the rows to plain lists.
preds_dev = [seq[:len(dev_data[i])] for i, seq in enumerate(preds_dev)]
preds_dev = [list(seq) for seq in preds_dev]
print(preds_dev[:3])


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
Accuracy on dev_data: 0.0000
Sequences with only last index error: 0
[[0 0 0 0 0 0 0 2 0 0 6 6 0 0 0 0 2 0 0 0 6 0 0 6 0 2 1 0 6 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [4 0 0 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 6 6 6 0 2 0 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 6, 6, 0, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 6, 0, 2, 1, 0, 6], [4, 0, 0, 2, 0, 0, 6], [0, 0, 0, 0, 6, 6, 6, 0, 2, 0, 2, 4]]


In [29]:
test_data,test_labels_order_category=load_data_labels('test_data_order_category.txt','test_out_vocab_fixed.txt')
# Show a small sample only; printing the full test set floods the cell output.
print(test_data[:5])
print(test_labels_order_category[:5])
test_data_copy = copy.deepcopy(test_data)  # Deep copy of test_data, taken before 'unk' replacement
# Overwrite, in place, every test token that is absent from the training vocabulary.
for tokens in test_data:
    for i,word in enumerate(tokens):
        if word not in vocab:
            tokens[i] = 'unk'
# Token indices, post-padded with -1.
X_test=[[word2idx[word] for word in sentence] for sentence in test_data]
X_test=pad_sequences(X_test, maxlen=max_length, padding='post', value=-1)
# Per-token category ids, post-padded with 2.
X_categories_test = [list(sentence_categories) for sentence_categories in test_labels_order_category]
X_categories_test = pad_sequences(X_categories_test, maxlen=max_length, padding='post', value=2)

[['let', 'me', 'prefer', 'one', 'extra', 'large', 'saute', 'spinach', 'and', 'tomato', 'pizza', 'without', 'any', 'sausage'], ['could', 'you', 'give', 'me', 'one', 'exta', 'large', 'pizza', 'with', 'roast', 'red', 'pepper', 'and', 'more', 'cheese', 'but', 'no', 'sausage'], ['can', 'you', 'handle', 'this', 'order', 'one', 'pizza', 'make', 'it', 'etra', 'large', 'please', 'i', 'want', 'new', 'york', 'style', 'tomato', 'sauce', 'but', 'absolutely', 'no', 'onion', 'that', 's', 'important', 'that', 's', 'all'], ['let', 'me', 'try', 'five', 'lnch', 'pizza', 'with', 'garlic', 'onion', 'and', 'pesto'], ['i', 'need', 'you', 'to', 'order', 'me', 'one', 'pie', 'in', 'med', 'size', 'along', 'with', 'pesto', 'pan', 'and', 'ham'], ['i', 'ry', 'one', 'largesize', 'pie', 'with', 'gilled', 'chicken', 'and', 'please', 'add', 'some', 'one', 'tiny', 'bit', 'of', 'cheese', 'and', 'some', 'ham', 'thank'], ['let', 's', 'put', 'i', 'for', 'one', 'med', 'peto', 'sauce', 'and', 'ham', 'pizza', 'with', 'one', 'l

In [30]:
# NOTE(review): this predicts with the in-notebook `model`, whereas the dev evaluation uses
# `loaded_model` — confirm which model is meant to produce the test predictions.
pred_test = model.predict([X_test, X_categories_test]) 
pred_test = np.argmax(pred_test, axis=-1)  # Get the class with the highest probability
# print(pred_test[:5])
pred_test = [seq[:len(test_data[i])] for i, seq in enumerate(pred_test)]  # Trim each sequence
# Convert each trimmed row to a plain Python list.
pred_test = [list(seq) for seq in pred_test] 

# print(pred_test[:5])


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [23]:

def combine_tokens(tokens, labels):
    """Merge runs of adjacent tokens that share the same label.

    Consecutive tokens carrying an identical label are joined with ``_`` into a
    single token, except for labels 0 and 4: tokens with those labels are
    always kept as separate entries.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of integer labels, aligned with ``tokens``.

    Returns:
        ``(combined_tokens, combined_labels)`` as two aligned lists.
    """
    merged_tokens = []
    merged_labels = []
    run_open = False  # True while the previous token belongs to a mergeable run

    for position in range(len(tokens)):
        token = tokens[position]
        label = labels[position]
        mergeable = not (label == 0 or label == 4)

        if mergeable and run_open and merged_labels[-1] == label:
            # Same label as the run in progress: glue onto the last merged token.
            merged_tokens[-1] += f"_{token}"
        else:
            merged_tokens.append(token)
            merged_labels.append(label)
        run_open = mergeable

    return merged_tokens, merged_labels

In [24]:
label_mapping = {
    0: "NONE",
    1: "NUMBER",
    2: "SIZE",
    3: "STYLE",
    4: "TOPPING",
    5: "QUANTITY",
    6: "NOT_TOPPING",
    7: "NOT_STYLE",
    8: "DRINKTYPE",
    9: "CONTAINERTYPE",
    10: "VOLUME",
}

# Labels tracked once per suborder, mapped to the flag that records their presence.
_SUBORDER_FLAG_BY_LABEL = {
    1: "has_number",
    2: "has_size",
    3: "has_style",
    8: "has_drinktype",
    9: "has_containertype",
    10: "has_volume",
}


def _new_suborder():
    """Return an empty suborder record with every ``has_*`` flag cleared."""
    suborder = {"tokens": [], "labels": []}
    for flag in ("has_number", "has_size", "has_style",
                 "has_drinktype", "has_volume", "has_containertype"):
        suborder[flag] = False
    return suborder


def _split_into_suborders(tokens, labels):
    """Cut the labelled token stream into suborder records."""
    suborders = []
    current = _new_suborder()

    for token, label in zip(tokens, labels):
        flag = _SUBORDER_FLAG_BY_LABEL.get(label)
        # A NUMBER always opens a new suborder; SIZE / STYLE / DRINKTYPE / VOLUME /
        # CONTAINERTYPE open one only when the current suborder already has that field.
        starts_new = label == 1 or (flag is not None and current[flag])
        if starts_new and current["tokens"]:
            suborders.append(current)
            current = _new_suborder()

        current["tokens"].append(token)
        current["labels"].append(label)
        if flag is not None:
            current[flag] = True

    # Append the last suborder
    if current["tokens"]:
        suborders.append(current)
    return suborders


def _render_suborder_details(tokens, labels):
    """Render the labelled tokens of one suborder as a list of EXR fragments."""
    details = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        label = labels[i]
        if label == 5:  # Quantity
            if i != len(tokens) - 1:
                # The token right after a quantity is emitted as its topping,
                # whatever label it carries, and is then skipped.
                details.append(
                    f"(COMPLEX_TOPPING (QUANTITY {token.upper()}) (TOPPING {tokens[i + 1].upper()}))"
                )
                i += 1
            else:
                # A trailing quantity has nothing to attach to: emit it as a plain topping.
                details.append(f"({label_mapping[4]} {token.upper()})")
        elif label in (6, 7):  # NOT cases (NOT_TOPPING, NOT_STYLE)
            new_label = label_mapping[label][4:]  # Remove "NOT_" from the label string
            details.append(f"(NOT ({new_label} {token.upper()}))")
        elif label != 0:
            details.append(f"({label_mapping[label]} {token.upper()})")
        i += 1
    return details


def Get_EXR_Format(tokens, labels):
    """Build the parenthesised ``(ORDER ...)`` string for one labelled sentence.

    Steps:
      1. Split the token stream into suborders (see ``_split_into_suborders``).
      2. Prepend a default ``one`` / NUMBER to every suborder without a NUMBER.
      3. Tag each suborder as DRINKORDER (any label in 8-10) or PIZZAORDER (any
         label in 3-7); suborders with neither kind of label are dropped.

    Args:
        tokens: Token strings (typically the output of ``combine_tokens``).
        labels: Integer label ids aligned with ``tokens`` (see ``label_mapping``).

    Returns:
        A string such as ``"(ORDER (PIZZAORDER (NUMBER ONE) (TOPPING HAM)))"``;
        ``"(ORDER )"`` when no suborder qualifies.
    """
    suborders = _split_into_suborders(tokens, labels)

    # Step 2: Add default Number if no Number found
    for suborder in suborders:
        if not suborder["has_number"]:
            suborder["tokens"].insert(0, "one")
            suborder["labels"].insert(0, 1)  # Number label

    # Step 3: Classify suborders
    order_details = []
    for suborder in suborders:
        sub_labels = suborder["labels"]
        if any(label in (8, 9, 10) for label in sub_labels):
            order_type = "DRINKORDER"
        elif any(label in (3, 4, 5, 6, 7) for label in sub_labels):
            order_type = "PIZZAORDER"
        else:
            continue  # Skip invalid suborders

        suborder_details = _render_suborder_details(suborder["tokens"], sub_labels)
        order_details.append(f"({order_type} {' '.join(suborder_details)})")

    return f"(ORDER {' '.join(order_details)})"

In [25]:
def parse_to_dict(s):
    """Parse a parenthesised expression such as ``(ORDER (PIZZAORDER (NUMBER ONE)))`` into nested dicts.

    A group ``(KEY WORD)`` becomes ``{KEY: WORD}``; a group ``(KEY (..) (..))``
    becomes ``{KEY: {...}}`` built from its child groups. Child keys
    DRINKORDER, PIZZAORDER, TOPPING, NOT, COMPLEX_TOPPING and STYLE are always
    stored as lists so that repeated groups accumulate.

    The input is tokenised with ``\\w+``, so any character that is neither a
    word character nor a parenthesis acts as a separator.

    NOTE(review): a repeated child key outside the list-valued set above (for
    example two NUMBER groups) would reach ``.append`` on a non-list value —
    confirm that such input cannot occur.
    """
    def parse(tokens):
        # Consumes tokens from the front of the shared list as it recurses.
        token = tokens.pop(0)
        if token == '(':
            key = tokens.pop(0)
            nested = {}
            while len(tokens) > 0 and tokens[0] != ')':
                if tokens[0] == '(':
                    # Child group: parse it recursively and merge its single entry.
                    temp_dict=parse(tokens)
                    for k,v in temp_dict.items():
                        if k in nested:
                            nested[k].append(v)
                        elif k == 'DRINKORDER' or k == 'PIZZAORDER' or k =='TOPPING' or k =='NOT' or k =='COMPLEX_TOPPING' or k == 'STYLE':
                            nested[k]=[v]
                        else:
                            nested[k] = v
                else:
                    # Bare word: if it is immediately closed, this group is a leaf "(KEY WORD)".
                    sub_key = tokens.pop(0)
                    if tokens[0] == ')':
                        tokens.pop(0)
                        return {key: sub_key}
            # Drop the ')' that closes this group, if one is left.
            if len(tokens) > 0:
                tokens.pop(0)  
            return {key: nested}
        elif token == ')':
            return {}

    # Tokenize the string
    tokens = re.findall(r'\(|\)|\w+', s)
    return parse(tokens)

In [26]:
def process_toppings(order_json):
    """Flatten the topping and style information of every pizza in an order.

    For each pizza the TOPPING, COMPLEX_TOPPING, NOT and STYLE entries are
    replaced by two lists:

    * ``AllTopping``: ``{"NOT", "Quantity", "Topping"}`` dicts — plain toppings
      first, then excluded toppings, then complex (quantified) toppings.
    * ``STYLE``: ``{"NOT", "TYPE"}`` dicts — requested styles first, then
      excluded styles.

    Drink orders are copied through unchanged, exactly once. (The copy used to
    sit inside the pizza loop, which dropped all drinks from drink-only orders
    and duplicated them when an order held several pizzas.)

    Args:
        order_json: JSON string with a top-level ``"ORDER"`` object.

    Returns:
        A JSON string (indent=4) with ``ORDER.PIZZAORDER`` and ``ORDER.DRINKORDER`` lists.

    Raises:
        KeyError: if the decoded JSON has no ``"ORDER"`` key.
    """
    order = json.loads(order_json)
    order_body = order["ORDER"]
    processed_order = {"ORDER": {"PIZZAORDER": [], "DRINKORDER": []}}

    for pizza in order_body.get("PIZZAORDER", []):
        # Pop the raw entries so that only the remaining keys (NUMBER, SIZE, ...)
        # are carried over into the rebuilt pizza below.
        toppings = pizza.pop("TOPPING", [])
        complex_toppings = pizza.pop("COMPLEX_TOPPING", [])
        styles = pizza.pop("STYLE", [])
        exclusions = pizza.pop("NOT", [])

        excluded_toppings = []
        excluded_styles = []
        for exclusion in exclusions:
            excluded_toppings.extend(exclusion.get("TOPPING", []))
            if exclusion.get("STYLE"):
                # Only the first style of an exclusion group is used.
                excluded_styles.append(exclusion["STYLE"][0])

        all_toppings = [
            {"NOT": False, "Quantity": None, "Topping": topping}
            for topping in toppings
        ]
        all_toppings.extend(
            {"NOT": True, "Quantity": None, "Topping": topping}
            for topping in excluded_toppings
        )
        for complex_topping in complex_toppings:
            complex_quantity = complex_topping.get("QUANTITY", None)
            for topping in complex_topping.get("TOPPING", []):
                all_toppings.append({
                    "NOT": False,
                    "Quantity": complex_quantity,
                    "Topping": topping
                })

        all_styles = [{"NOT": False, "TYPE": style} for style in styles]
        all_styles.extend({"NOT": True, "TYPE": style} for style in excluded_styles)

        # Add the pizza order with updated AllTopping
        new_pizza_order = {**pizza, "AllTopping": all_toppings, "STYLE": all_styles}
        processed_order["ORDER"]["PIZZAORDER"].append(new_pizza_order)

    # Copy the drinks once, independently of how many pizzas the order contains.
    processed_order["ORDER"]["DRINKORDER"].extend(order_body.get("DRINKORDER", []))
    return json.dumps(processed_order,indent=4)

In [27]:
def process_order(order_json):
    """Project every pizza and drink of an order onto a fixed set of keys.

    Pizzas keep NUMBER, SIZE, STYLE and ALLTOPPING (taken from the input's
    ``AllTopping`` list, minus any entry that also appears in ``NOT.TOPPING``).
    Drinks keep NUMBER, SIZE, DRINKTYPE and CONTAINERTYPE. Keys that are absent
    from the input are emitted as ``null``; all other input keys are dropped.

    Args:
        order_json: JSON string with a top-level ``"ORDER"`` object.

    Returns:
        The projected order as a JSON string (indent=4).
    """
    order_body = json.loads(order_json)["ORDER"]

    pizzas_out = []
    for pizza in order_body.get("PIZZAORDER", []):
        banned = pizza.get("NOT", {}).get("TOPPING", [])
        kept_toppings = [
            entry for entry in pizza.get("AllTopping", []) if entry not in banned
        ]
        pizzas_out.append({
            "NUMBER": pizza.get("NUMBER", None),
            "SIZE": pizza.get("SIZE", None),
            "STYLE": pizza.get("STYLE", None),
            "ALLTOPPING": kept_toppings,
        })

    drinks_out = [
        {
            "NUMBER": drink.get("NUMBER", None),
            "SIZE": drink.get("SIZE", None),
            "DRINKTYPE": drink.get("DRINKTYPE", None),
            "CONTAINERTYPE": drink.get("CONTAINERTYPE", None),
        }
        for drink in order_body.get("DRINKORDER", [])
    ]

    result = {"ORDER": {"PIZZAORDER": pizzas_out, "DRINKORDER": drinks_out}}
    return json.dumps(result, indent=4)

In [28]:
def process_json(data):
    """Recursively lowercase every string value and turn its underscores into spaces.

    Dictionaries and lists are rebuilt with their values/items processed
    recursively; dictionary keys are left untouched. Any other value (numbers,
    ``None``, ...) is returned as-is.
    """
    if isinstance(data, str):
        return data.lower().replace('_', ' ')
    if isinstance(data, list):
        return [process_json(element) for element in data]
    if isinstance(data, dict):
        converted = {}
        for name, value in data.items():
            converted[name] = process_json(value)
        return converted
    return data

In [32]:
import csv
import reformat_results
import os
folder_name = "json_output_files"
# exist_ok=True replaces the separate exists() check and cannot fail if the folder appears in between.
os.makedirs(folder_name, exist_ok=True)
# Reload the original (un-'unk'-ed) tokens from disk.
dev_data_copy=load_data_labels('dev_data_order_details.txt','dev_labels_order_details.txt')[0]
# NOTE(review): pred_test was produced from 'test_data_order_category.txt', while the tokens used
# below come from 'test_data_order_category2.txt' — confirm both files are aligned line by line
# and token by token.
test_data_copy=load_data_labels('test_data_order_category2.txt','test_data_order_category2.txt')[0]
EXRs = []
output_text_file = "EXRs_output.txt"
with open("top_format_results.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Write header
    writer.writerow(["id", "output"])
    # Process each sequence
    for i, sequence in enumerate(pred_test):
        # labels -> merged tokens -> EXR string -> nested dict -> normalised JSON -> TOP format
        combined_tokens, combined_labels = combine_tokens(test_data_copy[i], sequence)
        EXR_Format = Get_EXR_Format(combined_tokens, combined_labels)
        EXRs.append(EXR_Format)
        result = parse_to_dict(EXR_Format)
        json_string = json.dumps(result)
        processed_toppings = process_toppings(json_string)
        Json_Format=  process_order(processed_toppings)
        final_Json=process_json(json.loads(Json_Format))
        Top_Format = reformat_results.parse_tree(final_Json)
        # Top_Format_withnumbers=words_to_numbers(Top_Format)
        # Write to CSV
        writer.writerow([i, Top_Format])

# Dump every intermediate EXR string for inspection.
with open(output_text_file, mode="w", encoding="utf-8") as text_file:
    for i, exr in enumerate(EXRs):
        text_file.write(f"EXR {i}:\n{exr}\n\n")

print(f"Results saved to top_format_results.csv and {output_text_file}")


Results saved to top_format_results.csv and EXRs_output.txt


In [31]:
# import os
# import json
# import csv
# import reformat_results

# def extract_row(file_path, row_index):
#     with open(file_path, 'r') as file:
#         lines = file.readlines()

#     Extract the rows starting with "(ORDER"
#     order_rows = [line.strip() for line in lines if line.strip().startswith("(ORDER")]
    
#     if row_index < len(order_rows):
#         return order_rows[row_index]
#     else:
#         return None  # Return None if the index is out of range

# Define file path and output directory
# file_path = 'EXRs_output.txt'
# output_folder = 'output_json'
# output_csv_file = 'top_format_results.csv'

# Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# Open the CSV file for writing
# with open(output_csv_file, mode="w", newline="", encoding="utf-8") as csv_file:
#     writer = csv.writer(csv_file)

#     Write the CSV header
#     writer.writerow(["id", "output"])

#     Process rows and save results
#     for i in range(847):
#         row = extract_row(file_path, i)
#         if row is None:
#             break  # Exit the loop if the row is out of range
        
#         try:
#             Process the row into the desired format
#             result = parse_to_dict(row)  # Convert to dictionary
#             json_string = json.dumps(result, indent=4)  # Convert dictionary to JSON
#             processed_toppings = process_toppings(json_string)  # Process toppings
#             json_format = process_order(processed_toppings)  # Final JSON format
#             final_json = process_json(json.loads(json_format))
#             top_format = reformat_results.parse_tree(final_json)
#             writer.writerow([i, top_format])
#             print(f"Processed and saved row {i}")

#         except Exception as e:
#             print(f"Error processing row {i}: {e}")
