In [23]:
import numpy as np
import pandas as pd
import re
import inflect
import json
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random
import ast
import nltk
# Fetch the NLTK resources this notebook relies on. The captured log below
# shows both packages were already up-to-date on the authoring machine.
nltk.download('punkt')  # presumably the tokenizer models behind nltk.word_tokenize — confirm for your NLTK version
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag — confirm for your NLTK version

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\omara\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [25]:
# Integer id for each part-of-speech tag. Ids are assigned 1..35 following the
# order of the tuple below, so the position of a tag here IS its encoding:
# do not reorder or insert in the middle without regenerating saved label files.
# Lookups elsewhere in this notebook use pos_to_number.get(tag, -1), so any tag
# that is not listed here is encoded as -1.
_POS_TAGS_IN_ID_ORDER = (
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ',
    'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS',
    'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
)
pos_to_number = {
    tag: tag_id for tag_id, tag in enumerate(_POS_TAGS_IN_ID_ORDER, start=1)
}

# Example: tokenize a sample sentence, POS-tag it, and encode each tag as a number.
# nltk is already imported in the notebook's import cell, so it is not
# re-imported here (imports belong in one place for a clean Restart & Run All).
sentence = " want learning natural language processing with Python fdsaf"
words = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(words)  # list of (word, tag) pairs

# Encode every tag exactly once; tags missing from pos_to_number become -1.
tagged_with_numbers = [(word, pos_to_number.get(tag, -1)) for word, tag in tags]
# Reuse the encoded pairs instead of repeating the dictionary lookup.
numbers_only = [number for _, number in tagged_with_numbers]

print(tags)
print(tagged_with_numbers)
print(numbers_only)


[('want', 'NN'), ('learning', 'VBG'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('with', 'IN'), ('Python', 'NNP'), ('fdsaf', 'NN')]
[('want', 12), ('learning', 28), ('natural', 7), ('language', 12), ('processing', 12), ('with', 6), ('Python', 14), ('fdsaf', 12)]
[12, 28, 7, 12, 12, 6, 14, 12]


In [26]:
def _read_literal_lines(path):
    """Parse every non-blank line of the text file at ``path`` as a Python literal."""
    with open(path, 'r') as f:
        # Blank lines (e.g. a trailing empty line) are skipped: passing an
        # empty string to ast.literal_eval raises SyntaxError.
        return [ast.literal_eval(line.strip()) for line in f if line.strip()]


def load_data_labels(data_path, labels_path):
    """Load a data file and its matching labels file.

    Each non-blank line of either file is parsed with ``ast.literal_eval``,
    so a line must hold one Python literal (in this notebook: a list of
    tokens per data line and a list of integer labels per label line).

    Args:
        data_path: Path of the text file holding one literal per line.
        labels_path: Path of the text file holding one literal per line.

    Returns:
        A ``(data, labels)`` tuple of lists, in file order.

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If a non-blank line is not a valid literal.
    """
    data = _read_literal_lines(data_path)
    labels = _read_literal_lines(labels_path)
    return data, labels

In [27]:
# Load the tokenized training orders and their order-category label sequences,
# then preview the first five entries of each list.
data, labels = load_data_labels(
    data_path='training_data_processed.txt',
    labels_path='train_order_category_labels.txt',
)
for preview in (data[:5], labels[:5]):
    print(preview)

[['can', 'i', 'have', 'one', 'large', 'bbq', 'pull', 'pork'], ['large', 'pie', 'with', 'green', 'pepper', 'and', 'with', 'extra', 'peperonni'], ['i', 'like', 'one', 'large', 'vegetarian', 'pizza'], ['party', 'size', 'stuff', 'crust', 'pie', 'with', 'american', 'cheese', 'and', 'with', 'mushroom'], ['can', 'i', 'have', 'one', 'personal', 'size', 'artichoke']]
[[2, 2, 2, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 0, 0, 0, 0]]


In [28]:
# Load the tokenized dev orders and their order-category label sequences,
# then preview the first five entries of each list.
dev_data, dev_labels = load_data_labels(
    data_path='dev_data_processed.txt',
    labels_path='dev_order_category_labels.txt',
)
for preview in (dev_data[:5], dev_labels[:5]):
    print(preview)

[['i', 'want', 'to', 'order', 'two', 'medium', 'pizza', 'with', 'sausage', 'and', 'black', 'olive', 'and', 'two', 'medium', 'pizza', 'with', 'pepperoni', 'and', 'extra', 'cheese', 'and', 'three', 'large', 'pizza', 'with', 'pepperoni', 'and', 'sausage'], ['five', 'medium', 'pizza', 'with', 'tomato', 'and', 'ham'], ['i', 'need', 'to', 'order', 'one', 'large', 'vegetarian', 'pizza', 'with', 'extra', 'banana', 'pepper'], ['i', 'like', 'to', 'order', 'one', 'large', 'onion', 'and', 'pepper', 'pizza'], ['i', 'll', 'have', 'one', 'pie', 'along', 'with', 'pesto', 'and', 'ham', 'but', 'avoid', 'olive']]
[[2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 0, 0, 0, 0, 0, 0], [2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [30]:
# POS-tag every training order and encode the tags as integers.
# A lone 'i' token is upper-cased before tagging (presumably so the tagger
# treats it as the pronoun "I" — the data is otherwise lower-case).
train_orders_for_tagging = [
    ['I' if word == 'i' else word for word in order]
    for order in data
]
# nltk.pos_tag_sents sets the tagger up once for the whole batch instead of
# once per order, which is what a per-order nltk.pos_tag call would do.
# Tags that are missing from pos_to_number are encoded as -1.
train_tag_labels = [
    [pos_to_number.get(tag, -1) for _, tag in tagged_order]
    for tagged_order in nltk.pos_tag_sents(train_orders_for_tagging)
]

In [33]:
# POS-tag every dev order and encode the tags as integers.
# A lone 'i' token is upper-cased before tagging (presumably so the tagger
# treats it as the pronoun "I" — the data is otherwise lower-case).
dev_orders_for_tagging = [
    ['I' if word == 'i' else word for word in order]
    for order in dev_data
]
# nltk.pos_tag_sents sets the tagger up once for the whole batch instead of
# once per order, which is what a per-order nltk.pos_tag call would do.
# Tags that are missing from pos_to_number are encoded as -1.
dev_tag_labels = [
    [pos_to_number.get(tag, -1) for _, tag in tagged_order]
    for tagged_order in nltk.pos_tag_sents(dev_orders_for_tagging)
]

In [35]:
# Persist the encoded POS tags, one order per line, written as the list's
# string form (e.g. "[18, 30, 12]") so each line can be parsed back as a literal.
with open('train_tag_labels.txt', 'w') as train_out:
    train_out.writelines(str(tag_ids) + "\n" for tag_ids in train_tag_labels)

with open('dev_tag_labels.txt', 'w') as dev_out:
    dev_out.writelines(str(tag_ids) + "\n" for tag_ids in dev_tag_labels)