In [1]:
# Prerequisite (run once in a shell): python -m spacy download en_core_web_sm

import spacy
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from spellchecker import SpellChecker

2023-01-17 14:17:18.930707: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-01-17 14:17:20.497373: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-01-17 14:17:20.497417: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-17 14:17:20.497452: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (instance-alexa): /proc/driver/nvidia/version does not exist


In [2]:
def stopword_removal(text):
    """Tokenize ``text`` and drop stop words, punctuation and whitespace.

    A token is kept when it is neither punctuation nor whitespace, is either
    not a spaCy stop word or looks like a number, and its exact text does not
    appear in ``stopwords.txt`` (one entry per line, read from the current
    working directory on every call).

    Relies on the module-level ``nlp`` spaCy pipeline, which must already be
    loaded when this function is called.

    Args:
        text: Raw query string.

    Returns:
        list[str]: lower-cased ``lemma_`` of each surviving token, in order.

    Raises:
        OSError: if ``stopwords.txt`` cannot be opened.
    """
    # The context manager closes the file even if reading raises.
    with open("stopwords.txt", "r") as stopword_file:
        # A set gives constant-time membership checks instead of scanning a
        # list once per token.
        custom_stopwords = set(stopword_file.read().split("\n"))

    # tokenization (components not needed here are disabled for this call)
    doc = nlp(text, disable=["tagger", "parser", "attribute_ruler", "tok2vec", ])

    # stop word removal
    tokens = []
    for t in doc:
        if (not t.is_stop or t.like_num) and not t.is_punct and not t.is_space:
            # The custom list is matched against the token's original,
            # case-sensitive text, not its lemma.
            if str(t) not in custom_stopwords:
                tokens.append(t.lemma_.lower())
    return tokens

def stemming(tokens):
    """Stem each token with NLTK's English Snowball stemmer.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: one stem per input token, in the original order.
    """
    english_stemmer = SnowballStemmer(language='english')
    stems = []
    for word in tokens:
        stems.append(english_stemmer.stem(word))
    return stems

def spell_check(tokens):
    """Replace tokens the spell checker does not know with its best guess.

    Tokens reported by ``SpellChecker.unknown`` are swapped for the result of
    ``SpellChecker.correction``; when that result is ``None`` the original
    token is kept. All other tokens pass through unchanged.

    Args:
        tokens: List of word strings.

    Returns:
        list[str]: a new list of the same length and order as ``tokens``.
    """
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    words = []
    for word in tokens:
        if word in misspelled:
            # Look the correction up once and reuse it; the original called
            # correction() twice for every misspelled token.
            suggestion = spell.correction(word)
            words.append(word if suggestion is None else suggestion)
        else:
            words.append(word)
    return words

def key_words(tokens):
    """Keep only the tokens tagged with one of the selected parts of speech.

    The tokens are joined with single spaces and run through the module-level
    ``nlp`` pipeline; a token's text is returned when its ``pos_`` is one of
    PROPN, ADJ, NOUN, VERB or NUM.

    Args:
        tokens: List of word strings.

    Returns:
        list[str]: texts of the tokens that passed the filter, in order.
    """
    kept_pos = ['PROPN', 'ADJ', 'NOUN', 'VERB', 'NUM']
    doc = nlp(" ".join(tokens))
    selected = []
    for tok in doc:
        if tok.pos_ in kept_pos:
            selected.append(tok.text)
    return selected

def preprocess_query(query):
    """Run a raw query through the full preprocessing pipeline.

    Steps, in order: stop-word removal, spell correction, part-of-speech
    keyword filtering, stemming. The resulting tokens are joined with single
    spaces, and the raw and parsed query are printed side by side.

    Args:
        query: Raw query string.

    Returns:
        str: the space-joined, preprocessed query.
    """
    remaining = stopword_removal(query)
    corrected = spell_check(remaining)
    keywords = key_words(corrected)
    parsed_query = " ".join(stemming(keywords))
    print(query, " - ", parsed_query)
    return parsed_query


# text = ""
# nlp = spacy.load('en_core_web_sm', disable=['ner'])
# preprocess_query(text)


In [3]:
query_types = ["cooking", "diy"]

# Load the spaCy pipeline once, before the loop: the model is identical for
# every query type, so reloading it per iteration only wastes time.
# The preprocessing functions read this module-level `nlp`.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

for q_type in query_types:
    # Read the raw queries for this domain.
    df = pd.read_csv(f"../queries/{q_type}_raw.csv")

    # Derive the preprocessed query for every raw query.
    df["target query"] = df["raw query"].apply(preprocess_query)

    # Prepend a sequential identifier column: query-0, query-1, ...
    ids = [f'query-{i}' for i in range(len(df))]
    df.insert(0, 'id', ids)

    path_to = f"../queries/{q_type}.csv"
    print(f"Saving {q_type} dataset to {path_to}")
    df.to_csv(path_to, index=False)




how to spatchcock a turkey  -  spatchcock turkey
I want an easy to make dessert for christmas  -  easi dessert christma
any recommendations for a gluten free appetizer  -  gluten free appet
how to make a green goddess salad  -  green goddess salad
how to make a classic english trifle  -  classic english trifl
I want a scary pumpkin pie  -  scari pumpkin pie
ideas for easy to carry tailgate food  -  easi carri tailgat food
I want to make halloween candy  -  halloween candi
how to make a gluten free peach crisp  -  gluten free peach crisp
healthy zucchini bread  -  healthi zucchini bread
recommend some asian street foods  -  asian street food
dinners by batali  -  dinner battl
traditional japanese soup  -  tradit japanes soup
how to make singaporean chicken lunch  -  singapor chicken lunch
a romantic dinner  -  romant dinner
mexican fiesta recipees  -  mexican fiesta recip
a modern icebox cake  -  modern icebox cake
food for diwali  -  food diwali
i want a fancy eggs recipe  -  fanci egg