In [4]:
# python -m spacy download en_core_web_sm

import spacy
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from spellchecker import SpellChecker

In [5]:
# Load the cooking queries; a later cell reads the "raw query" column from this frame.
df_cooking = pd.read_csv(filepath_or_buffer="queries/cooking.csv")

In [18]:
def stopword_removal(text):
    """Tokenize ``text`` with spaCy and return lowercased lemmas of the kept tokens.

    A token is dropped when spaCy flags it as a stop word, punctuation or
    whitespace, or when ``str(token)`` exactly matches (case-sensitively) a
    line of ``stopwords.txt``. That file is read from the current working
    directory on every call.

    Args:
        text: Raw query string to tokenize.

    Returns:
        list[str]: ``token.lemma_.lower()`` for each surviving token, in
        document order.

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened.
    """
    # The context manager closes the file even when reading fails.
    with open("stopwords.txt", "r") as stopword_file:
        # A set keeps the per-token membership test below constant-time.
        custom_stopwords = set(stopword_file.read().split("\n"))

    # The listed pipeline components are switched off for this call only.
    # NOTE(review): the tagger and attribute_ruler are disabled here while
    # ``lemma_`` is still read below - confirm the lemmatizer produces the
    # intended lemmas without part-of-speech information.
    doc = nlp(text, disable=["tagger", "parser", "attribute_ruler", "tok2vec"])

    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_space
        and str(token) not in custom_stopwords
    ]

def stemming(tokens):
    """Reduce every token to its English Snowball stem.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: One stem per input token, in the same order.
    """
    snowball = SnowballStemmer(language='english')
    stems = []
    for word in tokens:
        stems.append(snowball.stem(word))
    return stems

def spell_check(tokens):
    """Replace tokens the spell checker does not know with its suggested correction.

    Tokens that are not reported by ``SpellChecker.unknown`` are passed through
    untouched. For a reported token, the result of ``SpellChecker.correction``
    is used unless it is ``None``, in which case the original token is kept.

    Args:
        tokens: Iterable of word strings; it is traversed twice, so pass a
            list rather than a one-shot iterator.

    Returns:
        list[str]: One entry per input token, in the same order.
    """
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    # NOTE(review): membership is tested with each token exactly as given; if
    # ``unknown`` normalises case, mixed-case tokens would never match -
    # confirm callers always pass lowercased tokens.

    # Each distinct misspelled token is corrected once and remembered, so the
    # checker is not asked again for the same token or for repeats of it.
    corrections = {}
    words = []
    for word in tokens:
        if word not in misspelled:
            words.append(word)
            continue
        if word not in corrections:
            suggestion = spell.correction(word)
            # Fall back to the original token when no correction is offered.
            corrections[word] = word if suggestion is None else suggestion
        words.append(corrections[word])
    return words

def key_words(tokens):
    """Keep only the words whose spaCy part-of-speech tag is in a fixed allow-list.

    The tokens are joined with single spaces and parsed again by the
    module-level ``nlp`` pipeline, so the returned items are the texts of
    spaCy's tokens for that joined string.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: Text of every parsed token tagged ``PROPN``, ``ADJ``,
        ``NOUN`` or ``VERB``, or whose ``pos_`` is the empty string.
    """
    doc = nlp(" ".join(tokens))
    # The empty string lets through tokens that carry no coarse POS tag.
    kept_pos = ('PROPN', 'ADJ', 'NOUN', 'VERB', '')
    keywords = []
    for parsed in doc:
        if parsed.pos_ in kept_pos:
            keywords.append(parsed.text)
    return keywords

def preprocess_query(query):
    """Run a raw query through the whole normalisation pipeline.

    The stages run in this order: stop-word removal (which also lemmatizes
    and lowercases), spelling correction, stemming, and part-of-speech
    keyword filtering.

    Args:
        query: Raw query string.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    cleaned = stopword_removal(query)
    corrected = spell_check(cleaned)
    stemmed = stemming(corrected)
    keywords = key_words(stemmed)
    return " ".join(keywords)

In [19]:
# Load the small English pipeline once, with the NER component disabled; the
# preprocessing functions look this name up as a global when they are called.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Store the normalised form of every raw query in a new column.
df_cooking["parsed query"] = df_cooking["raw query"].apply(preprocess_query)