In [None]:
import pandas as pd

#### Preparing menu items for modeling


In [None]:
def clean_items_in_list(l, menu_stop_words=None, lemmatize=True):
    """Normalize a list of menu item strings for text modeling.

    Each string item is reduced to lowercase ASCII without digits or
    punctuation, stripped of stop words, whitespace-normalized and
    (optionally) stemmed. Items that end up empty are dropped, as are
    non-string entries.

    Parameters
    ----------
    l : list or any
        List of menu items. Anything that is not a list is returned unchanged.
    menu_stop_words : iterable of str or str, optional
        Extra (domain-specific) stop words to drop in addition to
        ENGLISH_STOP_WORDS. A single string is treated as one word.
    lemmatize : bool, default True
        When True, every remaining token is stemmed with PorterStemmer
        (despite the parameter name, this is stemming, not lemmatization).

    Returns
    -------
    list of str, or the original value when `l` is not a list.
    """
    if not isinstance(l, list):
        return l

    # A set gives constant-time membership tests; callers may pass many
    # thousands of extra stop words, and the lookup runs once per token.
    stop_words = set(ENGLISH_STOP_WORDS)
    if isinstance(menu_stop_words, str):
        menu_stop_words = [menu_stop_words]
    if menu_stop_words is not None:
        stop_words.update(menu_stop_words)

    # Only build the stemmer when it is actually going to be used.
    stemmer = PorterStemmer() if lemmatize else None
    punctuation = set(string.punctuation)

    clean_l = []
    for item in l:
        if not isinstance(item, str):
            continue

        # Drop non-ASCII characters and digits first, then lowercase what is left.
        text = "".join(c for c in item if c.isascii() and not c.isdigit()).lower()

        # NOTE: stop words are matched on the raw whitespace token, i.e. before
        # punctuation is stripped, so a token such as "the," is not filtered.
        # For tokens containing periods only the part after the last "." is kept.
        tokens = [tok.split(".")[-1] for tok in text.split() if tok not in stop_words]

        # Strip punctuation, then re-split so tokens that became empty vanish
        # and whitespace is collapsed to single spaces.
        text = "".join(c for c in " ".join(tokens) if c not in punctuation)
        tokens = text.split()

        if stemmer is not None:
            tokens = [stemmer.stem(tok) for tok in tokens]

        clean_item = " ".join(tokens)
        if clean_item != "" and clean_item != " ":
            clean_l.append(clean_item)
    return clean_l

In [None]:
# Clean every menu's item list with the function defaults (no extra stop words,
# stemming enabled) and store the result in a new "Items Processed" column.
full_combo["Items Processed"] = full_combo["Items"].apply(clean_items_in_list)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
cuisine_labels = menu_rest_data["Cuisine - Least Frequent"]
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    cuisine_labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Seed the forest so bootstrap sampling and feature sub-sampling are reproducible,
# in line with the seeded train/test split.
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Optional aggregate metrics, scored against the held-out labels (y_test):
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')

In [None]:
# Per-class precision / recall / F1 on the held-out split, rendered as plain text.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
"https://www.nytimes.com/interactive/2017/12/12/dining/best-restaurant-nyc-pete-wells.html"

#### Bag of Words Analysis for Domain-Specific Stop Word Removal 

In [None]:
def make_item_corpus(df, item_column):
    """Flatten a column of per-menu item lists into one flat list of item strings.

    Rows whose value is not a list are skipped, and so are non-string entries
    inside a list (they cannot be tokenized by the word-count step here).
    As a side effect, prints the mean and median number of whitespace-separated
    words per item.

    Parameters
    ----------
    df : mapping-like (e.g. pandas.DataFrame)
        Object whose `df[item_column]` yields one value per menu.
    item_column : hashable
        Key/column holding the item lists.

    Returns
    -------
    list of str
        Every string item from every list-valued row, in original order.
    """
    item_corpus = [
        item
        for menu in df[item_column]
        if isinstance(menu, list)
        for item in menu
        if isinstance(item, str)
    ]
    corpus_lengths = [len(item.split()) for item in item_corpus]

    if corpus_lengths:
        print("average item length: ", np.mean(corpus_lengths))
        print("median item length: ", np.median(corpus_lengths))
    else:
        # Empty corpus: report NaN directly instead of triggering NumPy's
        # "mean of empty slice" RuntimeWarning.
        print("average item length: ", float("nan"))
        print("median item length: ", float("nan"))
    return item_corpus

In [None]:
# Build the corpus from the cleaned menu items: the word-frequency analysis that
# follows is about menu-item vocabulary, not about the cuisine labels.
item_corpus = make_item_corpus(full_combo, "Items Processed")

In [None]:
# Bag of words: every whitespace-separated token of every item, duplicates kept.
bow = [token for entry in item_corpus for token in entry.split()]

In [None]:
# Count occurrences of each token and tabulate them together with the token length.
bow_freq = pd.DataFrame(bow).value_counts().reset_index()
bow_freq.columns = ["word", "frequency"]
bow_freq["length"] = bow_freq["word"].map(len)

In [None]:
# Display the word-frequency table (columns: word, frequency, length).
bow_freq

In [None]:
# Most frequent 5% of distinct words, shortest first, to eyeball short high-frequency tokens.
percentiles_to_remove = 0.05
n_top = int(len(bow_freq) * percentiles_to_remove)
bow_freq.nlargest(n_top, "frequency").sort_values("length")
## Discussion: In this context, function words are helpful for determining the cuisine (unless they're in English). Thus, perhaps only single-character "words" should be removed, if that; the two-letter words may be crucial, as they look like function words (such as "al", "en"). It is unclear whether the single-letter words are typos or useful function words in languages other than English. The longer words also seem relevant - knowing if a place is serving "quesadilla", "cheeseburger" or "mediterranean" is very relevant.
## Words that will be removed are: (1) single-character words; (2) manually reviewed words from the top 1% by frequency that have no relevance to a specific cuisine (e.g. "special"). Words that may appear in an n-gram (e.g. "sauce") will be separated to test whether they should be removed or not.

In [None]:
percentiles_to_remove = 0.01

# High-frequency words that may still matter as part of an n-gram; kept in their own list.
may_be_relevant = [
    "salad", "fri", "sauc", "soup", "hot", "ice", "sandwich",
]

# Manually reviewed high-frequency words judged irrelevant to any specific cuisine.
manual_list = [
    "special", "lunch", "platter", "delux", "combo", "fresh",
    "tray", "breakfast", "oz", "lb", "ml", "cl",
    "style", "slice", "mix", "juice", "soda", "piec",
    "chop", "bottl", "dinner", "cater", "hous", "larg",
    "aa",
]

# Show the most frequent 1% of distinct words for manual review.
n_top = int(len(bow_freq) * percentiles_to_remove)
bow_freq.nlargest(n_top, "frequency")

In [None]:
# Least frequent 40% of distinct words, to inspect the rare tail of the vocabulary.
percentiles_to_remove = 0.4
n_bottom = int(len(bow_freq) * percentiles_to_remove)
bow_freq.nsmallest(n_bottom, "frequency")
# Discussion: words that appear only once carry no value, since they are essentially outliers.

In [None]:
# Domain stop words = single-character tokens + tokens seen fewer than 3 times
# + the two manually curated lists.
is_rare = bow_freq["frequency"] < 3
is_single_char = bow_freq["length"] == 1
infrequent_words_to_remove = bow_freq.loc[is_rare, "word"].tolist()
single_letter_words = bow_freq.loc[is_single_char, "word"].tolist()
words_to_remove = (
    single_letter_words
    + infrequent_words_to_remove
    + manual_list
    + may_be_relevant
)

In [None]:
def _drop_domain_stop_words(items):
    """Re-clean one menu's items with the domain stop words, without stemming."""
    return clean_items_in_list(items, words_to_remove, lemmatize=False)


menu_rest_data["Items Processing"] = menu_rest_data["Items Processing"].apply(_drop_domain_stop_words)

In [None]:
def _join_items(items):
    """Join a list of items into one space-separated string; pass other values through."""
    if isinstance(items, list):
        return " ".join(items)
    return items


# One text document per menu, with any commas stripped out.
items_as_text = menu_rest_data["Items Processing"].apply(_join_items)
menu_rest_data["Items Processing - String"] = items_as_text.str.replace(",", "")

In [None]:
# Display the frame to inspect the processed item columns.
menu_rest_data
# menu_rest_data["Items Processing - String"] 