In [747]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec

import re

In [912]:
def _read_indexed_csv(path):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Review corpora: one row per review, and consolidated per restaurant
text_df = _read_indexed_csv('./dataset/clean_text.csv')
pos_text_df = _read_indexed_csv('./dataset/positive_text.csv')
grouped_text_df = _read_indexed_csv('./dataset/grouped_text.csv')
grouped_pos_text_df = _read_indexed_csv('./dataset/grouped_positive_text.csv')

# Menu corpora
menu_desc_df = _read_indexed_csv('./dataset/menu_desc_df.csv')
menu_title_df = _read_indexed_csv('./dataset/menu_title_df.csv')

In [1085]:
# Utilize helper functions
# Executes helper_function.ipynb inside this kernel. Presumably this is what provides
# create_doc_tuples, find_similar_docs, search, expand_contractions and tokenize, which
# later cells call without defining them — TODO confirm.
%run helper_function.ipynb

## Introduction

In this notebook, I create three Doc2Vec models that represent words and documents as numerical vectors. The models are built and trained on Yelp reviews/tips, menu descriptions, and menu titles, respectively. The idea behind building three Doc2Vec models is to consolidate their search results: each model returns the documents most similar to the user's search query, using cosine similarity and Doc2Vec's built-in most_similar method. The top 10 documents are then chosen in one of the following ways:

1. When a document appears in all three results, it is added to the final result. For instance, when a restaurant named 'Panda Express' appears in all three Doc2Vec results, Panda Express is shown as part of the final result visible to the user.
    - When these common restaurants number fewer than 10, the remaining slots are filled based on cosine similarity score.
2. When no common documents are found across the three Doc2Vec results, all three results are consolidated and shown in descending order of cosine similarity score.

Below is an overview of the NLP search engine's control flow:

<img src="imgs/search_overview_img.png" alt="control_flow" style="width: 50%;"/>

### Resolving NaNs on menu dataframes

In [1002]:
# Fall back to the restaurant name wherever the cleaned menu title is missing.
# The mask is computed on menu_title_df itself (the original referenced `menu_df`,
# which no visible cell defines), so the cell works on a fresh kernel.
missing_title = menu_title_df['clean_menu_titles'].isna()
menu_title_df.loc[missing_title, 'clean_menu_titles'] = menu_title_df.loc[missing_title, 'name']

In [1004]:
# Drop menu-description rows that contain missing values, modifying menu_desc_df in place.
menu_desc_df.dropna(inplace=True)

## Datasets
Datasets to be used in building doc2vec and TF-IDF models.

### Menu description
The menu description dataset contains the menu text that describes each food and drink item.

In [996]:
# Display the menu-description frame.
menu_desc_df

Unnamed: 0,name,clean_menu_desc,popularity_score
0,cabo fish taco,pipe hot melted cheese blend chipotle pepper s...,92.00
1,cabo fish taco,a house blend avocado lime juice cilantro sign...,92.00
2,cabo fish taco,beer batter calamari ring flash fry serve roas...,92.00
3,cabo fish taco,creamy dip lump crabmeat seasoned shrimp serve...,92.00
4,cabo fish taco,roasted red tomato salsa fresh pineapple mango...,92.00
...,...,...,...
313948,fuji,chicken,83.61
313949,fuji,filet mignon,83.61
313950,fuji,shrimp scallops,83.61
313951,fuji,steak chicken,83.61


### Menu title
The menu title dataset contains the food and drink items each restaurant serves.

In [4]:
# Preview the first rows of the menu-title frame.
menu_title_df.head()

Unnamed: 0,name,clean_menu_titles,popularity_score
0,cabo fish taco,queso dip,92.0
1,cabo fish taco,guacamole,92.0
2,cabo fish taco,crispy calamari,92.0
3,cabo fish taco,baja shrimp crab dip,92.0
4,cabo fish taco,salsa trio,92.0


### Grouped positive review texts
The grouped positive review text dataset contains the positive review texts consolidated per restaurant.

In [5]:
# Preview the first rows of the per-restaurant positive review frame.
grouped_pos_text_df.head()

Unnamed: 0,name,clean_text,popularity_score
0,#1 pho,fantastic pho vegetable pho vegetable broth fr...,78.0
1,24th street pizza & gyros,gyro pizza pretty fantastic thing keep come de...,80.0
2,3 tomatoes & a mozzarella,absolutely delicious customer go today single ...,76.19
3,4b cafe,good consensus average home fairly easily emul...,100.0
4,5 r cha thai go,yum yum yum try dynamite fried rice amazing fi...,83.0


### Grouped review texts
The grouped review text dataset contains the positive and negative review texts consolidated per restaurant.

In [7]:
# Preview the first rows of the per-restaurant review frame.
grouped_text_df.head()

Unnamed: 0,name,clean_text,popularity_score
0,#1 pho,fantastic pho vegetable pho vegetable broth fr...,78.0
1,24th street pizza & gyros,change ownership stop day advertise pizza slic...,80.0
2,3 tomatoes & a mozzarella,absolutely delicious customer go today single ...,72.52
3,4b cafe,good consensus average home fairly easily emul...,100.0
4,5 r cha thai go,try r cha today turn permanently close very sa...,83.0


### Positive review texts
The positive review text dataset contains individual positive review texts, so the same restaurant can appear in multiple rows.

In [8]:
# Preview the first rows of the positive review frame.
pos_text_df.head()

Unnamed: 0,name,clean_text,popularity_score
1,pink taco,holy heck chicken taco far favorite great cust...,87.0
2,vito's pizza,be chicago style deep dish homemade type crust...,91.0
3,lao laan-xang restaurant,easily favorite madison great lao thai curry f...,95.0
4,rosati's pizza,more pizza this location small cozy have large...,71.0
5,bouchon,bouchon favorite hand extremly fresh eat bouch...,93.0


### Review texts
The review text dataset contains individual positive and negative review texts, so the same restaurant can appear in multiple rows.

In [11]:
# Preview the first rows of the full review frame.
text_df.head()

Unnamed: 0,name,clean_text,popularity_score
0,pink taco,p.m. super bowl sunday close weak wonder hard ...,87.0
1,pink taco,holy heck chicken taco far favorite great cust...,87.0
2,vito's pizza,be chicago style deep dish homemade type crust...,91.0
3,lao laan-xang restaurant,easily favorite madison great lao thai curry f...,95.0
4,rosati's pizza,more pizza this location small cozy have large...,71.0


### Attributes
The attributes dataset contains restaurant attribute information such as cuisine and other restaurant-related attributes.

In [12]:
# Preview the first rows of the restaurant attribute frame.
# NOTE(review): no visible cell assigns attributes_df; presumably it was loaded in an
# earlier session or by the helper notebook — confirm, or load it with the other frames.
attributes_df.head()

Unnamed: 0,name,chinese,caribbean,bakery,salads,wraps,spanish,sushi,californian,mediterranean,...,noodles,turkish,cocktails,irish,bar,south,crepes,french,british,creole
0,#1 pho,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24th street pizza & gyros,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3 tomatoes & a mozzarella,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4b cafe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5 r cha thai go,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Creating Tagged Documents using namedtuples for each dataset
A tagged document is a single document made up of words (a list of unicode string tokens) and tags (a list of tokens). It pairs a document with a tag, which Doc2Vec requires as input. Namedtuples are used so that each document also carries the restaurant name and popularity score for the TF-IDF search model.

#### Gather all text and menu information

In [1006]:
# Coerce the text column of every corpus to the string datatype
# NOTE(review): astype(str) would turn any remaining missing value into the literal
# text 'nan' — confirm the text frames contain no NaNs at this point.
for frame, column in (
    (text_df, 'clean_text'),
    (pos_text_df, 'clean_text'),
    (grouped_text_df, 'clean_text'),
    (grouped_pos_text_df, 'clean_text'),
    (menu_desc_df, 'clean_menu_desc'),
    (menu_title_df, 'clean_menu_titles'),
):
    frame[column] = frame[column].astype(str)

In [1007]:
# Create Tagged documents with additional information such as restaurant name and popularity score.
# All six corpora are materialised: later cells reference pos_text_docs, grouped_text_docs
# and grouped_pos_text_docs (sample output and model training), so leaving them
# commented out makes those cells fail with a NameError on a fresh kernel.
text_docs = create_doc_tuples(text_df)
pos_text_docs = create_doc_tuples(pos_text_df)
grouped_text_docs = create_doc_tuples(grouped_text_df)
grouped_pos_text_docs = create_doc_tuples(grouped_pos_text_df)
menu_desc_docs = create_doc_tuples(menu_desc_df, 'menu_desc')
menu_title_docs = create_doc_tuples(menu_title_df, 'menu_title')

In [100]:
# Sample output: the first tagged document of the grouped review corpus
grouped_text_docs[:1]

[Document(name='#1 pho', words=['fantastic', 'pho', 'vegetable', 'pho', 'vegetable', 'broth', 'fresh', 'fabulous', 'do', 'miss', 'vegetable', 'spring', 'roll', 'come', 'yesterday', 'afternoon', 'pho', 'as', 'walk', 'strong', 'scent', 'incense', 'fan', 'however', 'wall', 'separate', 'dining', 'room', 'area', 'create', 'problem', 'seat', 'look', 'owner', 'super', 'friendly', 'order', 'typical', 'pho', 'rare', 'beef', 'fantastic', 'definitely', 'pho', 'restaurant', 'alot', 'bubble', 'tea', 'menu', 'want', 'way', 'the', 'downside', 'water', 'green', 'inside', 'fish', 'tank', 'unfortunately', 'stare', 'entire', 'eat', 'regardless', 'town', 'the', 'beef', 'noodle', 'pho', 'adequate', 'srichacha', 'available', 'supplement', 'thimbleful', 'provide', 'request', 'the', 'fresh', 'roll', 'contain', 'turkey', 'shrimp', 'interesting', 'combo', 'work', 'the', 'taro', 'boba', 'arrive', 'parfait', 'glass', 'look', 'beautiful', 'break', 'tooth', 'bubble', 'hardened', 'tapioca', 'taro', 'sweet', 'need', 

## Build Doc2Vec Models

In [1008]:
# Menu title and description
menu_desc_model = Doc2Vec(dm=1, # PV-DM (distributed memory) training algorithm
                vector_size=100, # 100 vector dimensions (`size` is the deprecated name, removed in gensim 4)
                window=2, # maximum distance between the predicted word and context words
                hs=0, # no hierarchical softmax; negative sampling is used instead
                dbow_words=1, # trains word vectors in skip-gram fashion alongside DBOW doc vectors
                sample=1e-5, # threshold for randomly downsampling high-frequency words like 'the'
                workers=4) # Worker threads

menu_title_model = Doc2Vec(dm=1, vector_size=100, window=2, hs=0, dbow_words=1, sample=1e-5, workers=4)

In [1009]:
# Window size is increased to 10 because review texts have longer sentences than the menu text values.
# `vector_size` replaces the deprecated `size` keyword, which gensim 4 no longer accepts.
text_model = Doc2Vec(dm=1, vector_size=100, window=10, hs=0, dbow_words=1, sample=1e-5, workers=4)
pos_text_model = Doc2Vec(dm=1, vector_size=100, window=10, hs=0, dbow_words=1, sample=1e-5, workers=4)

In [106]:
# Window size is increased to 25 because grouped text values contain the 'combined' review texts per restaurant.
# `vector_size` replaces the deprecated `size` keyword, which gensim 4 no longer accepts.
grouped_text_model = Doc2Vec(dm=1, vector_size=100, window=25, hs=0, dbow_words=1, sample=1e-5, workers=4)
grouped_pos_text_model = Doc2Vec(dm=1, vector_size=100, window=25, hs=0, dbow_words=1, sample=1e-5, workers=4)

### Build Vocabulary

In [1010]:
# Menu title and description
menu_desc_model.build_vocab(menu_desc_docs)
menu_title_model.build_vocab(menu_title_docs)

# Review text
text_model.build_vocab(text_docs)
# NOTE(review): the three calls below are disabled, yet later cells train pos_text_model,
# grouped_text_model and grouped_pos_text_model. Enable them (each needs its matching
# *_docs list to exist) before running those training cells.
# pos_text_model.build_vocab(pos_text_docs)
# grouped_text_model.build_vocab(grouped_text_docs)
# grouped_pos_text_model.build_vocab(grouped_pos_text_docs)

### Train Doc2Vec Models

In [1011]:
# Train the menu-description model for 100 epochs. start_alpha and end_alpha are both
# 0.01, so the learning rate is held constant instead of decaying.
menu_desc_model.train(menu_desc_docs, 
                      total_examples=menu_desc_model.corpus_count, 
                      epochs=100, start_alpha=0.01, end_alpha=0.01)

In [1012]:
# Persist the trained menu-description model under ./dataset.
menu_desc_model.save('./dataset/menu_desc_model')

In [1015]:
# Train the menu-title model for 100 epochs. start_alpha and end_alpha are both 0.01,
# so the learning rate is held constant instead of decaying.
menu_title_model.train(menu_title_docs, 
                      total_examples=menu_title_model.corpus_count, 
                      epochs=100, start_alpha=0.01, end_alpha=0.01)

In [1016]:
# Persist the trained menu-title model under ./dataset.
menu_title_model.save('./dataset/menu_title_model')

In [1017]:
# Train the all-reviews model for 100 epochs. start_alpha and end_alpha are both 0.01,
# so the learning rate is held constant instead of decaying.
text_model.train(text_docs, 
                      total_examples=text_model.corpus_count, 
                      epochs=100, start_alpha=0.01, end_alpha=0.01)

In [1018]:
# Persist the trained all-reviews model under ./dataset.
text_model.save('./dataset/text_model')

In [285]:
# Train the positive-review model for 100 epochs at a constant learning rate of 0.01.
# total_examples must be this model's own corpus_count (the size of the corpus its
# vocabulary was built from), not text_model's, which describes a different corpus.
pos_text_model.train(pos_text_docs,
                     total_examples=pos_text_model.corpus_count,
                     epochs=100, start_alpha=0.01, end_alpha=0.01)

In [286]:
# Persist the trained positive-review model under ./dataset.
pos_text_model.save('./dataset/pos_text_model')

In [287]:
# Train the grouped-review model for 100 epochs. start_alpha and end_alpha are both
# 0.01, so the learning rate is held constant instead of decaying.
grouped_text_model.train(grouped_text_docs, 
                      total_examples=grouped_text_model.corpus_count, 
                      epochs=100, start_alpha=0.01, end_alpha=0.01)

In [288]:
# Persist the trained grouped-review model under ./dataset.
grouped_text_model.save('./dataset/grouped_text_model')

In [289]:
# Train the grouped positive-review model for 100 epochs. start_alpha and end_alpha are
# both 0.01, so the learning rate is held constant instead of decaying.
grouped_pos_text_model.train(grouped_pos_text_docs, 
                      total_examples=grouped_pos_text_model.corpus_count, 
                      epochs=100, start_alpha=0.01, end_alpha=0.01)

In [290]:
# Persist the trained grouped positive-review model under ./dataset.
grouped_pos_text_model.save('./dataset/grouped_pos_text_model')

### Evaluating word embeddings and inserting vectors into their respective rows

#### Menu title

In [1019]:
# Sanity check: words the menu-title model considers most similar to 'pasta'.
menu_title_model.wv.most_similar_cosmul(positive=['pasta'])

[('sharing', 0.9621393084526062),
 ('primavera', 0.9609645009040833),
 ('meatball', 0.9516516327857971),
 ('bucatini', 0.9490647912025452),
 ('mother', 0.9486719965934753),
 ('spaghetti', 0.9466715455055237),
 ('pesto', 0.9466609358787537),
 ('duo', 0.9392339587211609),
 ('romano', 0.9383900761604309),
 ('bolognese', 0.9362183213233948)]

In [1061]:
# Seed the model's random state so that inference gives the same results on every run
menu_title_model.random.seed(42)
# Split each cleaned title into tokens, infer one vector per row, and store it positionally
title_tokens = menu_title_df['clean_menu_titles'].str.split()
menu_title_df['vector'] = title_tokens.apply(menu_title_model.infer_vector).values

#### Menu description

In [1020]:
# Sanity check: words the menu-description model considers most similar to 'salmon'.
menu_desc_model.wv.most_similar_cosmul(positive=['salmon'])

[('tuna', 0.8421205878257751),
 ('yellowtail', 0.8276487588882446),
 ('sushi', 0.8268221020698547),
 ('tobiko', 0.8193127512931824),
 ('ikura', 0.8125702142715454),
 ('eel', 0.8111112713813782),
 ('sashimi', 0.8053328394889832),
 ('masago', 0.8043803572654724),
 ('collar', 0.8023239970207214),
 ('whitefish', 0.8009525537490845)]

In [1060]:
# Seed the model's random state so that inference gives the same results on every run
menu_desc_model.random.seed(42)
# Split each cleaned description into tokens, infer one vector per row, and store it positionally
desc_tokens = menu_desc_df['clean_menu_desc'].str.split()
menu_desc_df['vector'] = desc_tokens.apply(menu_desc_model.infer_vector).values

#### All review texts

In [1021]:
# Sanity check: words the all-reviews model considers most similar to 'pho'.
test = text_model.wv.most_similar_cosmul(positive=['pho'])
test

[('vietnamese', 0.8965592384338379),
 ('vermicelli', 0.8624076843261719),
 ('biet', 0.8565489053726196),
 ('viet', 0.8530400395393372),
 ('saigon', 0.8467001914978027),
 ('dac', 0.8427303433418274),
 ('banh', 0.8308336734771729),
 ('phos', 0.8118297457695007),
 ('bahn', 0.8098064064979553),
 ('thanh', 0.8051256537437439)]

In [1059]:
# Seed the model's random state so that inference gives the same results on every run
text_model.random.seed(42)
# Split each cleaned review into tokens, infer one vector per row, and store it positionally
review_tokens = text_df['clean_text'].str.split()
text_df['vector'] = review_tokens.apply(text_model.infer_vector).values

#### Positive text

In [291]:
# Sanity check: words the positive-review model considers most similar to 'indian'.
pos_text_model.wv.most_similar_cosmul(positive=['indian'])

[('india', 0.9340745806694031),
 ('dhaba', 0.8638657927513123),
 ('tikka', 0.8469441533088684),
 ('biryani', 0.8339383006095886),
 ('taj', 0.8307921886444092),
 ('korma', 0.830226719379425),
 ('nepalese', 0.8180621266365051),
 ('delhi', 0.8172544836997986),
 ('masala', 0.8163518309593201),
 ('darbar', 0.8157666325569153)]

#### Grouped text values

In [292]:
# Sanity check: words the grouped-review model considers most similar to 'sandwiches'.
grouped_text_model.wv.most_similar_cosmul(positive=['sandwiches'])

[('baguette', 0.7447547912597656),
 ('sandwich', 0.7340155839920044),
 ('lee', 0.7246848940849304),
 ('banh', 0.716722846031189),
 ('lees', 0.708283543586731),
 ('sourdough', 0.7040324211120605),
 ('sugarcane', 0.7029849290847778),
 ('durian', 0.7017142176628113),
 ('viet', 0.6917997598648071),
 ('mi', 0.6911283135414124)]

#### Grouped positive text values

In [293]:
# Sanity check: words the grouped positive-review model considers most similar to 'bbq'.
grouped_pos_text_model.wv.most_similar_cosmul(positive=['bbq'])

[('pork', 0.7782501578330994),
 ('barbeque', 0.7593139410018921),
 ('barbecue', 0.7270077466964722),
 ('riblet', 0.7267343997955322),
 ('ribs', 0.7232769727706909),
 ('kbbq', 0.7131699919700623),
 ('shilla', 0.7110636830329895),
 ('korean', 0.7088062763214111),
 ('personel', 0.7043653130531311),
 ('rib', 0.7026166915893555)]

## TEST

In [1124]:
# Look up the query 'kimchi bulgogi' against the review-text model and its frame.
result_text_df = find_similar_docs('kimchi bulgogi', text_model, text_df, 'clean_text')

In [1125]:
# Display the review-text matches.
result_text_df

Unnamed: 0,name,text,popularity_score,similarity_score
0,honey pig,korean,82.0,0.8697
1,green pepper,korean pittsburgh,90.0,0.865493
2,good fella korean bistro,favorite korean,96.0,0.84514
3,pei wei,spicy korean regular,71.0,0.810071
4,manna korean bbq,favorite find far vegas korean comfort bulgogi,98.0,0.806214
5,korea house,delicious menu dukboki kalbee bibim bap sure soju,90.0,0.766114
6,korea garden,cold noodle pricey stuff half korean,84.0,0.739822
7,shilla korean bbq,grill kimchi trust,88.0,0.693191
8,paris baguette,trays wax paper tong right inside,89.0,0.685871
9,yoshinoya,quick suki yaki type beef teriyaki chicken ext...,80.0,0.664226


In [1126]:
# Look up the same query against the menu-description model and its frame.
result_menu_desc_df = find_similar_docs('kimchi bulgogi', menu_desc_model, menu_desc_df, 'clean_menu_desc')

In [1127]:
# Display the menu-description matches.
result_menu_desc_df

Unnamed: 0,name,text,popularity_score,similarity_score
0,sushi rock,beef bulgogi,81.98,0.952456
1,manna korean bbq,bulgogi dakbulgogi salmon,98.0,0.951221
2,one world pizza,gochujan bulgogi kimchi green onion,92.0,0.94632
3,ginza sushi,kimchi bulgogi fried rice,86.0,0.945349
4,yen sushi & sake bar,bulgogi rice bowl lunch,93.0,0.914262
5,sushi 101,beef bulgogi bowl,86.51,0.912178
6,caffe bene,bulgogi beef little gimbap,89.0,0.893038
7,fuji japanese restaurant,chicken yakitori,81.25,0.879037
8,sichuan palace,sa chi pork,83.0,0.874797
9,hunan park,ta chin chicken,90.0,0.873308


In [1128]:
# Look up the same query against the menu-title model and its frame.
result_menu_title_df = find_similar_docs('kimchi bulgogi', menu_title_model, menu_title_df, 'clean_menu_titles')

In [1129]:
# Display the menu-title matches.
result_menu_title_df

Unnamed: 0,name,text,popularity_score,similarity_score
0,manna korean bbq,bulgogi jeongol,98.0,0.98953
1,green pepper,bulgogi,90.0,0.982826
2,island style,bulgogi,100.0,0.978813
3,korea garden,bulgogi,84.0,0.977044
4,shilla korean bbq,bulgogi,88.0,0.972034
5,yoshinoya,kimchi,79.61,0.968897
6,korea house,kimchi jigae,90.0,0.963814
7,good fella korean bistro,kimchi,94.7,0.95418
8,ginza sushi,spicy pork bulgogi hot stone bibimbap,86.0,0.949773
9,oishii bento,bulgogi bowl,95.0,0.936525


### Finalized Search Method
- Search method includes the following methods:
    - **find_similar_docs method**: Finds similar documents based on vectors.
    - **get_similar_list method**: Finds and gathers documents that have specific keywords associated with the query.
    - **expand_contractions method**: Expands contractions.
    - **Tokenize method**: Text preprocessing.

In [1185]:
# Utilize helper functions
# Executes helper_function.ipynb again inside this kernel, presumably so the search
# helpers used below reflect the latest definitions — TODO confirm.
%run helper_function.ipynb

In [1189]:
# Pair each Doc2Vec model with its dataframe under a shared corpus key
models = dict(text=text_model, desc=menu_desc_model, title=menu_title_model)
dfs = dict(text=text_df, desc=menu_desc_df, title=menu_title_df)

# Run the consolidated search for a sample query and display its result
final_result_df = search('Indian Chicken Makhani', models, dfs)
final_result_df

Unnamed: 0,name,text,popularity_score,similarity_score
0,taste of india,chicken makhani,87.15,0.955585
1,bombay indian grill,shrimp makhani,90.35,0.953731
2,bombay grill,chicken makhani,85.54,0.949027
3,passage to india,vegetable makhani,88.14,0.944064
4,bawarchi indian cuisine,chicken makhani,74.03,0.943634
5,india palace,mushroom mattar makhani,89.71,0.943534
6,india palace,chicken makhani plus appear check offer,89.0,0.938943
7,dhaba indian bistro,choice indian fact indian highly recommend chi...,95.0,0.884755
8,passage to india,chicken saag,87.0,0.804523
9,star of india,year enjoy finally think write review especial...,92.0,0.79438


## TF-IDF Model

In [234]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import similarities 

# Build the token dictionary from the words of every review document.
dct = Dictionary(doc.words for doc in text_docs)  # fit dictionary
# One bag-of-words vector per document: a list of (token_id, count) pairs.
corpus = [dct.doc2bow(doc.words) for doc in text_docs]  # convert dataset to BoW format
# NOTE(review): model_tfidf is fitted here but none of the visible cells below apply it;
# the similarity cells compare raw bag-of-words vectors — confirm whether that is intended.
model_tfidf = TfidfModel(corpus)  # fit model

In [235]:
# Sample output
index = 0
# Rebuild the document text from its own tokens rather than looking it up in
# `text_data`, a name that no visible cell defines (fails on a fresh kernel).
print("Document ",index, " : ", ' '.join(text_docs[index].words))
print("Bag of words representation of document ", index, " : ", corpus[index])

Document  0  :  p.m. super bowl sunday close weak wonder hard rock die
Bag of words representation of document  0  :  [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


## Similarity matrix between a list of key words

In [282]:
# Build a similarity index that contains only the search query

# Query preprocessing, applied inside-out:
#   expand_contractions -> keeps the vocab uniform with the w2v vocab
#   tokenize            -> lowercases, lemmatizes, and removes stop words
#   re.sub              -> removes extra whitespace
raw_query = "korean bulgogi food"
query = re.sub(' +', ' ', tokenize(expand_contractions(raw_query)))

tokens = query.split()
query_bow = dct.doc2bow(tokens)
index = similarities.MatrixSimilarity([query_bow], num_features=len(dct))

In [283]:
# Score every review document against the query.
# `index` holds a single document (the query), so each lookup returns a length-1 array.
# Element 0 is taken explicitly instead of relying on NumPy's implicit conversion of a
# size-1 array to a scalar, which is deprecated since NumPy 1.25.
similarity = np.zeros(len(text_docs))  # one score per document, default value 0
for i, doc in enumerate(text_docs):
    similarity[i] = index[dct.doc2bow(doc.words)][0]

topn = np.argsort(-similarity)[:10] # Get the 10 indices that have the highest values

In [284]:
# Collect the top-ranked documents in rank order
top_docs = [text_docs[i] for i in topn]

# One column per reported field; scores are rounded to two decimals
data = {
    'name': [top_doc.name for top_doc in top_docs],
    'text': [' '.join(top_doc.words) for top_doc in top_docs],
    'popularity_score': [top_doc.popularity_score for top_doc in top_docs],
    'similarity_score': [round(similarity[i], 2) for i in topn],
}
tfidf_result_df = pd.DataFrame(data)

tfidf_result_df

Unnamed: 0,name,text,popularity_score,similarity_score
0,good fella korean bistro,this cute feel traditional korean bibimbap tas...,96.0,0.55
1,oishii bento,spicy pork bulgogi bulgogi way,95.0,0.53
2,oishii bento,use go pitt want affordable korean korean rest...,95.0,0.52
3,korea garden,favorite korean pittsburgh delicious decent ko...,84.0,0.51
4,honey pig,all korean,82.0,0.5
5,manna korean bbq,favorite find far vegas amazing korean comfort...,98.0,0.5
6,honey pig,all korean,82.0,0.5
7,sakana,korean sushi,93.0,0.5
8,good fella korean bistro,for small las vegas surprised honestly review ...,96.0,0.49
9,good fella korean bistro,very korean the kimchi pancake beef bulgogi su...,96.0,0.49
