In [13]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.similarities import MatrixSimilarity
from gensim.models import CoherenceModel, TfidfModel, LdaMulticore, Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

import random 
import re

# Ignore warning messages
import warnings
warnings.filterwarnings('ignore')

In [8]:
# spaCy supplies the lemmatizer pipeline and the base English stop-word set
import spacy
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Additional stop words, merged in place into spaCy's stop-word set
custom_stop_words = ['good', 'great', 'love', 'eat', 'try', 'amazing', 'come', 'food', 'place', 'order', 
                     'service', 'time', 'definitely', 'outstanding', 'restaurant', 'like', 'get', 'nice', 
                     'go', 'excellent', 'serve', 'sauce', 'bad', 'price']

stop_words.update(custom_stop_words)

In [5]:
# Maps English contractions to their expanded form.
# Used as the default mapping of expand_contractions().
CONTRACTION_DICT = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",  # was "he he will have" (duplicated word)
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [43]:
def create_tfidf_matrix_df(df, vocab=None):
    '''
    Creates a tf-idf matrix dataframe from the 'item_list' text column.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with an 'item_list' text column (vectorized) and a
        'cuisine' column (used as the row index of the result)
    vocab (optional): list
        Fixed vocabulary passed to CountVectorizer. When omitted, the
        vocabulary is learned from unigrams, bigrams and trigrams.
    Returns
    -------
    dtm_tfidf_df : pandas.DataFrame
        tf-idf dataframe indexed by df['cuisine'], one column per term
    '''
    
    # `is not None` (instead of `!= None`) also works for array-like vocabularies,
    # where `!=` would compare element-wise.
    if vocab is not None:
        # NOTE: scikit-learn documents min_df/max_df as ignored when a fixed
        # vocabulary is supplied; they are kept for parity with the original call.
        vectorizer = CountVectorizer(min_df=.01, # min_df - ignore terms that appear in less than 1% of documents
                                     max_df=0.5, # max_df - ignore terms that appear in more than 50% of documents
                                     vocabulary=vocab) # vocabulary - restrict features to the given terms
    else:
        vectorizer = CountVectorizer(min_df=.1, # min_df - ignore terms that appear in less than 10% of documents
                                     max_df=0.5, # max_df - ignore terms that appear in more than 50% of documents
                                     ngram_range=(1,3)) # ngram_range - consider unigrams, bigrams and trigrams
    
    tfidf_transformer = TfidfTransformer()
    
    # Raw term counts (sparse document-term matrix)
    X = vectorizer.fit_transform(df['item_list'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    # Re-weight the counts with tf-idf directly on the sparse matrix
    X_tfidf = tfidf_transformer.fit_transform(X)
    dtm_tfidf_df = pd.DataFrame(X_tfidf.toarray(), index = df['cuisine'], columns = vocab)
    
    return dtm_tfidf_df

In [15]:
def collect_menu_links(location_list):
    '''
    Scrapes allmenus.com listing pages and gathers restaurant menu links.
    
    One listing page is requested per (state, city, cuisine) combination.
    
    Parameters
    ----------
    location_list: list of dict
        Each entry has a 'state' value and a 'cities' iterable.
    Returns
    -------
    links: list of str
        Restaurant hrefs, each suffixed with '%' and the cuisine text
        found next to the restaurant on the listing page
    rest_titles: list of str
        Restaurant name/title, positionally aligned with `links`
    '''
    cuisines = ['greek', 'south-american', 'filipino', 'indian', 'jamaican', 'spanish', 'italian', 'mexican', 
                'chinese', 'british-traditional', 'thai', 'vietnamese', 'brazilian', 'french', 'japanese', 'irish', 
                'korean', 'moroccan', 'russian']
    
    links = []
    rest_titles = []
    
    for location in location_list:
        state = location['state']
        cities = location['cities']

        for city in cities:
            for cuisine in cuisines:
                page = requests.get(f'https://www.allmenus.com/{state}/{city}/-/{cuisine}/')
                soup = BeautifulSoup(page.content, 'html.parser')

                # Restaurant headers and their cuisine paragraphs are paired by position
                name_headers = soup.find_all('h4', class_='name')
                cuisine_tags = soup.find_all('p', class_='cousine-list')

                for position, header in enumerate(name_headers):
                    anchor = header.find_all('a')[0] # first child <a> holds href and title
                    cuisine_type = cuisine_tags[position].getText()

                    link = anchor.get('href')
                    links.append(f'{link}%{cuisine_type}')
                    rest_titles.append(anchor.getText())

    # Duplicate restaurants are NOT removed here; entries are returned as scraped
    return (links, rest_titles)

In [17]:
def build_menu_df(links, rest_titles):
    '''
    Builds the menu dataframe by scraping each restaurant's menu page.
    
    Parameters
    ----------
    links: list of str
        allmenus' relative links of the form
        '/<state>/<city>/<id>-<slug>/...%<categories>'
    rest_titles: list of str
        Restaurant name/title, positionally aligned with `links`
    Returns
    -------
    menu_df: pandas.DataFrame
        One row per menu item with columns id, name, menu_titles,
        menu_desc and categories. menu_desc is '' when the page has no
        description at that item's position.
    '''
    ids = []
    rest_names = []
    categories = []
    menu_titles = []
    menu_desc = []

    for j, link in enumerate(links):

        # Split off the '%<categories>' suffix first so it cannot interfere
        # with path parsing; a link without '%' keeps its full path.
        link, _, category = link.partition('%')

        # Path layout: /<state>/<city>/<id>-<slug>/...
        state = link.split('/')[1]
        city = link.split(f'/{state}/')[1].split('/')[0]
        rest_id = int(link.split(f'/{state}/{city}/')[1].split('-')[0])

        # Get restaurant page
        page = requests.get(f'https://www.allmenus.com{link}')
        soup = BeautifulSoup(page.content, 'html.parser')

        rest_name = rest_titles[j]

        # Menu items and their descriptions are paired by position.
        # NOTE(review): if only some items carry a description the pairing
        # may be misaligned - confirm against the page markup.
        item_titles = soup.find_all('span', class_='item-title')
        item_desc = soup.find_all('p', class_='description')

        for i, item_title in enumerate(item_titles):
            ids.append(rest_id)
            rest_names.append(rest_name)
            categories.append(category)
            menu_titles.append(item_title.getText())

            # Always append exactly one description per item so every column
            # has the same length (the original skipped it for pages without
            # descriptions, which made the DataFrame constructor raise).
            if i < len(item_desc):
                menu_desc.append(item_desc[i].getText())
            else:
                menu_desc.append('')
    
    d = {'id': ids, 'name': rest_names, 'menu_titles': menu_titles, 'menu_desc': menu_desc, 'categories': categories}
    menu_df = pd.DataFrame(data=d)
    
    return menu_df

In [8]:
def expand_contractions(text, contraction_dict=CONTRACTION_DICT):
    """
    Expands contractions and strips remaining apostrophes.
    For example, "y'all can't" => "you all cannot"

    Matching is case-insensitive; when the contraction starts with an
    uppercase letter the expansion is capitalized as well
    ("Can't" => "Cannot").

    Parameters
    ----------
    text : str
        Text data
    contraction_dict : dict (optional)
        Mapping of contraction => expanded form
    Returns
    -------
    expanded_text: str
        Returns expanded text without apostrophes
    """
    expanded_text = text

    # An empty mapping would compile to a pattern that matches everywhere
    if contraction_dict:
        # Case-insensitive fallback table; the first key seen for a lowercase form wins
        lowercase_lookup = {}
        for key, value in contraction_dict.items():
            lowercase_lookup.setdefault(key.lower(), value)

        # Longest keys first: regex alternation takes the first alternative that
        # matches, so "can't" must not shadow "can't've".
        ordered_keys = sorted(contraction_dict, key=len, reverse=True)
        contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                          flags=re.IGNORECASE|re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = contraction_dict.get(match)
            if expanded_contraction is None:
                expanded_contraction = lowercase_lookup[match.lower()]
            # Carry over capitalization only. Splicing the matched first character
            # onto the expansion corrupted entries whose expansion starts with a
            # different letter ("ain't" => "as not", "'cause" => "ecause").
            if match[0].isupper() and expanded_contraction:
                expanded_contraction = expanded_contraction[0].upper() + expanded_contraction[1:]
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    # Drop any apostrophes that were not part of a known contraction
    expanded_text = expanded_text.replace("'", "")
    
    return expanded_text

In [4]:
def tokenize(text):
    """
    Removes punctuation, whitespace and non-alphabetic tokens, filters stop
    words, then lemmatizes and lowercases what is left.

    Parameters
    ----------
    text : str
        Yelp review or tips text data
    Returns
    -------
    clean_text: str
        Space-joined lowercase lemmas of the kept tokens
    """
    # Parser and NER are disabled for this call
    tokens = nlp(text, disable=['parser', 'ner'])
    clean_tokens = []
    
    for token in tokens:
        
        # Keep alphabetic tokens only (no punctuation, whitespace, digits)
        if token.is_punct or token.is_space or not token.is_alpha:
            continue

        word = token.lemma_.strip().lower() # lemmatize, strip whitespace and lowercase

        # Compare case-insensitively, and against the lemma as well: the custom
        # stop words are lemma forms ('order', 'serve', ...), so a check on the
        # raw token text let "Food" or "ordered" slip through.
        if token.text.lower() in stop_words or word in stop_words:
            continue

        clean_tokens.append(word)
                
    clean_text = ' '.join(clean_tokens) # re-create text from clean tokens
    return clean_text

In [40]:
def get_bigrams_trigrams_list(text):
    '''
    Splits each document into tokens, fits bigram/trigram phrase models on
    them and returns the phrase-merged token lists.
    
    Parameter
    ---------
    text:  pandas.Series
        Text values, one space-separated document per entry
    Returns
    -------
    bigrams_list, trigrams_list: list of lists (str)
        Output of build_bigram_trigram_lists for the split documents
    '''
    # One token list per document (split on single spaces)
    text_data = [document.split(' ') for document in text]
    
    # Fit the phrase models, then apply them to the same documents
    bigram_model, trigram_model = build_bigram_trigram_models(text_data)
    return build_bigram_trigram_lists(text_data, bigram_model, trigram_model)

In [21]:
def build_bigram_trigram_models(text_data):
    '''
    Fits gensim phrase detectors for bigrams and trigrams.
    
    Parameter
    ---------
    text_data:  list of lists (str)
        Tokenized documents
    Returns
    -------
    bigram_model, trigram_model: gensim.models.phrases.Phraser
        Phraser wrappers around the fitted bigram and trigram detectors
    '''
    phrases_cls = gensim.models.Phrases
    phraser_cls = gensim.models.phrases.Phraser

    # higher threshold => fewer phrases
    bigram_phrases = phrases_cls(text_data, min_count=5, threshold=100)
    # The trigram detector is fitted on the bigram-transformed corpus
    trigram_phrases = phrases_cls(bigram_phrases[text_data], threshold=100)

    return phraser_cls(bigram_phrases), phraser_cls(trigram_phrases)

In [22]:
def build_bigram_trigram_lists(text_data, bigram_model, trigram_model):
    '''
    Applies the phrase models to every document.
    
    Parameter
    ---------
    text_data:  list of lists (str)
        Tokenized documents
    bigram_model, trigram_model:
        Objects indexable with a token list (model[tokens])
    Returns
    -------
    bigrams: list of lists (str)
        bigram_model applied to each document
    trigrams: list of lists (str)
        trigram_model applied on top of the bigram output of each document
    '''
    bigrams = [bigram_model[tokens] for tokens in text_data]
    trigrams = [trigram_model[bigram_model[tokens]] for tokens in text_data]
    return bigrams, trigrams

In [23]:
def find_dominant_topics(lda_model, row, df):
    """
    Identifies dominant topic and its percentage contribution in each document.

    Parameters
    ----------
    lda_model: gensim.models.ldamulticore.LdaMulticore
        LDA model that holds all topics
    row: enumerate object
        Yields (index, model output for one document). The output is a list
        of (topic number, contribution) tuples, or a tuple whose first item
        is that list when lda_model.per_word_topics is set.
    df : pandas.DataFrame
        Dataframe with a 'clean_text' column. NOTE: its index is reset in
        place, as in the original implementation.
    
    Returns
    -------
    df: pandas.DataFrame
        Dataframe with dominant_topic, percent_contribution, topic_keywords
        and clean_text columns
    """
    # Positional index so that clean_text lines up with the rows built below
    df.reset_index(drop=True, inplace=True)
    
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for _, row_list in row:
        
        topic_dist = row_list[0] if lda_model.per_word_topics else row_list

        # Highest contribution wins; the first topic wins on ties
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        
        topic_keywords = ', '.join(word for word, _ in lda_model.show_topic(topic_num))
        records.append((int(topic_num), round(float(prop_topic), 3), topic_keywords))

    topics_df = pd.DataFrame(records, columns=['dominant_topic', 'percent_contribution', 'topic_keywords'])
    topics_df = pd.concat([topics_df, df['clean_text']], axis=1)
    return topics_df

In [12]:
def compute_coherence_values(id2word, tfidf_corpus, text_data, limit, start=2, step=3):
    """
    Grid-searches LDA models over topic count, alpha and beta (eta) and
    records the c_v coherence of each model.

    Parameters:
    ----------
    id2word : Gensim dictionary
    tfidf_corpus : Gensim corpus
    text_data : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between topic counts

    Returns:
    -------
    tfidf_model_list : List of LDA topic models, in training order
    coherence_values : Coherence value of each model, same order
    lda_tuning_result_df: pandas.DataFrame with Topics, Alpha, Beta and Coherence columns
    """
    # Candidate priors: numeric values plus gensim's named strategies
    alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']
    beta = [*np.arange(0.01, 1, 0.3), 'symmetric']

    tfidf_model_list = []
    coherence_values = []
    topics_col, alpha_col, beta_col = [], [], []

    # Topic count is the outer dimension, beta the innermost one
    grid = ((k, a, b) for k in range(start, limit, step) for a in alpha for b in beta)

    for num_topics, a, b in grid:
        tfidf_lda_model = LdaMulticore(tfidf_corpus, # stream of document vectors
                                       id2word=id2word, # mapping from word IDs to words
                                       num_topics=num_topics, # number of latent topics to extract
                                       chunksize=100, # number of docs per training chunk
                                       alpha=a,
                                       eta=b,
                                       random_state=42, # fixed seed
                                       eval_every=None, # skip perplexity evaluation
                                       passes=2, # passes through the corpus
                                       workers=4)
        tfidf_model_list.append(tfidf_lda_model)

        scorer = CoherenceModel(model=tfidf_lda_model, texts=text_data, dictionary=id2word, coherence='c_v')
        coherence_value = scorer.get_coherence()
        coherence_values.append(coherence_value)

        topics_col.append(num_topics)
        alpha_col.append(a)
        beta_col.append(b)

    lda_tuning_result_df = pd.DataFrame({'Topics': topics_col,
                                         'Alpha': alpha_col,
                                         'Beta': beta_col,
                                         'Coherence': coherence_values})

    return tfidf_model_list, coherence_values, lda_tuning_result_df

In [25]:
def plot_doc_word_counts(df, doc_lens, ax_count, row_count, start_index=0):
    '''
    Plots the distribution of document word counts, one subplot per dominant topic.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with 'dominant_topic' and 'clean_text' columns
    doc_lens: list
        Not read: word counts are recomputed per topic inside the loop
    ax_count: int
        First grid argument of plt.subplots (number of subplot rows)
    row_count: int
        Second grid argument of plt.subplots (number of subplot columns)
    start_index: int
        Topic id assigned to the first subplot
    '''
    palette = list(mcolors.XKCD_COLORS.values())
    random.shuffle(palette)

    # sharex/sharey - all subplots share both axes
    fig, axes = plt.subplots(ax_count, row_count, figsize=(15,10), dpi=100, sharex=True, sharey=True)

    for topic_id, ax in enumerate(axes.flatten(), start_index):
        topic_color = palette[topic_id]

        # Documents whose dominant topic is this subplot's topic
        topic_docs = df.loc[df['dominant_topic'] == topic_id, :]
        doc_lens = [len(text.split()) for text in topic_docs['clean_text']]

        # Histogram with 300 bins in the topic's color
        ax.hist(doc_lens, bins=300, color=topic_color)
        ax.tick_params(axis='y', labelcolor=topic_color, color=topic_color)

        # Unfilled density curve on a twin y-axis
        sns.kdeplot(doc_lens, color="black", ax=ax.twinx())

        ax.set(xlim=(0, 300), xlabel='Document Word Count')
        ax.set_ylabel('Number of Documents', color=topic_color)
        ax.set_title(f'Topic: {topic_id}', fontdict=dict(size=15, color=topic_color))

    fig.tight_layout()
    fig.subplots_adjust(top=0.90)
    plt.xticks(np.linspace(0,300,10))
    fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=15)
    plt.show()

In [7]:
def plot_wordcounts(df, ax_count, row_count, start_index=0,
                    word_count_ylim=5000000, importance_ylim=0.030):
    '''
    Plot Word Count and Weights of Topic Keywords, one subplot per topic.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with 'word', 'topic_id', 'importance' and 'word_count' columns
    ax_count: int
        First grid argument of plt.subplots (number of subplot rows)
    row_count: int
        Second grid argument of plt.subplots (number of subplot columns)
    start_index: int
        Topic id assigned to the first subplot
    word_count_ylim: float (optional)
        Upper y-limit of the word-count axis (previously hard-coded)
    importance_ylim: float (optional)
        Upper y-limit of the importance axis (previously hard-coded)
    '''
    # Get random colors
    colors = list(mcolors.XKCD_COLORS.values())
    random.shuffle(colors)
    
    fig, axes = plt.subplots(ax_count, row_count, figsize=(12,7), sharey=True, dpi=100)

    for i, ax in enumerate(axes.flatten(), start_index):

        # Rows of the current topic, filtered once per subplot
        topic_df = df.loc[df.topic_id==i, :]
        
        # Wide translucent bars: word count
        ax.bar(x='word', height="word_count", data=topic_df, color=colors[i], width=0.5, alpha=0.3, label='Word Count')
        ax_twin = ax.twinx() # second y-axis on the same x-axis
        
        # Narrow bars: weight (importance)
        ax_twin.bar(x='word', height="importance", data=topic_df, color=colors[i], width=0.2, label='Weights')
        ax.set_ylabel('Word Count', color=colors[i])
        
        # Set y-axis view limits
        ax.set_ylim(0, word_count_ylim)
        ax_twin.set_ylim(0, importance_ylim)

        # Set title
        ax.set_title(f'Topic: {i}', color=colors[i], fontsize=15)
        ax.tick_params(axis='y', left=False)
        
        # Tilt x-labels 30 deg
        ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment='right')
        
        # Set legend 
        ax.legend(loc='upper left')
        ax_twin.legend(loc='upper right')

    fig.tight_layout(w_pad=2)    
    fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=20, y=1.05)    
    plt.show()

In [9]:
def plot_wordcloud(df, lda_model, ax_count, row_count, start_index=0):
    '''
    Draws one word cloud per topic from the topic's word weights
    (at most ten words each).
    
    Parameters
    ----------
    df : pandas.DataFrame
        Not read by this function
    lda_model: gensim.models.ldamulticore.LdaMulticore
        LDA model that holds all topics
    ax_count: int
        First grid argument of plt.subplots (number of subplot rows)
    row_count: int
        Second grid argument of plt.subplots (number of subplot columns)
    start_index: int
        Topic index assigned to the first subplot
    '''
    palette = list(mcolors.XKCD_COLORS.values())
    random.shuffle(palette)

    def topic_color(*args, **kwargs):
        # topic_idx is bound by the loop below before any cloud is generated
        return palette[topic_idx]
    
    word_cloud = WordCloud(stopwords=stop_words, # module-level stop-word set
                           width=2500, # width of canvas
                           height=1800, # height of canvas
                           max_words=10, # at most 10 words per cloud
                           color_func=topic_color, # one color per topic
                           prefer_horizontal=1.0) # horizontal words only
    
    # formatted=False - (topic id, [(word, probability), ...]) pairs
    # num_topics=-1 - all topics
    topics = lda_model.show_topics(formatted=False, num_topics=-1) 
    
    fig, axes = plt.subplots(ax_count, row_count, figsize=(15,10), sharex=True, sharey=True)

    for topic_idx, ax in enumerate(axes.flatten(), start_index):
        fig.add_subplot(ax) # makes ax the current axes
        frequencies = dict(topics[topic_idx][1])
        word_cloud.generate_from_frequencies(frequencies, max_font_size=300)
        ax.imshow(word_cloud) # render the cloud as an image
        ax.set_title(f'Topic {topic_idx}', fontdict=dict(size=15))
        ax.axis('off')

    plt.subplots_adjust(wspace=0, hspace=0) # remove spacing between axes
    plt.axis('off')
    plt.margins(x=0, y=0) # remove margins
    plt.tight_layout() # adjust padding between and around subplots
    plt.show()

In [6]:
def generate_wordcloud(df, cuisine, col):
    '''
    Builds a word cloud from summed tf-idf weights of one cuisine's texts.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with text and menu values; rows are selected where
        df[cuisine] == 1
    cuisine: str
        cuisine indicator column (ex: japanese, american, etc.)
    col: str
        name of the text column to vectorize
    Returns:
    -------
    WordCloud: wordcloud.wordcloud.WordCloud
        WordCloud object with at most 25 words, drawn in a single random color
    '''
    colors = list(mcolors.XKCD_COLORS.values())
    random.shuffle(colors)
    
    vec = TfidfVectorizer(stop_words='english') # Instantiation
    vecs = vec.fit_transform(df[df[cuisine] == 1][col]) # Learns vocabulary and returns tf-idf vectors
    
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back for older installations.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()
    
    # Total tf-idf weight per term across the selected documents
    tfidf_df = pd.DataFrame(vecs.toarray(), columns=feature_names)
    word_weights = tfidf_df.sum(axis=0)
    
    return WordCloud(max_words=25, 
                     color_func=lambda *args, **kwargs: colors[0], # Sets color
                    ).generate_from_frequencies(word_weights)

In [29]:
def get_dominant_topics_and_perc(lda_model, tfidf_corpus, start_index, end_index):
    '''
    Gets dominant topics and topics percentage list.
    
    Parameters
    ----------
    lda_model: gensim.models.ldamulticore.LdaMulticore
        LDA model that holds all topics
    tfidf_corpus: gensim.interfaces.TransformedCorpus
        tfidf corpus (must support slicing)
    start_index: int
        Starting index
    end_index: int
        Ending index (exclusive)
    Returns
    -------
    dominant_topics: list of tuples
        (position within the selected slice, dominant topic id)
    topic_percentages: list of lists
        Per document, the (topic id, contribution) tuples
    '''
    corpus_sel = tfidf_corpus[start_index:end_index]
    dominant_topics = []
    topic_percentages = []
    
    for i, corp in enumerate(corpus_sel):
        model_output = lda_model[corp]

        # Same convention as find_dominant_topics: with per_word_topics the
        # model returns a tuple whose first item is the topic distribution;
        # otherwise it returns the distribution itself. A fixed 2-tuple
        # unpack fails in both cases.
        topic_percs = model_output[0] if lda_model.per_word_topics else model_output

        # Highest contribution wins; the first topic wins on ties
        dominant_topic = max(topic_percs, key=lambda pair: pair[1])[0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
        
    return(dominant_topics, topic_percentages)

In [30]:
def create_top_three_words_df(dominant_topics, topic_percentages=None, lda_model=None):
    '''
    Summarises topic distribution and the top three keywords per topic.
    
    Parameters
    ----------
    dominant_topics: list of tuples
        (document id, dominant topic id) per document
    topic_percentages (optional): list of lists
        Per document, the (topic id, contribution) tuples. Falls back to
        the notebook-level `topic_percentages` variable when omitted.
    lda_model (optional): gensim LDA model
        Model providing show_topics(). Falls back to the notebook-level
        `lda_model` variable when omitted.
    Returns
    -------
    df_dominant_topic_in_each_doc: pandas.DataFrame
        Number of documents per dominant topic (Dominant_Topic, count)
    df_topic_weightage_by_doc: pandas.DataFrame
        Summed contribution per topic over all documents (index, count)
    df_top3words: pandas.DataFrame
        Top three keywords per topic (topic_id, words)
    '''
    # The original body read these two names from the notebook namespace;
    # keep that as a fallback so one-argument calls behave as before.
    if topic_percentages is None:
        if 'topic_percentages' not in globals():
            raise NameError("name 'topic_percentages' is not defined")
        topic_percentages = globals()['topic_percentages']
    if lda_model is None:
        if 'lda_model' not in globals():
            raise NameError("name 'lda_model' is not defined")
        lda_model = globals()['lda_model']

    # Distribution of Dominant Topics in Each Document
    df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
    dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
    df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

    # Total Topic Distribution by actual weight
    topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
    df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

    # Top 3 Keywords for each Topic
    topic_top3words = [(topic_id, word)
                       for topic_id, topic_words in lda_model.show_topics(formatted=False)
                       for word, _ in topic_words[:3]]

    df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
    df_top3words = df_top3words_stacked.groupby('topic_id')['words'].agg(', \n'.join).reset_index()

    # The original computed these frames but returned nothing
    return df_dominant_topic_in_each_doc, df_topic_weightage_by_doc, df_top3words

In [4]:
def get_top_n_words(x, index, n_gram):
    '''
    Returns the most frequent n-grams of a fixed size.
    
    Parameters
    ----------
    x: pandas.Series
        Text values
    index: int
        How many top entries to return
    n_gram: int
        n-gram size (1 = single words, 2 = two-word phrases, ...)
    Returns
    -------
    words_freq: list of tuples
         (n-gram, total count) tuples sorted by count, highest first
    '''
    vectorizer = CountVectorizer(ngram_range=(n_gram, n_gram), stop_words='english')
    vectorizer.fit(x)

    # Column sums of the document-term matrix = corpus-wide count per term
    totals = vectorizer.transform(x).sum(axis=0)

    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:index]

In [1]:
def cos_similarity(a, b):
    '''
    Computes the cosine similarity of two vectors.
    
    Parameters
    ----------
    a: list of dummy business attributes values (binary)
    b: list of business attributes values from business dataframe(binary)
    Returns
    -------
    sim: float
        cosine similarity; 0.0 when either vector has zero norm
    '''
    # Float arrays avoid overflow when small integer dtypes are squared
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    nom = np.sum(np.multiply(a, b))
    denom = np.sqrt(np.sum(np.square(a))) * np.sqrt(np.sum(np.square(b)))

    # An all-zero vector has no direction; return 0.0 instead of dividing by
    # zero, which produced NaN and a RuntimeWarning.
    if denom == 0:
        return 0.0

    sim = float(nom / denom)
    return sim

In [4]:
def d2v_ranking_ir(query, model, df, model_type):
    '''
    Ranks restaurants by cosine similarity between the query's inferred
    Doc2Vec vector and each row's stored vector.
    
    Parameters
    ----------
    query: str
        Text value
    model: gensim.models.doc2vec.Doc2Vec
        Doc2Vec model
    df: pandas.DataFrame
        Dataframe with 'name', 'popularity_score', 'vector' and the clean
        text column selected by model_type
    model_type: str
        'text' => 'clean_text', 'menu_desc' => 'clean_menu_desc',
        anything else => 'clean_menu_titles'
    Returns
    -------
    result_df: pandas.DataFrame
        name, the selected text column, popularity_score and similarity,
        sorted by similarity (highest first) with a fresh index
    '''
    
    # Preprocess query the same way as the corpus text
    query = expand_contractions(query) # expand contractions
    query = tokenize(query) # lowercase, lemmatizes, and removes stop word
    query = re.sub(' +', ' ', query) # Remove extra whitespace
    
    # Infer the vector from the preprocessed query (the original referenced
    # an undefined `test_value` here).
    vector = model.infer_vector(query.split())
    query_vec = np.array(vector).reshape(1, -1) # reshaped once, reused for every row
    
    # Pick the text column that belongs to the model type
    text_columns = {'text': 'clean_text', 'menu_desc': 'clean_menu_desc'}
    text_col = text_columns.get(model_type, 'clean_menu_titles')
    documents = df[['name', text_col, 'popularity_score']].copy()
    
    # cosine_similarity expects 2-D inputs, hence the (1, n) reshape per row
    documents['similarity'] = df['vector'].apply(
        lambda doc_vec: cosine_similarity(query_vec, np.array(doc_vec).reshape(1, -1)).item())
    
    # Sort by similarity score
    documents = documents.sort_values(by='similarity', ascending=False)
    
    return documents.reset_index(drop=True)

In [5]:
def search_categories(query):
    '''
    Finds restaurants whose category matches any term of the query.
    
    Parameters
    ----------
    query: str
        Free-text search query
    Returns
    -------
    names: numpy.ndarray or list
        Restaurant names of the de-duplicated rows from every matching
        categories_dict entry; an empty list when nothing matches
    '''
    query = expand_contractions(query) # expand contractions
    query = tokenize(query) # lowercase, lemmatizes, and removes stop word
    query = re.sub(' +', ' ', query) # Remove extra whitespace

    # De-duplicate terms but keep their order: a set has no stable order for
    # strings, which made the result order vary between runs.
    search_terms = list(dict.fromkeys(query.split()))

    # Dataframes of every term that is a key of the categories dictionary
    dfs = [categories_dict[term] for term in search_terms if term in categories_dict]

    # No search term, or no term matched a category
    if not dfs:
        return []

    result_df = pd.concat(dfs) if len(dfs) > 1 else dfs[0]

    # Drop duplicate rows, keeping the first occurrence
    result_df = result_df.drop_duplicates(keep='first').reset_index(drop=True)

    return result_df['name'].values # all matching restaurant names