In [5]:
# Essentials
import pandas as pd
import numpy as np
import math
import ast

In [10]:
# Text-analysis helpers for the reviews dataframe:
#   - WordCloud comes from the `wordcloud` package (not nltk).
#   - SentimentIntensityAnalyzer is nltk's VADER sentiment scorer.
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One shared analyser instance, reused by analyse_sentiment() below.
analyser = SentimentIntensityAnalyzer()

# Spacy for lemmatization and stop words
import spacy
# Shared spaCy pipeline used by tokenize(); requires the 'en_core_web_sm' model to be installed.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): relies on `spacy.lang.en` already being imported as a side effect of
# loading the English model above - confirm, or import the submodule explicitly.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [5]:
def create_attr_columns(business_df, column_name):
    '''
    Expands a dict-valued attribute column into one indicator column per true key.

    Every cell of ``column_name`` that is not equal to 0 is parsed with
    ``ast.literal_eval`` and is expected to be the string form of a dict
    (ex: "{'casual': True, 'classy': False}"). For each key whose value is True,
    the column named after that key is set to 1 on that row; the column is created
    on first use, so rows that never set it hold NaN. ``business_df`` is modified
    in place.

    Parameters
    ----------
    business_df : pandas.DataFrame
        Yelp's business dataframe
    column_name : str
        Name of any column holding stringified dicts, such as ambience, music,
        noiselevel, goodformeal, etc. Cells equal to 0 are skipped.

    Returns
    -------
    None
        The new columns are written directly onto business_df.
    '''
    has_attrs = business_df[column_name] != 0

    # Iterate the column itself rather than itertuples(): this works for any column
    # name (itertuples needs one hard-coded attribute access per column and renames
    # columns that are not valid Python identifiers).
    for row_label, raw_value in business_df.loc[has_attrs, column_name].items():

        col_obj = ast.literal_eval(raw_value) # convert str to dict

        for val_key, flag in col_obj.items(): # create new attribute columns when row value is set to true

            # `== True` (not plain truthiness) on purpose: None/False/other values are ignored
            if flag == True:
                business_df.loc[row_label, val_key] = 1

In [1]:
def set_review_count(series):
    '''
    Buckets review dates by their age relative to the most recent review and
    reports the span between the first and the last review.

    Parameter
    ---------
    series: pandas.core.series.Series
        Review dates (anything pd.Timestamp accepts); presumably the date column of
        one business's reviews - confirm against the caller.
    Returns
    -------
    pandas.core.series.Series
        Entries '30_days_review_count', '60_days_review_count', '90_days_review_count',
        '180_days_review_count', '365_days_review_count' and 'lifespan'.
        Each review is counted in exactly one bucket: the smallest window that contains
        its age in days, so e.g. the 60-day entry holds reviews aged 31-60 days (the
        buckets are not cumulative). Reviews older than 365 days are not counted.
        'lifespan' is the number of years between the earliest and the latest review,
        rounded to 2 decimals.
    '''
    # NOTE(review): the column names read like cumulative "last N days" counts, but the
    # buckets are mutually exclusive - confirm that this is what downstream code expects.
    newest = pd.Timestamp(series.max())
    oldest = pd.Timestamp(series.min())

    windows = (30, 60, 90, 180, 365)
    tallies = dict.fromkeys(windows, 0)

    for stamp in series:
        age_in_days = (newest - pd.Timestamp(stamp)).days

        # first (smallest) window that covers this review wins
        for limit in windows:
            if age_in_days <= limit:
                tallies[limit] += 1
                break

    labels = [f'{limit}_days_review_count' for limit in windows]
    summary = {label: tallies[limit] for label, limit in zip(labels, windows)}
    summary['lifespan'] = round((newest - oldest).days / 365, 2)

    return pd.Series(summary, index=labels + ['lifespan'])

In [3]:
def getAverageValues(df):
    '''
    Computes the mean adjusted star rating, review count and price.
    Parameter
    ---------
    df: pandas.core.frame.DataFrame
        Business dataframe or any dataframe with adjusted_stars, review_count
        and price columns
    Returns
    -------
    tuple
        (avg_stars, avg_review_count, avg_price), each rounded to 2 decimals
    '''
    averaged_columns = ('adjusted_stars', 'review_count', 'price')
    return tuple(round(df[name].mean(), 2) for name in averaged_columns)

In [5]:
def flatten_df(df):
    '''
    Reshapes business dataframe in preparation for scatter plot
    Parameter
    ---------
    df: pandas.core.frame.DataFrame
        Business dataframe whose first five columns are the 30, 60, 90, 180, and 365
        days review counts. Each of those column names must start with the number of
        days followed by an underscore (ex: '30_days_review_count').
    Returns
    -------
    pandas.core.frame.DataFrame
        Returns reshaped (long format) dataframe with a 'visits' column holding the
        counts and a 'days' column holding the window length the count came from.
        The original row index is kept, so it repeats once per source column.
    '''
    frames = []

    # Read the columns from the dataframe that was passed in, not from a notebook-level
    # global, so the function works on any frame and on a freshly restarted kernel.
    for column in df.columns[:5]:

        days = int(column.split('_')[0]) # '30_days_review_count' -> 30
        frames.append(pd.DataFrame({'visits': df[column], 'days': [days] * len(df.index)}))

    return pd.concat(frames)

In [4]:
def create_lifespan_df(df):
    '''
    Creates business lifespan dataframe.

    Businesses are grouped into one-year buckets. Bucket "i to i+1" holds every
    business with i < lifespan <= i+1, and the first bucket also includes
    lifespan == 0, so businesses whose lifespan is exactly 0 or a whole number of
    years are counted (a strict comparison on both ends would drop them).
    Rows with a missing lifespan are ignored.

    Parameter
    ---------
    df: pandas.core.frame.DataFrame
        Closed or Open business dataframe with a numeric 'lifespan' column (in years)
    Returns
    -------
    pandas.core.frame.DataFrame
        Returns pandas' dataframe with a 'years' label column (ex: '0 to 1') and a
        'business_count' column holding the # of businesses in that bucket.
        Empty (but with both columns) when there is no lifespan data.
    '''
    lifespans = df['lifespan'].dropna()

    if lifespans.empty:
        # nothing to bucket; math.ceil() would fail on the NaN maximum
        return pd.DataFrame({'years': [], 'business_count': []})

    # at least one bucket so that a column of all-zero lifespans is still counted
    num_of_buckets = max(math.ceil(lifespans.max()), 1)

    years = []
    business_count = []

    for i in range(num_of_buckets):
        above_lower = lifespans >= 0 if i == 0 else lifespans > i
        in_bucket = above_lower & (lifespans <= i + 1)
        years.append(f'{i} to {i+1}')
        business_count.append(int(in_bucket.sum()))

    return pd.DataFrame({'years': years, 'business_count': business_count})

In [6]:
def create_dataframes(df, weekly_df, max_years=15):
    '''
    Splits businesses into one-year lifespan buckets and builds the revenue
    dataframe for those buckets.

    Bucket ``n`` (1..max_years) holds every business with n-1 < lifespan <= n, and
    bucket 1 also includes lifespan == 0, so businesses whose lifespan is exactly 0
    or a whole number of years are not silently dropped.

    Parameter
    ---------
    df: pandas.core.frame.DataFrame
        Closed or open business dataframe with a numeric 'lifespan' column (in years)
    weekly_df: pandas.core.frame.DataFrame
        Weekly review dataframe
    max_years: int, optional
        Number of one-year buckets to create (default 15)
    Returns
    -------
    Whatever create_revenue_df returns for the per-bucket dataframes and weekly_df
    (a dataframe of years and revenue).
    '''
    dict_of_df = {}
    lifespan = df['lifespan']

    for year in range(1, max_years + 1):
        above_lower = lifespan >= 0 if year == 1 else lifespan > year - 1
        dict_of_df[year] = df[above_lower & (lifespan <= year)].reset_index(drop=True)

    return create_revenue_df(dict_of_df, weekly_df)

In [7]:
def create_revenue_df(dict_of_df, weekly_df):
    '''
    Creates revenue dataframe (columns= years and revenue).

    For every entry of dict_of_df, sums the 'revenue' of all weekly_df rows whose
    'business_id' appears in that entry's dataframe.

    Parameter
    ---------
    dict_of_df: Dictionary
        Dictionary filled with dataframes by business lifespan; keys must be
        convertible to int and each dataframe needs a 'business_id' column
    weekly_df: pandas.core.frame.DataFrame
        Weekly review dataframe with 'business_id' and 'revenue' columns
    Returns
    -------
    pandas.core.frame.DataFrame
        Returns revenue dataframe with one row per key: 'years' (the key as int)
        and 'revenue' (the summed revenue)
    '''
    years = []
    revenues = []

    for key, lifespan_df in dict_of_df.items():
        # build the membership mask once and reuse it for the revenue lookup
        in_bucket = weekly_df['business_id'].isin(lifespan_df['business_id'])
        revenues.append(weekly_df.loc[in_bucket, 'revenue'].sum())
        years.append(int(key))

    return pd.DataFrame(data={'years': years, 'revenue': revenues})

In [1]:
def analyse_sentiment(review_text):
    '''
    Scores the sentiment of a review with the shared VADER analyser.

    Parameter
    ---------
    review_text:  str
        review text from review dataframe
    Returns
    -------
    The result of analyser.polarity_scores(review_text): the negative, neutral,
    positive, and compound sentiment values for the text.
    '''
    return analyser.polarity_scores(review_text)

In [2]:
def tokenize(text):
    """
    Cleans a text: keeps alphabetic nouns, verbs, adjectives and adverbs that are
    not stop words, lemmatizes and lowercases them, and joins them back together.

    Parameters
    ----------
    text : str
        Yelp review or tips text data
    Returns
    -------
    clean_text: str
        Returns the cleaned lemmas joined by single spaces
    """
    allowed_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'} # allowed parts of speech
    doc = nlp(text, disable=['parser', 'ner'])

    clean_tokens = [
        token.lemma_.strip().lower() # lemmatize, strip whitespace and lowercase
        for token in doc
        if token.is_alpha # alphabetic only, which also rules out punctuation and whitespace tokens
        and token.pos_ in allowed_pos # include noun, verb, adjectives, adverbs
        # compare the lowercased form: the stop word set is lowercase, so a capitalized
        # stop word (ex: "Really" at the start of a sentence) must not slip through
        and token.lower_ not in stop_words
    ]

    return ' '.join(clean_tokens) # re-create text from clean tokens

In [7]:
# Column names of interest, in order: business identifiers and rating/price fields,
# service attributes (caters ... counterservice), ambience flags (casual ... upscale),
# meal flags (lunch ... dessert), then food-related category names.
# NOTE(review): presumably used to select columns of the business dataframe - confirm
# against the cell that consumes it.
target_cols = ['business_id','name','review_count','stars','price','caters','goodforkids','alcohol','goodforgroups',
               'tableservice','drivethru','outdoorseating','noiselevel','delivery','takeout','goodfordancing',
               'counterservice','casual','classy','hipster','trendy','touristy','romantic','intimate','divey',
               'upscale','lunch','dinner','brunch','breakfast','dessert','tiki bars','delis','portuguese',
               'cheese shops','gluten-free','american (traditional)','szechuan','pretzels','tasting classes',
               'swiss food','ethnic food','haitian','hawaiian','farmers market','sicilian','turkish',
               'candy stores','wine tasting room','distilleries','taiwanese','dive bars','cafeteria',
               'cantonese','izakaya','asian fusion','gelato','honduran','vietnamese','desserts',
               'fondue','pan asian','brewpubs','japanese','club crawl','rotisserie chicken','cupcakes',
               'moroccan','salvadoran','cheese tasting classes','greek','cajun/creole',
               'steakhouses','vegetarian','beer','kebab','seafood','tea rooms','malaysian','pasta shops',
               'food court','wineries','oaxacan','bars','pakistani',
               'seafood markets','scandinavian','ramen','arabian','coffeeshops','beverage store',
               'southern','chinese','belgian','polynesian','hainan','cocktail bars','uzbek','thai',
               'olive oil','french','food trucks','restaurants','sandwiches','waffles',
               'japanese curry','tapas bars','hot pot','halal','barbeque','bakeries',
               'indonesian','live/raw food','pub food','tapas/small plates','mexican',
               'cafes','bangladeshi','hungarian','iberian','coffee & tea','creperies','food stands',
               'catalan','buffets','pizza','fish & chips','meaderies','peruvian','acai bowls',
               'internet cafes','noodles','trinidadian','popcorn shops','burmese','chicken shop','soul food',
               'drive-thru bars','dinner theater','himalayan/nepalese','chimney cakes','macarons','cambodian',
               'dominican','fast food','tuscan','soba','hookah bars','beer tours','fruits & veggies',
               'beer gardens','piano bars','laotian','irish','pubs','personal chefs',
               'mediterranean','wine tours','dance clubs','bartenders','udon','food delivery services',
               'nicaraguan','whiskey bars','chicken wings','syrian','dim sum','hong kong style cafe','tempura',
               'korean','puerto rican','canadian (new)','hot dogs','hotel bar','lebanese','latin american',
               'american (new)','juice bars & smoothies','beach bars','kombucha','afghan','calabrian','brazilian',
               'australian','argentine','guamanian','specialty food','wine & spirits',
               'ethiopian','british','mongolian','breakfast & brunch','wine bars','bistros','sushi bars',
               'shaved ice','shanghainese','beer bar','cheesesteaks','filipino','custom cakes',
               'do-it-yourself food','salad','modern european','armenian','colombian','beer garden','kosher',
               'bubble tea','czech','honey','polish','brasseries','herbs & spices','nightlife',
               'burgers','champagne bars','smokehouse','austrian','wraps','supper clubs','gastropubs','tonkatsu',
               'russian','donuts','ukrainian','ayurveda','empanadas','scottish','italian','eritrean','vegan',
               'falafel','pancakes','cuban','speakeasies','irish pub','german','new mexican cuisine','indian',
               'senegalese','persian/iranian','dumplings','sports bars','tex-mex','african','signature cuisine',
               'breweries','egyptian','south african','sri lankan','conveyor belt sushi','bagels',
               'pop-up restaurants','delicatessen','tacos','diners','hotels',
               'comfort food','ice cream & frozen yogurt','bar crawl','cideries','caribbean','venezuelan']

In [3]:
# Venues: category names that describe the kind of establishment
# (bars, cafes, bakeries, food trucks, ...) rather than the food served.
venues = ['tiki bars','cheese shops','farmers market','candy stores','wine tasting room','distilleries',
          'dive bars','cafeteria','brewpubs','steakhouses','tea rooms','pasta shops','food court','wineries',
          'bars','seafood markets','coffeeshops','beverage store','cocktail bars','food trucks','restaurants',
          'tapas bars','barbeque','bakeries','cafes','internet cafes','popcorn shops','drive-thru bars',
          'dinner theater','fast food','hookah bars','beer gardens','piano bars','pubs','dance clubs',
          'whiskey bars','hong kong style cafe','hotel bar','beach bars','wine bars','bistros','sushi bars',
          'beer bar','beer garden','brasseries','nightlife','champagne bars','smokehouse','supper clubs',
          'gastropubs','speakeasies','irish pub','sports bars','breweries','pop-up restaurants','delicatessen',
          'diners','hotels','cideries']

In [2]:
# Cuisines: category names that describe the style of food served
# (national, regional, or dietary styles) rather than the kind of establishment.
cuisines = ['american (traditional)','szechuan','swiss food','ethnic food','haitian','hawaiian','sicilian',
            'turkish','taiwanese','cantonese','izakaya','asian fusion','honduran','vietnamese','pan asian',
            'japanese','moroccan','salvadoran','greek', 'vegetarian','malaysian','oaxacan','pakistani',
            'scandinavian','arabian','southern','chinese','belgian','polynesian','hainan','uzbek','thai','french',
            'indonesian','mexican','bangladeshi','hungarian','iberian','catalan','peruvian','trinidadian',
            'burmese','himalayan/nepalese','cambodian','dominican','tuscan','laotian','irish','mediterranean', 
            'nicaraguan','syrian','korean','puerto rican','canadian (new)','lebanese','latin american',
            'american (new)','afghan','calabrian','brazilian','australian','argentine','guamanian','ethiopian',
            'british','mongolian','shanghainese','filipino','modern european','armenian','colombian','czech',
            'polish','austrian','russian','ukrainian','ayurveda','scottish','italian','eritrean','cuban','german',
            'new mexican cuisine','indian','senegalese','persian/iranian','tex-mex','african', 'egyptian',
            'south african','sri lankan','caribbean','venezuelan']