In [1]:
import pandas as pd

metadata = pd.read_csv('gmc.csv', low_memory=False)

metadata.head(5)

Unnamed: 0,id,Name,Full_Address,Website,Plus_Code,Rating,Reviews,URL,Unnamed: 8
0,12,Buddies Burger,"Budapest, Magyar u. 52, 1053",buddies.hu,F3R6+5C Budapest,4.7,676,https://www.google.com/maps/place/Buddies+Burg...,
1,11,Borssó Bistro,"Budapest, Királyi Pál u. 14, 1053",borsso.hu,F3Q5+VR Budapest,4.7,790,https://www.google.com/maps/place/Borss%C3%B3+...,
2,10,Paprika Jancsi Restaurant,"Budapest, Ráday u. 16, 1092",paprikajancsietterem.hu,F3Q7+36 Budapest,4.2,1607,https://www.google.com/maps/place/Paprika+Janc...,
3,9,WokZilla,"Budapest, Ráday u. 33a, 1092",wokzilla.hu,F3P7+3R Budapest,4.2,762,https://www.google.com/maps/place/WokZilla/@47...,
4,8,Tifliso restaurant/ Tifliszo grúz étterem,"Budapest, Ráday u. 11-13, 1092",tifliszo.hu,F3P7+W5 Budapest,4.6,107,https://www.google.com/maps/place/Tifliso+rest...,


In [2]:
# Calculate mean of rating column
C = metadata['Rating'].mean()
print(C)

4.425


In [3]:
metadata = metadata.replace({",":""}, regex=True)

In [4]:
metadata["Reviews"] = pd.to_numeric(metadata["Reviews"], downcast="float")
#metadata["Reviews"] = metadata["Reviews"].astype(float)

In [5]:
#metadata.head(5)

In [6]:
# Calculate the minimum number of reviews required to be in the chart, m
m = metadata['Reviews'].quantile(0.90)
print(m)

1544.9000000000003


In [7]:
# Filter out all qualified restaurants into a new DataFrame
q_restaurants = metadata.copy().loc[metadata['Reviews'] >= m]
q_restaurants.shape

(2, 9)

In [8]:
metadata.shape

(12, 9)

In [9]:
# Function that computes the weighted rating of each restaurant
def weighted_rating(x, m=m, C=C):
    v = x['Reviews']
    R = x['Rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [10]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_restaurants['score'] = q_restaurants.apply(weighted_rating, axis=1)

In [11]:
#Sort restaurants based on score calculated above
q_restaurants = q_restaurants.sort_values('score', ascending=False)

#Print the top 2 restaurants
q_restaurants[['Name', 'Reviews', 'Rating', 'score']].head(2)

Unnamed: 0,Name,Reviews,Rating,score
10,Manga Cowboy!,1927.0,4.4,4.411124
2,Paprika Jancsi Restaurant,1607.0,4.2,4.310283


In [12]:
metadata['Full_Address'].head()

0         Budapest Magyar u. 52 1053
1    Budapest Királyi Pál u. 14 1053
2          Budapest Ráday u. 16 1092
3         Budapest Ráday u. 33a 1092
4       Budapest Ráday u. 11-13 1092
Name: Full_Address, dtype: object

In [13]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['Full_Address'] = metadata['Full_Address'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['Full_Address'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(12, 21)