In [1]:
# import clean df in here
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

clean_df = pd.read_csv('/content/drive/MyDrive/Academics/Chatbots & Reco/Workspaces Project/workspaces_clean.csv', index_col=0)

Mounted at /content/drive


In [2]:
clean_df

Unnamed: 0,Workspace_Id,Name,Rating,Review_count,Price_range,Category,Address,Latitude,Longitude,Next_status
0,0,Eugenio Trias Municipal Public Library,3.8,800,0,Public library,"P.º de Fernán Núñez, 24",40.416705,-3.679161,Opens 8:30 AM Mon
1,1,Iván de Vargas Library,4.3,313,0,Public library,"C. de San Justo, 5",40.413991,-3.709750,Opens 8:30 AM Mon
2,2,Biblioteca Mario Vargas Llosa,3.8,178,0,Public library,"C. de Barceló, 4",40.426713,-3.699394,Opens 8:30 AM Mon
3,3,Pedro Salinas Library,4.0,337,0,Public library,"Gta. de la Prta de Toledo, 1",40.407074,-3.710894,Opens 9 AM Mon
4,4,Acuna Public Library,2.9,118,0,Public library,"C. de Quintana, 9",40.427932,-3.716937,Opens 9 AM Mon
...,...,...,...,...,...,...,...,...,...,...
259,259,Harina,3.9,434,2,Coffee shop,"C. de Velázquez, 61",40.429262,-3.684050,Closes 9 PM
260,260,The Coffee Corner,4.3,314,1,Coffee shop,"Av. de Valladolid, 41",40.428630,-3.729667,Closes 9 PM
261,261,The Bear and the Madroño,4.4,590,1,Espresso bar,"C. del Doce de Octubre, 16",40.415687,-3.675956,Closes 10:30 PM
262,262,Cafés Pozo,4.6,52,0,Coffee store,"C. de Miguel Arredondo, 4",40.394994,-3.695993,Closes 2 PMReopens 5 PM


First, we need to work with our category column so that we can get the cosine similarity from it. To do this, we can perform some one hot encoding on this variable so that it is binned in binary values. In other words, this will create a unique binary array for each category so that they can be identified numerically.

In [3]:
categoryList = []
for index, row in clean_df.iterrows():
    categories = row["Category"]
    
    for category in categories:
        if category not in categoryList:
            categoryList.append(category)

In [4]:
def binary(category_list):
    binaryList = []
    
    for category in categoryList:
        if category in category_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [5]:
clean_df['category_bin'] = clean_df['Category'].apply(lambda x: binary(x))
clean_df['category_bin'].head()

0    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
1    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
2    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
Name: category_bin, dtype: object

In [6]:
clean_df.head()

Unnamed: 0,Workspace_Id,Name,Rating,Review_count,Price_range,Category,Address,Latitude,Longitude,Next_status,category_bin
0,0,Eugenio Trias Municipal Public Library,3.8,800,0,Public library,"P.º de Fernán Núñez, 24",40.416705,-3.679161,Opens 8:30 AM Mon,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
1,1,Iván de Vargas Library,4.3,313,0,Public library,"C. de San Justo, 5",40.413991,-3.70975,Opens 8:30 AM Mon,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
2,2,Biblioteca Mario Vargas Llosa,3.8,178,0,Public library,"C. de Barceló, 4",40.426713,-3.699394,Opens 8:30 AM Mon,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
3,3,Pedro Salinas Library,4.0,337,0,Public library,"Gta. de la Prta de Toledo, 1",40.407074,-3.710894,Opens 9 AM Mon,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
4,4,Acuna Public Library,2.9,118,0,Public library,"C. de Quintana, 9",40.427932,-3.716937,Opens 9 AM Mon,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."


Now we create a function to get the cosine similaity based on any two given categories that are passed in.

In [7]:
from scipy import spatial

def similarity(workspaceId1, workspaceId2):
    a = clean_df.iloc[workspaceId1]
    b = clean_df.iloc[workspaceId2]
    
    categoryA = a['category_bin']
    categoryB = b['category_bin']
    
    categoryDistance = spatial.distance.cosine(categoryA, categoryB)
    
    return categoryDistance 


Now for the final step, which is implementing a score predictor. This will hinge upon the similarity function we just built to analyze the k-nearest neighbors. Here's how it works:

The first thing it does is ask the user to enter a location. This could be a previous location they particularly enjoyed. Then, we get the k-nearest neighbors based on the cosine similarity of each cateogry corresponding to each neighboring workspace. We then rank the list of recommendations based on their cosine similarity score and print the result which includes the name of the location as well as the star rating it has on Google Maps.

In [8]:
import operator

def predict_score():
    name = input('Enter a location: ')
    new_location = clean_df[clean_df['Name'].str.contains(name)].iloc[0].to_frame().T
    print('Selected Location: ',new_location.Name.values[0])

    def getNeighbors(baseLocation, K):
        distances = []
    
        for index, location in clean_df.iterrows():
            if location['Workspace_Id'] != baseLocation['Workspace_Id'].values[0]:
                dist = similarity(baseLocation['Workspace_Id'].values[0], location['Workspace_Id'])
                distances.append((location['Workspace_Id'], dist))
    
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
    
        for x in range(K):
            neighbors.append(distances[x])
        return neighbors
    
    K = 10
    neighbors = getNeighbors(new_location, K)
    print('\nRecommended Workspaces: \n')
    for neighbor in neighbors: 
        print( "\n\nIndex: ", clean_df.iloc[neighbor[0]][0], "\nName: ",str(clean_df.iloc[neighbor[0]][1]).strip('[]'), "\nRating: ", str(clean_df.iloc[neighbor[0]][2]) )
    

In [9]:
predict_score()

Enter a location: Harina
Selected Location:  Harina

Recommended Workspaces: 



Index:  97 
Name:  Randall Coffee Roasters 
Rating:  4.7


Index:  98 
Name:  Hola Coffee Fourquet 
Rating:  4.4


Index:  99 
Name:  Santa Kafeina 
Rating:  4.8


Index:  100 
Name:  Urbano Specialty Coffee 
Rating:  4.5


Index:  101 
Name:  East Crema Coffee - Hermosilla (Specialty Coffee Café de especialidad en Madrid) 
Rating:  4.6


Index:  102 
Name:  Bianchi Kiosko Caffé 
Rating:  4.7


Index:  105 
Name:  The Fix - Café de Especialidad 
Rating:  4.6


Index:  106 
Name:  Luso Coffee Shop | Madrid 
Rating:  4.4


Index:  173 
Name:  Randall Coffee Roasters 
Rating:  4.7


Index:  176 
Name:  Taruffi 
Rating:  4.1
