# Universal imports

In [1]:
from google.colab import drive
import pandas as pd

In [2]:
# Import BERT-ready data from Gdrive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# other git stuff

In [9]:
cd /content/gdrive/MyDrive/TechLabs/{repository}

/content/gdrive/MyDrive/TechLabs/ss23-drop-in-to-berlin


In [None]:
!git branch KNN_classifier recommender

In [6]:
!git checkout KNN_classifier

M	keyword extraction.ipynb
Already on 'KNN_classifier'
Your branch is up to date with 'origin/KNN_classifier'.


In [10]:
!git remote set-url origin https://{git_token}@github.com/{username}/{repository}

In [11]:
!git branch

* [32mKNN_classifier[m
  main[m
  recommender[m


In [9]:
!git config --global user.email "phylanx@gmx.de"
!git config --global user.name "phylanxy"

In [13]:
!git add .

In [14]:
!git commit -m "adding UMAP"

[KNN_classifier 8d6e773] adding UMAP
 1 file changed, 1 insertion(+), 1 deletion(-)
 rewrite NN Classifier.ipynb (96%)


In [15]:
!git push

Enumerating objects: 5, done.
Counting objects:  20% (1/5)Counting objects:  40% (2/5)Counting objects:  60% (3/5)Counting objects:  80% (4/5)Counting objects: 100% (5/5)Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 15.12 KiB | 1.51 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas:   0% (0/1)[Kremote: Resolving deltas: 100% (1/1)[Kremote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/TechLabs-Berlin/ss23-drop-in-to-berlin
   4e7f604..8d6e773  KNN_classifier -> KNN_classifier


In [12]:
username = "TechLabs-Berlin"
repository = "ss23-drop-in-to-berlin"
git_token = ""

# Loading the data & data cleaning


In [16]:
df = pd.read_csv("/content/gdrive/MyDrive/Ironhack/Final_project/df_with_model_input_no_NaNs.csv")

In [17]:
# function to remove items specified in a list
def preprocess_input(string, lst_to_remove):
  for item in lst_to_remove:
    string = string.replace(item, "")
  if len(string.split()) < 4:
    string = None
  else:
    pass
  return string

# create a list to remove strings that don't carry meaning
remove_lst = ["'review0': ","'review1': ","'review2': ","'review3': ","'review4': ", "'editorial_summary':"]

In [18]:
# drop old indeces
df.drop(["Unnamed: 0", "Unnamed: 0.1"], inplace=True, axis=1)

In [19]:
# turn model input into the review text without the items specified in remove_lst
df["model_input"] = df["model_input"].apply(lambda x: preprocess_input(str(x), remove_lst) if x != None else "")

In [20]:
# drop NaNs from model_input col, otherwise embedding won't work later on
df.dropna(subset="model_input", inplace=True)

In [21]:
# convert everything into str, add empty string for anything that's not a string (keyBERT won't accept NaNs as input, but I want to preserve the order of the inputs, for correct assignemetn of the outputs to the corresponding rows)
# and then create a list of texts for keyBERT to process
BERTs = df["model_input"]

# use KNN on embeddings directly

In [None]:
!pip install sentence_transformers

In [61]:
from sentence_transformers import SentenceTransformer
import pickle

In [27]:
# build sentence embeddings based on all-MiniLM-L6-v2
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(df["model_input"].to_numpy())

In [62]:
#Store reviews & embeddings on disc
with open('embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': df["model_input"].to_numpy(), 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
embeddings.shape

(4273, 384)

In [49]:
# import stuff for NN algorithm
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [50]:
# create a user input test
user_input = "This is really one of the most delicious vietnamnese restaurants I've ever seen. It tastes exacly like when my mom used to cook. Especially the Pho was incredible. Would always come back"
user_embedding = model.encode(user_input).reshape(1,-1)

In [51]:
user_embedding.shape

(1, 384)

In [52]:
# build NearestNeighbors model that finds the 20 nearest restaurants to the user input
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(embeddings)
distances, indices = nbrs.kneighbors(user_embedding)

In [53]:
indices

array([[ 456, 3826, 1249, 3582,  163, 1262, 3907, 4014, 1084, 1291, 3165,
        1117, 3887,  878, 3964, 1789, 3621, 2701,  999, 1396]])

In [54]:
# flatten array to use as index in dataframe
indices = indices.flatten()

In [None]:
# retrieve IDs of the restaurants to be sent to the app
df.iloc[indices]

## implement UMAP reducer to decrease dimensionality
## (this is not working -- aborting the experiment)

In [24]:
!pip install umap-learn



In [25]:
import umap.umap_ as umap

In [41]:
# define partial test vector for the embeddings
embs_test = embeddings[:100]

# fit and transform data using UMAP
reducer = umap.UMAP(n_components=10) # define reducer
umap_embs = reducer.fit_transform(embs_test) # return lower dimensionality vector

In [42]:
# create a user input test
user_input = "This is really one of the most delicious vietnamnese restaurants I've ever seen. It tastes exacly like when my mom used to cook. Especially the Pho was incredible. Would always come back"
user_embedding = model.encode(user_input).reshape(1,-1)
user_embs_umap = reducer.fit_transform(user_embedding)

In [43]:
umap_embs.shape, user_embs_umap.shape

((100, 10), (1, 10))

In [44]:
embs_test.shape

(100, 384)

In [45]:
# build NearestNeighbors model that finds the 20 nearest restaurants to the user input
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(umap_embs)
distances, indices = nbrs.kneighbors(user_embs_umap)

In [46]:
distances, indices

(array([[17.99023126, 18.00834927, 18.01500212, 18.09331552, 18.14265953,
         18.16667835, 18.16842586, 18.19266265, 18.19817152, 18.25194706,
         18.25717969, 18.26244017, 18.27342963, 18.29504493, 18.29722028,
         18.30400719, 18.32640679, 18.48377698, 18.51090626, 18.53478569]]),
 array([[57, 36, 88, 78, 79, 73, 65, 45, 75, 86, 54, 31, 26, 62, 13, 93,
         66, 21, 49, 33]]))

In [47]:
df.iloc[indices.flatten()]

Unnamed: 0,Unnamed: 0.2,name,reference,geometry,formatted_address,price_level,rating,user_ratings_total,types,editorial_summary,...,wheelchair_accessible_entrance,website,international_phone_number,photos,vicinity,url,business_status,permanently_closed,plus_code,model_input
57,57,CaliBocca Restaurant & Wine Bar,ChIJ1Vrbe-RQqEcRnpasKjzVzTY,"{'location': {'lat': 52.5038995, 'lng': 13.316...","Schlüterstraße 30, 10629 Berlin, Germany",2.0,4.6,386.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Convivial bar/...",...,1.0,http://www.cali-bocca.com/,+49 30 72023301,"[{'height': 2563, 'html_attributions': ['<a hr...","Schlüterstraße 30, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'G838+HQ Berlin', 'global_co...",'pasta was great although pizza was not extrao...
36,36,Ristorante Portofino,ChIJJZExZt5QqEcReiy8E2SmySc,"{'location': {'lat': 52.50693949999999, 'lng':...","Kantstraße 63, 10627 Berlin, Germany",2.0,4.2,515.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Nautically the...",...,,,+49 30 3236053,"[{'height': 3024, 'html_attributions': ['<a hr...","Kantstraße 63, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'G843+QM Berlin', 'global_co...","'ordered pizza marinara and inferno, along wit..."
88,88,Standard Serious Pizza,ChIJB_FI3vxRqEcRheAWfledrdA,"{'location': {'lat': 52.532864, 'lng': 13.4086...","Templiner Str. 7, 10119 Berlin, Germany",2.0,4.3,2527.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Industrial-chi...",...,0.0,http://www.standard-berlin.de/,+49 30 48625614,"[{'height': 1462, 'html_attributions': ['<a hr...","Templiner Str. 7, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'GCM5+4F Berlin', 'global_co...",'standard serious pizza had been on our to do ...
78,78,Da Piada,ChIJrZ8OGuRRqEcRLGuMfehHnk0,"{'location': {'lat': 52.5278283, 'lng': 13.400...","Auguststraße 49A, 10119 Berlin, Germany",,4.0,147.0,"['restaurant', 'food', 'point_of_interest', 'e...",,...,,http://www.dapiada.de/,+49 30 54481948,"[{'height': 1936, 'html_attributions': ['<a hr...","Auguststraße 49A, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'GCH2+49 Berlin', 'global_co...","'a little family restaurant, cosy vibe. best c..."
79,79,Vino e libri,ChIJT9z_vvxRqEcRQvYDaqBLNS0,"{'location': {'lat': 52.5332028, 'lng': 13.406...","Choriner Str. 72, 10119 Berlin, Germany",2.0,4.5,232.0,"['restaurant', 'food', 'point_of_interest', 'e...",,...,0.0,https://vinoelibri.eatbu.com/,+49 30 44058471,"[{'height': 4032, 'html_attributions': ['<a hr...","Choriner Str. 72, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'GCM4+7P Berlin', 'global_co...",'amazing pizza and pasta! one of the best pizz...
73,73,Restaurant Via Nova II,ChIJD3C43txRqEcRosYHlSG8PSM,"{'location': {'lat': 52.51981, 'lng': 13.39222...","Universitätsstraße 2, 10117 Berlin, Germany",2.0,4.1,714.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Refined Medite...",...,1.0,https://www.vianova2.com/,+49 30 20214861,"[{'height': 1000, 'html_attributions': ['<a hr...","Universitätsstraße 2, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'G99R+WV Berlin', 'global_co...",'this restaurant is unfortunately only fake it...
65,65,Da Vinci,ChIJNX291dxRqEcRUIJ_TyFnZL0,"{'location': {'lat': 52.5201506, 'lng': 13.391...","Georgenstraße 192, 10117 Berlin, Germany",2.0,4.3,1350.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Antipasti, pas...",...,1.0,http://www.davinci-ristorante.de/,+49 30 20143143,"[{'height': 2250, 'html_attributions': ['<a hr...","Georgenstraße 192, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'G9CR+3G Berlin', 'global_co...",'one of the best place for original italian pi...
45,45,Tiamo Restaurant berlin,ChIJ1wLfeL5aqEcRlAtXMWA8gfE,"{'location': {'lat': 52.4467753, 'lng': 13.343...","Albrechtstr. 69, 12167 Berlin, Germany",1.0,3.9,718.0,"['restaurant', 'food', 'point_of_interest', 'e...",,...,,http://www.restaurant-tiamo.de/,+49 30 79746300,"[{'height': 2160, 'html_attributions': ['<a hr...","Albrechtstr. 69, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'C8WV+P7 Berlin', 'global_co...",'- very friendly and helpful staff- made me an...
75,75,Viale dei Tigli,ChIJ6wFH3MVRqEcRgOTx5SIY3Ys,"{'location': {'lat': 52.5148883, 'lng': 13.381...","Wilhelmstraße 75, 10117 Berlin, Germany",2.0,3.9,872.0,"['restaurant', 'food', 'point_of_interest', 'e...","{'language': 'en', 'overview': 'Long-standing,...",...,1.0,http://www.ristorante-viale.de/,+49 30 2297405,"[{'height': 3024, 'html_attributions': ['<a hr...","Wilhelmstraße 75, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'G97J+XF Berlin', 'global_co...",'very good place for italian food! pizza and p...
86,86,Alimentari,ChIJyRESH_tRqEcRcTrX03kcwG8,"{'location': {'lat': 52.5339432, 'lng': 13.401...","Fehrbelliner Str. 54, 10119 Berlin, Germany",,4.6,163.0,"['store', 'restaurant', 'food', 'point_of_inte...",,...,,,+49 30 53604815,"[{'height': 4032, 'html_attributions': ['<a hr...","Fehrbelliner Str. 54, Berlin",https://www.google.com/maps/place/?q=place_id:...,OPERATIONAL,,"{'compound_code': 'GCM2+HQ Berlin', 'global_co...","'really good food for a reasonable price, we t..."


# code snippets

In [None]:
#Load sentences & embeddings from disc
with open('embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']