# Homework 3 - Places of the world

# 0 - Import libraries

As usual we import all the libraries that we may need in the notebook.

In [None]:
%%capture
import numpy as np
import pandas as pd 
import requests
import time
import csv
import os
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from multiprocessing import Pool
import json
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from functools import reduce

In [None]:
%%capture
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.parse.earleychart import FilteredCompleteFundamentalRule
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# libraries
import geoplot as gplt
import geopandas as gpd
import geoplot.crs as gcrs
import imageio
import pathlib
import matplotlib.pyplot as plt
import mapclassify as mc
# for the plotly function
import plotly.express as px
# for determining city and country based on lat,lon
from geopy.geocoders import Nominatim


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1 - Data collection

## 1.1 Get the list of places

In this section we write our code that, by means of BeautifulSoup, given the url of the main page of `atlasobscura` and an upper and lower limit of pages, collects all the page urls in the range.





In [None]:
# Collect the relative URL of every place card on the first 400 listing pages
# (sorted by number of likes).
list_url = []
# loop over the first 400 pages
for i in tqdm(range(1, 401)):
    # define the url per page
    url = 'https://www.atlasobscura.com/places?sort=likes_count&page='+str(i)
    r = requests.get(url)
    # Fail loudly on an HTTP error (e.g. 429 Too Many Requests): an error page
    # contains no place cards, so it would otherwise silently contribute
    # nothing and leave list_url shorter than expected.
    r.raise_for_status()
    soup = bs(r.content, 'lxml')
    # every place card is an <a> tag carrying exactly this class string
    links = soup.find_all('a', "content-card content-card-place")
    list_url.extend(item.get('href') for item in links)

100%|██████████| 400/400 [03:05<00:00,  2.15it/s]


Once we have a list containing all the URLs, we write them to a text file, so that later we only need to load this file instead of re-running the scraping code.

In [None]:
# Persist the collected URLs, one per line. The context manager guarantees the
# file is flushed and closed (the previous `doc.close` lacked the call
# parentheses, so the file was never explicitly closed).
with open("Places.txt", 'w', encoding="utf8") as doc:
    doc.write('\n'.join(list_url))

## 1.2 Crawl places

At first, we tried to scrape all 7200 pages directly one after the other. We noticed, however, that even though the function didn't raise any error, most of the retrieved HTML files were blank and only contained the message "Retry later". As it turned out, this was error 429 - Too Many Requests, so we slightly adjusted the code to declare ourselves as a bot in the HTTP request and to insert some pauses with sleep() between one set of requests and the next. This is the final version, which took some hours to complete overall.

Function that retrieves the HTML for all 18 places of a page, given the page number:

In [None]:
#The function parameter is the number of the page we're scraping (each page contains 18 places).
#Later we will call this function on all 400 pages, 18 places each, for a total of 7200 files.

def get_html(n_page: int):
  """Download and save the HTML of the 18 places listed on one results page.

  Args:
    n_page: 1-based number of the listing page. Its places occupy lines
      (n_page-1)*18 .. n_page*18-1 of Places.txt.

  Side effects:
    Creates the folder .../HTML/page{n_page} (re-using it if it already
    exists) and writes one file per place, named by the global position of
    the place, e.g. .../page1/16.html.

  Raises:
    requests.HTTPError: if the server answers with an error status (e.g. 429),
      instead of silently storing the error body as if it were the page.
  """
  # - Define the directory path based on the page number
  # - Create the page folder given the path, ex. page175
  path = "/content/drive/MyDrive/hw3_aris/HTML/page{}".format(n_page)
  # exist_ok lets an interrupted run be resumed without a FileExistsError
  os.makedirs(path, exist_ok=True)

  #From the file we created in Question 1.1, retrieve the partial URLs of the 18 places of the current page
  with open("/content/drive/MyDrive/hw3_aris/Places.txt", 'r', encoding="utf8") as doc:
    contents = doc.readlines()[(n_page-1)*18:(n_page)*18]

  #Index of the first place of this page (places are numbered from 1 across all pages)
  first_position = 1+18*(n_page-1)

  #Append partial URLs to the standard prefix, in order to have the list of complete URLs for current 18 places
  list_url_page = ['https://www.atlasobscura.com' + str(end_url).rstrip() for end_url in contents]

  #For each place:
  # - Perform a HTTP request to get its HTML
  # - Create a file.html in the given path named by the position index, ex. .../page1/16.html
  # - Write the HTML content of a place in its file
  for position, url_page in enumerate(list_url_page, start=first_position):
    html = requests.get(url_page, headers = {'User-agent': 'Super Bot Pluto Zeronium AI'}) #declare ourselves as a bot
    html.raise_for_status()
    save = '{}/{}.html'.format(path, position)
    # the context manager closes the file even if the write fails
    with open(save, 'w', encoding="utf8") as writing:
      writing.write(html.text)

Below follows the code to run the previous function on all 400 pages. We noticed that 16 pages were about the maximum amount that the server could handle before failing to respond, so we decided to scrape 16 pages at a time, with a pause of 10 seconds between one page and the next (within the same set of 16) and 30 seconds between one iteration and the next (25 iterations in total).

In [None]:
#Import sleep to add pauses between iterations
from time import sleep

In [None]:
# - Create an index i that goes from 1 to 25 (400 / 16 = 25)
# - Create a list of indexes that go from 1 to 400, that we'll scan with a step of 16
# - For each value of i, run get_html on the 16 pages of the i-th iteration

# The 400 listing pages are handled in 25 consecutive batches of 16 pages.
indexes = list(range(1, 401))

for i in range(25):
  batch = indexes[16*i:16*(i+1)]
  for page in tqdm(batch):
      get_html(page)
      sleep(10)   # pause between two pages of the same batch

  sleep(30)       # longer pause before the next batch starts

100%|██████████| 16/16 [03:32<00:00, 13.26s/it]
100%|██████████| 16/16 [03:25<00:00, 12.84s/it]
100%|██████████| 16/16 [03:32<00:00, 13.29s/it]
100%|██████████| 16/16 [03:28<00:00, 13.04s/it]
100%|██████████| 16/16 [03:22<00:00, 12.66s/it]
100%|██████████| 16/16 [03:39<00:00, 13.69s/it]
100%|██████████| 16/16 [03:36<00:00, 13.54s/it]
100%|██████████| 16/16 [03:31<00:00, 13.24s/it]
100%|██████████| 16/16 [03:29<00:00, 13.11s/it]
100%|██████████| 16/16 [03:32<00:00, 13.30s/it]
100%|██████████| 16/16 [03:24<00:00, 12.76s/it]
100%|██████████| 16/16 [03:26<00:00, 12.89s/it]
100%|██████████| 16/16 [03:33<00:00, 13.36s/it]
100%|██████████| 16/16 [03:25<00:00, 12.87s/it]
100%|██████████| 16/16 [03:21<00:00, 12.60s/it]
100%|██████████| 16/16 [03:31<00:00, 13.24s/it]
100%|██████████| 16/16 [03:28<00:00, 13.04s/it]
100%|██████████| 16/16 [03:29<00:00, 13.09s/it]
100%|██████████| 16/16 [03:38<00:00, 13.65s/it]
100%|██████████| 16/16 [03:30<00:00, 13.15s/it]
100%|██████████| 16/16 [03:25<00:00, 12.

## 1.3 Parse downloaded pages

First of all, we define some functions to extract specific elements from the HTML document of a single place:

In [None]:
#1. Place Name
def getname(s):
  """Return the first child node of the page's header title (<h1 class="DDPage__header-title">)."""
  title_tag = s.find('h1', {'class': 'DDPage__header-title'})
  return title_tag.contents[0]

#2. Place Tags
def gettags(s):
  """Return the whitespace-trimmed text of every tag link, in document order."""
  tag_links = s.find_all('a', {'class': 'itemTags__link js-item-tags-link'})
  return [link.text.strip() for link in tag_links]

#3. Number of people who have been there
def getnvisitors(s):
  """Return the first child of the first 'item-action-count' div (the "been here" counter)."""
  been_here = s.find('div', {'class': 'title-md item-action-count'})
  return been_here.contents[0]

#4. Numbers of people who want to visit the place
def getnpotentials(s):
  """Return the first child of the second 'item-action-count' div (the "want to visit" counter)."""
  counter = {'class': 'title-md item-action-count'}
  first_counter = s.find('div', counter)
  # the second counter is the next div with the same class after the first one
  second_counter = first_counter.find_next('div', counter)
  return second_counter.contents[0]

#5. Long Description
def getlongdesc(s):
  """Return the full text of the <div id="place-body"> element, stripped of surrounding whitespace."""
  body = s.find('div', {'id': 'place-body'})
  return body.text.strip()

#6. Short Description
def getshortdesc(s):
  """Return the first child of the header subtitle (<h3 class="DDPage__header-dek">), right-stripped only."""
  dek = s.find('h3', {'class': 'DDPage__header-dek'})
  first_child = dek.contents[0]
  return first_child.rstrip()

#7. Nearby Places
def getnearby(s):
  """Return the names of the nearby places shown in the side rail.

  Names are whitespace-trimmed, de-duplicated and returned in ascending
  order. Returns an empty list when the page lists no nearby places.

  The result holds plain ``str`` objects. The previous implementation went
  through ``np.unique`` and therefore returned ``np.str_`` items, whose
  ``str()``/``repr()`` inside a list depends on the installed numpy version
  and ends up verbatim in the .tsv files.
  """
  # find_all always returns a (possibly empty) list, so no None check is needed
  titles = s.find_all('div', {'class':'DDPageSiderailRecirc__item-title'})
  return sorted({title.text.strip() for title in titles})

#8. Address
def getaddress(s):
  """Return the postal address of the place as a single line, or None.

  The address is read from the first <div> following the
  <address class="DDPageSiderail__address"> tag. Children 0, 2 and 4 of that
  div are joined with single spaces and the last one is right-stripped.

  Returns None when either the <address> tag or the following <div> is
  missing. Previously a missing <address> tag raised AttributeError, because
  the None check only covered the result of find_next.

  NOTE(review): the fixed indices 0/2/4 assume three text rows separated by
  two other nodes (presumably <br>); a shorter address still raises
  IndexError - confirm against the real pages.
  """
  address_tag = s.find('address', {'class':'DDPageSiderail__address'})
  if address_tag is None:
    return None
  div_address = address_tag.find_next('div')
  if div_address is None:
    return None
  rows = div_address.contents
  return rows[0] + ' ' + rows[2] + ' ' + rows[4].rstrip()

#9. Latitude and Longitude of the place's location
def getcoords(s):
  """Return the (latitude, longitude) of the place as floats, or None if the page has no coordinates div.

  The coordinates are read from the text of the
  'DDPageSiderail__coordinates js-copy-coordinates' div, split on ', '.
  """
  node = s.find('div', {'class': 'DDPageSiderail__coordinates js-copy-coordinates'})
  if node is None:
    return None
  parts = node.text.strip().split(', ')
  lat, lon = float(parts[0]), float(parts[1])
  return lat, lon

def getcoords1(s):
  """Return the raw, whitespace-trimmed coordinates string, or None if the coordinates div is absent."""
  node = s.find('div', {'class': 'DDPageSiderail__coordinates js-copy-coordinates'})
  if node is None:
    return None
  return node.text.strip()

#10. The username of the post editors
def geteditors(s):
  """Return the whitespace-trimmed names of the post editors.

  If the page has a 'js-editor-list hidden' div, the names are the texts of
  its <span> children. Otherwise they are taken from the
  'DDPContributorsList__contributor' links, skipping the first one.
  """
  hidden_list = s.find('div', {'class': 'js-editor-list hidden'})
  if hidden_list is None:
    # fallback: contributor links, without the first entry
    name_nodes = s.find_all('a', {'class': 'DDPContributorsList__contributor'})[1:]
  else:
    name_nodes = hidden_list.find_all('span')
  return [node.text.strip() for node in name_nodes]

#11. Post publishing date
def getdate(s):
  """Return the trimmed text of the first 'DDPContributor__name' div, or None if it is absent.

  NOTE(review): callers store this value as the publishing date; confirm that
  this element really carries the date on the scraped pages.
  """
  node = s.find('div', {'class': 'DDPContributor__name'})
  return None if node is None else node.text.strip()

#12 The names of the lists that the place was included in
def getrelatedlist(s):
  """Return the card titles found in the third 'card-grid' section of the page.

  Returns an empty list when the page has fewer than three such sections.
  """
  grid_attrs = {'class': 'card-grid CardRecircSection__card-grid js-inject-gtm-data-in-child-links'}
  grid = s.find('div', grid_attrs)
  # hop forward twice: first grid -> second grid -> third grid
  for _ in range(2):
    if grid is None:
      return []
    grid = grid.find_next('div', grid_attrs)
  if grid is None:
    return []
  headings = grid.find_all('h3', {'class': 'Card__heading --content-card-v2-title js-title-content'})
  return [heading.text.strip() for heading in headings]

#13 The names of the related places
def getrelatedplaces(s):
  """Return the card titles found in the second 'card-grid' section of the page.

  Returns an empty list when the page has fewer than two such sections.
  """
  grid_attrs = {'class': 'card-grid CardRecircSection__card-grid js-inject-gtm-data-in-child-links'}
  first_grid = s.find('div', grid_attrs)
  if first_grid is None:
    return []
  second_grid = first_grid.find_next('div', grid_attrs)
  if second_grid is None:
    return []
  headings = second_grid.find_all('h3', {'class': 'Card__heading --content-card-v2-title js-title-content'})
  return [heading.text.strip() for heading in headings]

#14 URL
def geturl(s):
  """Return the href of the page's <link rel="canonical"> element."""
  canonical = s.find('link', {'rel': 'canonical'})
  return canonical.get('href')

#Add-on
def getlocation(s):
  """Return ``(city, country)`` parsed from the header location line.

  The text of the 'DDPage__header-place-location' div is split on commas; the
  first piece is returned as the city and the second as the country, both
  whitespace-trimmed.

  Not all places have both parts: when the text contains no comma the country
  is None (this used to raise IndexError), and when the div itself is missing
  both values are None (this used to raise AttributeError).
  """
  loc_tag = s.find('div', {'class': 'DDPage__header-place-location'})
  if loc_tag is None:
    return None, None
  parts = [part.strip() for part in loc_tag.text.split(',')]
  city = parts[0]
  country = parts[1] if len(parts) > 1 else None
  return city, country

def getlocation1(s):
    """Return the raw (untrimmed) text of the header location div, or None if the div is absent."""
    location_div = s.find('div', {'class': 'DDPage__header-place-location'})
    return location_div.text if location_div is not None else None


To create the files place_i.tsv we tried two different approaches:
1. **Version 1**: go brute-force scraping again all the webpages. For each place, retrieve all the elements of interest and instantly write the related file. It's a slower solution that addresses the task head-on. The execution running time took about 3 hours;
2. **Version 2**: from the downloaded HTML documents, extract all the elements of interest and store them in a single DataFrame object containing all the data. In this DataFrame, the i-th row corresponds to the i-th place and each column to an element of interest (name, tags, people that visited...). Once the DataFrame is complete, simply write each row in its own file. The DataFrame construction took about 30 mins and the writing function 1 min, so in the end it's a faster solution though it requires more processing.

### Version 1

We retrieve all the data of a single page and write the file.tsv. Steps:
  - Define a function `get_info` that retrieves all the info of a single place;
  - Define a function `write_tsv` that writes to a .tsv file the result of the previous one for a single *place*. This function is to be called on a single *page* containing 18 places and cycles on all of them;
  - Execute the second function on all 400 pages.

In [None]:
def get_info(soup):
  """Extract every element of interest from the parsed HTML of one place.

  Args:
    soup: the BeautifulSoup document of a single place page.

  Returns:
    A list of 15 values in this fixed order: name, tags, number of visitors,
    number of people who want to visit, long description, short description,
    nearby places, address, latitude, longitude, editors, publishing date,
    related lists, related places, URL.
  """
  # getcoords() returns None when the page has no coordinates div; unpacking
  # that directly would raise TypeError, so fall back to two missing values.
  coords = getcoords(soup)
  placeLat, placeLong = coords if coords is not None else (None, None)

  #create a list of all the retrieved data
  list_data = [
      getname(soup),
      gettags(soup),
      getnvisitors(soup),
      getnpotentials(soup),
      getlongdesc(soup),
      getshortdesc(soup),
      getnearby(soup),
      getaddress(soup),
      placeLat,
      placeLong,
      geteditors(soup),
      getdate(soup),
      getrelatedlist(soup),
      getrelatedplaces(soup),
      geturl(soup),
  ]

  #return the list
  return list_data

In [None]:
def write_tsv_1(n_page):
  """Scrape the 18 places of one listing page and write one .tsv file per place.

  Args:
    n_page: 1-based number of the listing page. Its places occupy lines
      (n_page-1)*18 .. n_page*18-1 of /content/Places.txt.

  Side effects:
    Creates the folder tsv_1/page{n_page} (re-using it if it already exists)
    and writes tsv_1/page{n_page}/place_{position}.tsv for each place, where
    position is the global 1-based index of the place. Each file holds the
    str() of the values returned by get_info, joined by tabs.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  #create the directory page(j-th) under the folder tsv_1
  path = "tsv_1/page{}".format(n_page)
  # exist_ok lets an interrupted run be resumed without a FileExistsError
  os.makedirs(path, exist_ok=True)

  #get the partial URLs of the 18 places contained in the j-th page
  with open("/content/Places.txt", 'r', encoding="utf8") as doc:
    contents = doc.readlines()[(n_page-1)*18:(n_page)*18]

  #index of the first place of the j-th page
  first_position = 1 + 18 * (n_page - 1)

  #create a list of the complete URLs of the 18 places
  list_url_page = ['https://www.atlasobscura.com' + str(end_url).rstrip() for end_url in contents]

  #For each place:
  # - Get its HTML
  # - Extract elements of interest
  # - Save them in its own .tsv file inside its page folder
  for position, url_page in enumerate(list_url_page, start=first_position):
    p = requests.get(url_page)
    p.raise_for_status()
    # name the parser explicitly (same one used elsewhere in the notebook)
    # instead of letting BeautifulSoup guess and emit a warning
    soup = bs(p.text, 'lxml')
    result = get_info(soup)
    file_name = "place_" + str(position)
    save = '{}/{}.tsv'.format(path, file_name)
    # the context manager closes the file; the old `writing.close` was never called
    with open(save, 'w', encoding="utf8") as writing:
      writing.write('\t'.join(map(str, result)))

Execute on all 400 pages:

In [None]:
for page in tqdm(range(1, 401)):
    write_tsv_1(page)

100%|██████████| 400/400 [3:20:13<00:00, 30.03s/it]


### Version 2

From the HTML previously downloaded, we retrieve all the elements for all the places, use them as columns for a DataFrame object we create (and save) and then store each row in its own separated .tsv file. Steps:
  - Iterate on the 400 page folders contained in the HTML directory;
  - For every 18 places, call a function `append_data` that appends all the relevant elements of the place to a dedicated list;
  - Create the DataFrame using these lists as columns;
  - Write each row of the DataFrame in its own .tsv file.

In [None]:
#Initialize empty lists
# One accumulator list per output column. Each name is bound to its own,
# independent empty list; append_data() pushes one entry onto every list for
# each parsed place.
(placeName, placeTags,
 numPeopleVisited, numPeopleWant,
 placeDesc, placeShortDesc,
 placeNearby, placeAddress,
 placeLat, placeLong,
 placeEditors, placePubDate,
 placeRelatedLists, placeRelatedPlaces,
 placeURL, placeLocation) = ([] for _ in range(16))

In [None]:
#Function that appends to the previous lists all the relative elements of a single place
def append_data(soup):
    """Append the elements of one place to the module-level column lists.

    Args:
        soup: the BeautifulSoup document of a single place page.

    All values are extracted first and only then appended, so an exception
    raised by one of the getters leaves every list untouched. Appending while
    extracting could leave the lists with different lengths, which would
    misalign the rows of the DataFrame built from them.
    """
    # getcoords() returns None when the coordinates div is missing; unpacking
    # that directly would raise TypeError.
    coords = getcoords(soup)
    l1, l2 = coords if coords is not None else (None, None)

    extracted = (
        (placeName, getname(soup)),
        (placeTags, gettags(soup)),
        (numPeopleVisited, getnvisitors(soup)),
        (numPeopleWant, getnpotentials(soup)),
        (placeDesc, getlongdesc(soup)),
        (placeShortDesc, getshortdesc(soup)),
        (placeNearby, getnearby(soup)),
        (placeAddress, getaddress(soup)),
        (placeLat, l1),
        (placeLong, l2),
        (placeEditors, geteditors(soup)),
        (placePubDate, getdate(soup)),
        (placeRelatedLists, getrelatedlist(soup)),
        (placeRelatedPlaces, getrelatedplaces(soup)),
        (placeURL, geturl(soup)),
        (placeLocation, getlocation1(soup)), #getlocation1 fix
    )

    for column, value in extracted:
        column.append(value)

After having defined some useful functions, we need to retrieve the names of the page folders we are going to work on:

In [None]:
#Retrieve all the folder names in HTML directory sorting them by number.
#This is needed to pass the right path to the function in the next code block.

pages = sorted(os.listdir('/content/drive/MyDrive/hw3_aris/HTML'), key = lambda page : int(page[4:]))

#pages = [page1, page2, ..., page400]

To avoid running out of RAM, we have to do the next phase in two steps. 

We split the set of pages in two halves: working separately on them we create two different DataFrames that in the end will be concatenated. Thus we'll obtain the complete dataset.

In [None]:
#Splitting the pages in halves
pages_1 = pages[:len(pages)//2]
pages_2 = pages[len(pages)//2:]

Process the ***first*** half:

In [None]:
#Append data of the first half pages to extract information
for page in tqdm(pages_1):
  # folder names look like "page<N>"; N is needed to compute the file names
  page_number = int(page[4:])
  for i in range(1, 19):
    # global 1-based position of the i-th place of this page
    html_path = "/content/drive/MyDrive/hw3_aris/HTML/{}/{}.html".format(page, i + (18 * (page_number - 1)))
    # 'r' and encoding are arguments of open(), not of str.format(): the
    # misplaced parenthesis silently dropped them. The context manager also
    # closes each of the 3600 files instead of leaving them open.
    with open(html_path, 'r', encoding="utf8") as f:
      soup = bs(f, 'lxml')
    append_data(soup)

100%|██████████| 200/200 [12:45<00:00,  3.83s/it]


In [None]:
#Create the first dataframe out of the elements just extracted
#(at this point the accumulator lists hold the places of the first half of the pages)

#Initialize columns with features' names
cols = ['placeName', 'placeTags', 'numPeopleVisited',
        'numPeopleWant', 'placeDesc', 'placeShortDesc',
        'placeNearby', 'placeAddress', 'placeLat', 'placeLong',
        'placeEditors', 'placePubDate', 'placeRelatedLists',
        'placeRelatedPlaces', 'placeURL', 'placeLocation']

#Initialize data types of every column
data_types = {'placeName':'object', 'placeTags':'object', 
         'numPeopleVisited':'int64', 'numPeopleWant':'int64', 
         'placeDesc':'object', 'placeShortDesc':'object',
        'placeNearby':'object', 'placeAddress':'object',
         'placeLat':'float64', 'placeLong':'float64', 'placeEditors':'object',
         'placePubDate':'object', 'placeRelatedLists':'object',
        'placeRelatedPlaces':'object', 'placeURL':'object',
         'placeLocation':'object'}

#Acquire dataframe with given columns and relative types
#(the frame has no rows yet: the cast is applied to empty columns)
# NOTE(review): the column assignments below presumably replace these dtypes
# with whatever pandas infers from each list (the two counters are scraped as
# text) - check data_places_1.dtypes if the int64 columns matter here.
data_places_1 = pd.DataFrame(columns = cols).astype(dtype = data_types) 

#Filling the dataframe columns with the values previously retrieved
#(one assignment per accumulator list; all lists are expected to have the same length)
data_places_1.placeName = placeName
data_places_1.placeTags = placeTags
data_places_1.numPeopleVisited = numPeopleVisited
data_places_1.numPeopleWant = numPeopleWant
data_places_1.placeDesc = placeDesc
data_places_1.placeShortDesc = placeShortDesc
data_places_1.placeNearby = placeNearby
data_places_1.placeAddress = placeAddress
data_places_1.placeLat = placeLat
data_places_1.placeLong = placeLong
data_places_1.placeEditors = placeEditors
data_places_1.placePubDate = placePubDate
data_places_1.placeRelatedLists = placeRelatedLists
data_places_1.placeRelatedPlaces = placeRelatedPlaces
data_places_1.placeURL = placeURL
data_places_1.placeLocation = placeLocation

Process the ***second*** half:

<ins>Note:</ins> Before appending additional data, we need to re-initialize all the auxiliary variables that we used as columns to empty lists. It is enough to run again the code block where they're declared, to avoid adding redundant code.

In [None]:
#Append data of the second half pages to extract information
for page in tqdm(pages_2):
  # folder names look like "page<N>"; N is needed to compute the file names
  page_number = int(page[4:])
  for i in range(1, 19):
    # global 1-based position of the i-th place of this page
    html_path = "/content/drive/MyDrive/hw3_aris/HTML/{}/{}.html".format(page, i + (18 * (page_number - 1)))
    # 'r' and encoding are arguments of open(), not of str.format(): the
    # misplaced parenthesis silently dropped them. The context manager also
    # closes each of the 3600 files instead of leaving them open.
    with open(html_path, 'r', encoding="utf8") as f:
      soup = bs(f, 'lxml')
    append_data(soup)

100%|██████████| 200/200 [25:02<00:00,  7.51s/it]


In [None]:
#Create the second dataframe out of the elements just extracted
#(requires that the accumulator lists were reset to empty and then refilled
# from the second half of the pages; otherwise they still contain the first half too)

#Initialize columns with features' names
cols = ['placeName', 'placeTags', 'numPeopleVisited',
        'numPeopleWant', 'placeDesc', 'placeShortDesc',
        'placeNearby', 'placeAddress', 'placeLat', 'placeLong',
        'placeEditors', 'placePubDate', 'placeRelatedLists',
        'placeRelatedPlaces', 'placeURL', 'placeLocation']

#Initialize data types of every column
data_types = {'placeName':'object', 'placeTags':'object', 
         'numPeopleVisited':'int64', 'numPeopleWant':'int64', 
         'placeDesc':'object', 'placeShortDesc':'object',
        'placeNearby':'object', 'placeAddress':'object',
         'placeLat':'float64', 'placeLong':'float64', 'placeEditors':'object',
         'placePubDate':'object', 'placeRelatedLists':'object',
        'placeRelatedPlaces':'object', 'placeURL':'object',
         'placeLocation':'object'}

#Acquire dataframe with given columns and relative types
#(the frame has no rows yet: the cast is applied to empty columns)
# NOTE(review): the column assignments below presumably replace these dtypes
# with whatever pandas infers from each list (the two counters are scraped as
# text) - check data_places_2.dtypes if the int64 columns matter here.
data_places_2 = pd.DataFrame(columns = cols).astype(dtype = data_types) 

#Filling the dataframe columns with the values previously retrieved
#(one assignment per accumulator list; all lists are expected to have the same length)
data_places_2.placeName = placeName
data_places_2.placeTags = placeTags
data_places_2.numPeopleVisited = numPeopleVisited
data_places_2.numPeopleWant = numPeopleWant
data_places_2.placeDesc = placeDesc
data_places_2.placeShortDesc = placeShortDesc
data_places_2.placeNearby = placeNearby
data_places_2.placeAddress = placeAddress
data_places_2.placeLat = placeLat
data_places_2.placeLong = placeLong
data_places_2.placeEditors = placeEditors
data_places_2.placePubDate = placePubDate
data_places_2.placeRelatedLists = placeRelatedLists
data_places_2.placeRelatedPlaces = placeRelatedPlaces
data_places_2.placeURL = placeURL
data_places_2.placeLocation = placeLocation

Check the dataframes' dimensionalities:

In [None]:
data_places_1.shape

(3600, 16)

In [None]:
data_places_2.shape

(3600, 16)

Combine the two DataFrames together:

In [None]:
data_places = pd.concat([data_places_1, data_places_2], ignore_index = True)

Save them in a file for later usage:

In [None]:
data_places.to_csv("/content/drive/MyDrive/hw3_aris/data_places.tsv", sep = '\t', index = False)

Finally, to create the .tsv files of all the places, simply write the rows of the DataFrame in the right place.

In [None]:
def write_tsv_2(n_page):
  """Write the 18 rows of data_places belonging to one listing page to separate .tsv files.

  Args:
    n_page: 1-based number of the listing page.

  Side effects:
    Creates the folder .../tsv_2/page{n_page} (re-using it if it already
    exists) and writes {position}.tsv for each of its 18 places. Each file
    contains the header row plus the single row of that place.
  """
  #create the directory page(j-th) under the folder tsv_2
  path = "/content/drive/MyDrive/hw3_aris/tsv_2/page{}".format(n_page)
  # exist_ok lets the cell be re-run without a FileExistsError
  os.makedirs(path, exist_ok=True)

  #For the k-th place out of 18 contained in this page:
  # - The k-th place position is: k + (18 * (n_page - 1)), and positions go from 1 to 7200
  # - The k-th place index in data_places is the position - 1, as data_places is indexed from 0 to 7199
  # - Extract the k-th place and write it in its own .tsv file
  first_position = 1 + 18 * (n_page - 1)
  for position in range(first_position, first_position + 18):
    # the double brackets keep the selection a one-row DataFrame, so the header is written too
    data_places.iloc[[position - 1]].to_csv("{}/{}.tsv".format(path, position), sep = '\t', index = False)

Execute on all 400 pages:

In [None]:
for p in tqdm(range(1, 401)):
    write_tsv_2(p)

100%|██████████| 400/400 [01:12<00:00,  5.51it/s]


----------------------------------------
\\
Once we have the file, to load the dataset run as usual:

In [None]:
#To load dataset from file
data_places = pd.read_csv("/content/drive/MyDrive/hw3_aris/data_places.tsv", sep = '\t')

# 2 - Search Engine

First, we must pre-process all the information collected for each place, so we define the following functions for:
1. Removing stopwords
2. Removing punctuation
3. Stemming
4. Tokenize


In [None]:
def stopwordRemove(Input:str)-> str:

  """Remove the English stopwords from the string passed as input.

  The text is tokenized with word_tokenize; tokens whose lowercase form is an
  English stopword are dropped and the remaining tokens are joined by spaces.

  Args:
    - Input (str): The string to be modified.

  Returns:
    - str : The string without the stopwords.
  """
  english_stopwords = set(stopwords.words('english'))
  kept_tokens = []
  for token in word_tokenize(Input):
    if token.lower() not in english_stopwords:
      kept_tokens.append(token)
  return " ".join(kept_tokens)

In [None]:
def punctuationRemove(Input:str)-> str:

  """Remove punctuation from the string passed as input.

  Every character of string.punctuation is deleted; the typographic
  apostrophe (’) is replaced by a space instead.

  Args:
    - Input (str): The string to be modified.

  Returns:
    - str : The string without the punctuation.
  """
  drop_ascii_punctuation = str.maketrans('', '', string.punctuation)
  return Input.translate(drop_ascii_punctuation).replace('’', ' ')

In [None]:
def stemming(Input:str)-> str:

  """Stem every word of the string passed as input.

  The text is tokenized with word_tokenize, each token is reduced with a
  PorterStemmer and the stems are joined by single spaces.

  Args:
    - Input (str): The string to be modified.

  Returns:
    - str : The string after the stemming operation.
  """
  stemmer = PorterStemmer()
  return " ".join(stemmer.stem(token) for token in word_tokenize(Input))

In [None]:
def getTokens(Input)->list:
  """Split the string passed as input into tokens using word_tokenize.

  Args:
    - Input (str): The string to be tokenized.

  Returns:
    - list : The list of tokens.
  """
  return word_tokenize(Input)

To make them more practical to use, we combine all the functions above into one. We provide two versions: one that does not include tokenization (`cleanText()`) and one that does (`allOperationsText()`).

In [None]:
def cleanText(Input)-> str:
  """Apply stopwordRemove(), punctuationRemove() and stemming(), in this order, to the input string.

  Args:
    - Input (str): The string to be modified.

  Returns:
    - str : The cleaned string.
  """
  return stemming(punctuationRemove(stopwordRemove(Input)))

In [None]:
def allOperationsText(Input)->list:
  """Clean the input string and tokenize the result.

  Runs the same pipeline as cleanText() - stopword removal, punctuation
  removal, stemming - followed by getTokens().

  Args:
    - Input (str): The string to be modified.

  Returns:
    - list : The list of words from the cleaned string.
  """
  return getTokens(cleanText(Input))

An example:

In [None]:
a = requests.get("https://www.atlasobscura.com/places/city-hall-station")
soup_a = bs(a.text)

In [None]:
t = getlongdesc(soup_a)
t

'The first New York City subway was built and operated by the Interborough Rapid Transit Company (IRT) and opened on October 27, 1904, to the joy of New York elevated train and streetcar riders.\nThe City Hall station on the IRT local track was lavished with fine architectural details, including glass tiles and large chandeliers. However, the Gustavino vaulted ceilings and skylights were lost on busy commuters, and the stop was one of the least-used in the system. It was the only station that did not have turnstiles installed by 1923, and the nearby Brooklyn Bridge stop was frequented by the express train and closer to connecting streetcars.\n\xa0\nBecause of the curved platform, cars with center doors could not be used at this station unless they had specially modified door controls which allowed just the end doors to be opened. In 1945, the station was closed when platforms along the line were being lengthened to accommodate longer trains, and the number of passengers using this stat

In [None]:
t_mod = cleanText(t)
t_mod

'first new york citi subway built oper interborough rapid transit compani irt open octob 27 1904 joy new york elev train streetcar rider citi hall station irt local track lavish fine architectur detail includ glass tile larg chandeli howev gustavino vault ceil skylight lost busi commut stop one leastus system station turnstil instal 1923 nearbi brooklyn bridg stop frequent express train closer connect streetcar curv platform car center door could use station unless special modifi door control allow end door open 1945 station close platform along line lengthen accommod longer train number passeng use station dwindl anoth factor lead declin station use fact person board train citi hall station intend destin citi hall brooklyn would wind uptown platform brooklyn bridg station would go upstair downtown platform continu journey much easier walk short distanc street level brooklyn bridg station late 1990 passeng lexington avenu local today 6 train disembark train brooklyn bridg stop longer c

In [None]:
tokens=allOperationsText(t)
tokens[0:10]

['first',
 'new',
 'york',
 'citi',
 'subway',
 'built',
 'oper',
 'interborough',
 'rapid',
 'transit']

## 2.1 Conjunctive query

In [None]:
#Mount google drive to load data
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 2.1.1 Create your index!

In [None]:
#First, we load our dataset as a Pandas DataFrame object:
df = pd.read_csv('/content/drive/MyDrive/hw3_aris/data_places.tsv', delimiter = '\t')

In [None]:
#Add a column with the description words cleaned and tokenized
df['list_words'] = df.placeDesc.apply(lambda row: allOperationsText(row))

Create a vocabulary `word_dict` to associate every word to a unique id:

In [None]:
#Import some useful libraries
from collections import Counter
from functools import reduce

In [None]:
#Create a dictionary to store all the words that appear in the tokenized descriptions with their frequency.
#It's just an auxiliary variable to help us create the real vocabulary, that is word_dict.
#The main purpose for now is to get rid of duplicates and we do it with some help from the Counter library.
#The counter is updated one document at a time: concatenating all the token lists with reduce(x + y)
#copies the growing list at every step (quadratic in the corpus size) and fails on an empty column.
vocabulary = Counter()
for doc_words in df.list_words:
  vocabulary.update(doc_words)

#Create the real vocabulary associating to each word an integer as an id.
#Ids start from 1 and follow the order in which the words first appear in the corpus.
word_dict = {word: item_id for item_id, word in enumerate(vocabulary, start=1)}

We saved the word dictionary just constructed to a file, to avoid recomputing it every time we need it. The file can be found in `./additionalcontent/word_dict.txt` and loaded when needed with the instructions:

In [None]:
# Read the saved vocabulary (a JSON object mapping word -> term id) back from disk.
with open('/content/drive/MyDrive/hw3_aris/additionalcontent/word_dict.txt', 'r') as f:
  data = f.read()
word_dict = json.loads(data)

Create the Inverted Index:

In [None]:
#To each word of the vocabulary, associate the list of indexes of the documents that contain that word.
#The index is built in a single pass over the documents: for every document we look at its distinct
#words and append the document index to the posting list of each of them. Documents are visited in
#dataframe order, so every posting list is ordered by document index.
#(Scanning the whole dataframe once per vocabulary word gives the same result, but costs
# |vocabulary| x |documents| list searches.)

#Every term id gets a (possibly empty) posting list, in vocabulary order
inverted_idx = {item_id: [] for item_id in word_dict.values()}

for doc_id, doc_words in df.list_words.items():
  for word in set(doc_words):          #each document is listed at most once per word
    item_id = word_dict.get(word)
    if item_id is not None:            #ignore words that are not in the vocabulary
      inverted_idx[item_id].append(doc_id)

To avoid recomputing the Inverted Index every time, we saved it to a file, which can be found in `./additionalcontent/inverted_idx.txt` and loaded when needed with the instructions:

In [None]:
# Read the saved inverted index (a JSON object mapping term id -> list of document indexes) back from disk.
with open('/content/drive/MyDrive/hw3_aris/additionalcontent/inverted_idx.txt', 'r') as f:
  data = f.read()
inverted_idx = json.loads(data)

The function json.dumps automatically casts the keys from integer to string when writing the file, so we cast them back to int:

In [None]:
inverted_idx = {int(k): v for k, v in inverted_idx.items()}

### 2.1.2 Execute the query

Let the user input a query string:

In [None]:
query = input()

new york museum


Retrieve all the documents of interest using Inverted Index:

In [None]:
#For every relevant word of the query:
# - Retrieve its term id in the vocabulary
# - Retrieve the set of documents containing each term using Inverted Index
# - Result is a list of sets of documents
#A word that is not in the vocabulary appears in no document: it contributes an empty set
#(so the conjunctive query returns nothing) instead of raising a KeyError.
results = []
for word in allOperationsText(query):
  item_id = word_dict.get(word)
  results.append(set(inverted_idx.get(item_id, ())))

Intermediate steps:

In [None]:
# Pre-process the query once and reuse the tokens for every line of the report.
query_terms = allOperationsText(query)

print("The processed query looks like:\n", query_terms)
print("\n")
for i, term in enumerate(query_terms):
 print("The word: \"", term, "\" is found in documents:\n", results[i])
 print("\n")

The processed query looks like:
 ['ancient', 'labyrinth']


The word: " ancient " is found in documents:
 {6144, 4102, 4104, 6152, 15, 4115, 2071, 6168, 6170, 4124, 31, 2079, 4133, 42, 2096, 49, 6195, 53, 6197, 4158, 67, 2125, 4175, 2134, 6235, 4190, 4191, 6242, 2147, 100, 106, 4205, 6263, 2170, 2172, 2173, 129, 130, 131, 2178, 6280, 4236, 6284, 4241, 2198, 6298, 2208, 164, 6310, 4275, 4277, 4279, 6327, 4281, 6332, 2240, 6336, 2247, 202, 4304, 2257, 6355, 213, 6358, 218, 2267, 4315, 224, 4321, 6371, 6379, 6388, 245, 2295, 2296, 4345, 6398, 2309, 4370, 2325, 2328, 282, 4378, 6427, 4381, 2335, 288, 289, 4386, 2339, 2346, 2355, 6456, 4410, 320, 321, 4416, 323, 2374, 328, 4425, 338, 2386, 2388, 4435, 2398, 4453, 6502, 2419, 2422, 4470, 4471, 4473, 379, 4476, 4477, 2435, 4483, 6531, 4486, 393, 4491, 407, 409, 410, 415, 2465, 2473, 6569, 4525, 6576, 6579, 4532, 445, 4541, 6589, 2499, 464, 6613, 6620, 2527, 480, 6623, 2542, 2544, 498, 500, 4596, 2553, 505, 2554, 2555, 4603, 2558, 6649, 512, 6

To realize a conjunctive query, take only the documents that contain <ins>all</ins> the words of the query:

In [None]:
#Take the intersection of all the sets of documents containing the query words
doc_idx = reduce(lambda x, y: x.intersection(y), results)

In [None]:
#Reorder by index the list of the documents containing all the query words
doc_idx = sorted(list(doc_idx))

Display the result:

In [None]:
#Show name, description and URL of those documents
df.iloc[doc_idx, [0, 5, 14]]

Unnamed: 0,placeName,placeShortDesc,placeURL
1576,Lichgate on High Road,This fairytale-inspired cottage was created by...,https://www.atlasobscura.com/places/lichgate-o...
1736,Jardins de Ca n'Altimira,Bask in the 19th century grandeur of an eccent...,https://www.atlasobscura.com/places/jardines-d...
2328,Raven's Grin Inn,Former hotel turned into haunted playground.,https://www.atlasobscura.com/places/raven-s-gr...
2335,Labyrinth at Audubon Park,The stone labyrinth in this New Orleans city p...,https://www.atlasobscura.com/places/labyrinth-...
3228,Mazzariello Labyrinth,Spiral labyrinth hidden in the hills above Oak...,https://www.atlasobscura.com/places/mazzariell...
3551,Knossos,"A minotaur, a labyrinth, and a dubious restora...",https://www.atlasobscura.com/places/knossos-crete
4435,Orvieto Underground,This picturesque Umbrian city has a subterrane...,https://www.atlasobscura.com/places/orvieto-un...
4471,Tower of Eben-Ezer,A Belgian self-built tower inspired by the Bib...,https://www.atlasobscura.com/places/tower-eben...
5429,Dover Castle,One of the few standing Roman lighthouses and ...,https://www.atlasobscura.com/places/dover-castle
6613,Labyrinthos Caves,Cretean quarry suggests real-life site for anc...,https://www.atlasobscura.com/places/labyrintho...


## 2.2 Conjunctive query & Ranking score

### 2.2.1 Tfidf and new Inverted index

**Tfidf**

We define a new Inverted Index that takes into account not only which documents contain the word, but also the tfidf score of each word with each document in which it appears.

In [None]:
#We use the sklearn library to actually compute the tfidf, in particular the TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(input = 'content', lowercase = False, tokenizer = lambda text: text, max_df = 0.5, min_df = 2)

In [None]:
result = tfidf.fit_transform(df.list_words)

In [None]:
result #result is a sparse matrix of tfidf scores (most are 0s)

<1164x9086 sparse matrix of type '<class 'numpy.float64'>'
	with 136386 stored elements in Compressed Sparse Row format>

In [None]:
result = result.todense() #convert the sparse matrix into a dense matrix

In [None]:
result = result.tolist() #convert the dense matrix into a nested list: one inner list of tfidf scores per row (nothing is flattened)

In [None]:
#Store as a dataframe the tfidf scores of all the words (the columns) with all the docs (the rows)
tfidf_df = pd.DataFrame(result, index = df.index, columns = tfidf.get_feature_names_out())

In [None]:
tfidf_df

Unnamed: 0,0,000,007,01,022,05,06,075,1,10,...,—i,—that,—the,—wa,—which,‘,…,…a,…and,…the
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.070784,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


These tfidf scores are invariant, so we save them to file:

In [None]:
tfidf_df.to_csv("./additionalcontent/tfidf.tsv", index = False, sep = '\t')

Then we can load it as usual:

In [None]:
tfidf_df = pd.read_csv("/content/drive/MyDrive/hw3_aris/additionalcontent/tfidf.tsv", sep = '\t')

**Inverted Index 2**

Now we can create the second Inverted Index based on the vocabulary of the TfidfVectorizer object and attaching the tfidf score.

In [None]:
inverted_idx_2 = {}

for word in list(tfidf.get_feature_names_out()):
  term_id = word_dict[word]
  #Fetch the tfidf column of this word once, not once per document of its posting list
  word_scores = tfidf_df[word]
  #For every document containing the word store the pair (document id, tfidf score)
  inverted_idx_2[term_id] = [(doc, word_scores.iloc[doc]) for doc in inverted_idx[term_id]]

Inverted Index 2 has also been saved for later use under /content/drive/MyDrive/hw3_aris/additionalcontent/inverted_idx_2.txt.

To load it from file:

In [None]:
#The context manager closes the file even if reading or parsing fails
with open('/content/drive/MyDrive/hw3_aris/additionalcontent/inverted_idx_2.txt', 'r') as f:
  inverted_idx_2 = json.load(f)

Same cast as before:

In [None]:
inverted_idx_2 = {int(k): v for k, v in inverted_idx_2.items()}

### 2.2.2 Execute the query

Let the user input a query:

In [None]:
query = input()

american museum


First of all, do the same step as before to retrieve the documents that contain the query words:

In [None]:
results = [set(inverted_idx[item]) for item in [word_dict[word] for word in allOperationsText(query)]]


This time, once I have the set of documents that contain one or more words of the query, I need the *union* of those sets: in fact, each of them will in the end be represented as a vector and if it contains just some of the query words it will simply have some 0 components.

In [None]:
doc_idx_union = reduce(lambda x, y: x.union(y), results)

Check how many documents we're dealing with:

In [None]:
len(doc_idx_union)

2081

Since the TfidfVectorizer returns a smaller vocabulary than the one we saved in `word_dict`, we need to check which and how many words of the query are actually available for tfidf ranking:

In [None]:
#Keep, in query order, only the processed query words that have a column in the tfidf table
found_words = [word for word in allOperationsText(query) if word in tfidf_df.columns.values]
found_words

['american', 'museum']

Show the matrix we're going to work on: each row corresponds to a document and represents a vector whose coordinates are the tfidf scores of the query words.

In [None]:
#Extracts the tfidf scores of the query words w.r.t the union of sets in which are contained
tfidf_df.iloc[sorted(list(doc_idx_union))][found_words]

Extract the previous DataFrame as an actual matrix:

In [None]:
vec_matrix = tfidf_df.iloc[sorted(list(doc_idx_union))][found_words].to_numpy()

We need to compute the tfidf of the query itself, as if it were an independent document:

In [None]:
vectorizer = TfidfVectorizer()
tfidf_query = vectorizer.fit_transform([query])

In [None]:
query_vec = tfidf_query.toarray().flatten()

In [None]:
query_vec

array([0.70710678, 0.70710678])

Define a function that computes the cosine similarity between two documents (passed as vectors):

In [None]:
def cos_similarity(q, d):
  """Return the cosine similarity between a query vector and a document vector.

  Definition:
  Cosine Similarity(Query, Document) = Dot product(Query, Document) / (||Query|| * ||Document||)
  where ||.|| is the Euclidean (L2) norm.

  Args:
    - q : The query vector (1-D sequence of numbers).
    - d : The document vector, with the same length as q.

  Returns:
    - The cosine similarity of q and d; 0.0 when at least one of the two vectors has zero norm.
  """
  denominator = np.linalg.norm(q, 2) * np.linalg.norm(d, 2)
  #An all-zero vector has no direction: dividing would give 0/0 (NaN plus a RuntimeWarning),
  #so we report "no similarity" instead
  if denominator == 0:
    return 0.0
  return np.dot(q, d) / denominator

Compute the similarity between the query and all the documents we're working on:

In [None]:
#One cosine similarity per row of vec_matrix, in the same order as the rows
similarity = [cos_similarity(query_vec, doc_vec) for doc_vec in vec_matrix]

In [None]:
len(similarity)

2081

Get the list of indices of all documents we're working on:

In [None]:
similar_places = sorted(list(doc_idx_union))

Create a Similarity column to append to DataFrame:

In [None]:
#One score per row of df (instead of a hard-coded row count); documents sharing no word with the query keep 0
Similarity = [0] * len(df)

#similarity[j] is the score of the document whose index is similar_places[j]
for i, score in zip(similar_places, similarity):
  Similarity[i] = score

df['Similarity'] = Similarity

We can already show the most similar results by simply sorting the dataset by similarity:

In [None]:
df.sort_values(by = ['Similarity'], ascending = False).head(5)

Unnamed: 0,placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placeNearby,placeAddress,placeLat,placeLong,placeEditors,placePubDate,placeRelatedLists,placeRelatedPlaces,placeURL,placeLocation,list_words,Similarity
5322,Smithsonian Sushi Collection,"['food museums', 'government', 'food', 'museum...",117,531,The American History Museum has collected an a...,Seemingly unremarkable items like empty sushi ...,"['Cher Ami', ""General Sheridan's Horse Rienzi ...","14th Streed and Constitution Avenue NW, Washin...",38.8907,-77.03,"['AF', 'thebodyinthelibrary', 'hrnick']","January 31, 2017",['The Ultimate List of Wonderfully Specific Mu...,"['Gurkenmuseum (Cucumber Museum)', 'Indian Riv...",https://www.atlasobscura.com/places/smithsonia...,"Washington, D.C.","[american, histori, museum, collect, assort, s...",0.999086
2419,"Basilica of Saint Lawrence, Asheville","['explore asheville', 'basilicas', 'domes', 'c...",429,880,The turn of the 20th century was a time of pro...,This rare basilica nestled right in downtown A...,"['Asheville Pinball Museum', 'Flat Iron Sculpt...","97 Haywood Street Asheville, North Carolina Un...",35.5974,-82.5563,"['PushingUpDaisies', 'mpadwee', 'greggkimbell']","August 5, 2018",[],"['Grote Kerk', 'Iglesia de la Compañía de Jesú...",https://www.atlasobscura.com/places/basilica-o...,"Asheville, North Carolina","[turn, 20th, centuri, time, prosper, ashevil, ...",0.994969
3691,Jesse James Home Museum,"['wild west', 'crime', 'crime and punishment',...",330,681,This simple Missouri home went down in America...,The house where the famed outlaw was shot by a...,"['Amelia Earhart Birthplace Museum', 'Glore Ps...","1202 Penn St St. Joseph, Missouri, 64503 Unite...",39.7558,-94.8451,"['hana', 'ichthus']","December 10, 2014",['History Tour: Legends of the Wild West'],"['Queensland Police Museum', 'KGB Museum', 'St...",https://www.atlasobscura.com/places/jesse-jame...,"St. Joseph, Missouri","[simpl, missouri, home, went, american, histor...",0.994969
4453,Dighton Rock,['cultures and civilizations'],137,603,Dighton Rock has been the subject of curiosity...,A coastal rock outcrop with mysterious inscrip...,"['Little Neck Cemetery', 'Lizzie Borden Bed an...","Bay View Ave. Berkley, Massachusetts, 02779 Un...",41.8082,-71.1022,"['Gerard Nolan', 'Kiri the Unicorn', 'AF']","April 8, 2012",[],"['Brooks River Archaeological District', 'Onho...",https://www.atlasobscura.com/places/dighton-rock,"Berkley, Massachusetts","[dighton, rock, subject, curios, wonder, longe...",0.994969
3254,V. C. Morris Gift Shop,"['lost wonders', 'outsider architecture', 'arc...",305,744,"This building, on Maiden Lane in downtown San ...",Early prototype for Frank Lloyd Wright's Gugge...,"['Book Club of California', 'Phelan Building',...","140 Maiden Lane San Francisco, California, 941...",37.7883,-122.4059,"['godziraaa', 'mauroferri', 'herrmannfan', 'Li...","June 26, 2011",[],"[""Rustic Canyon's Murphy Ranch"", 'Flintstones ...",https://www.atlasobscura.com/places/v-c-morris...,"San Francisco, California","[build, maiden, lane, downtown, san, francisco...",0.994969


**Return the top $k$ documents**

Let the user input $k$:

In [None]:
k = int(input())

5


For efficiency reasons, we keep the most similar documents in a Max-Heap structure.

In general, a Heap is a tree-based data structure that satisfies a particular sorting property: if A is a parent of B, then the values of A and B must be sorted with respect to that property. In particular, in a Max-Heap the key of each node is greater than the keys of its children and the key of the root node is the greatest of all, i.e. the maximum. A Heap can be represented as an array respecting the following rules:
- Root in position 0
- Left child of the node in position i: position (2*i)+1
- Right child of the node in position i: position (2*i)+2
- Parent of the node in position i: position ⌊(i-1)/2⌋

The great advantage of using a Heap is the low computational cost of its basic operations. For a Max-Heap with $n$ elements:

| Operation      | Computational cost |
| -------------- | ------------------ |
| Heapify        | $O(n)$             |
| Insert         | $O(\log n)$        |
| Delete         | $O(\log n)$        |
| Find max       | $\Theta(1)$        |

Implementation to keep the top k elements in this structure:

In [None]:
#Pair every similarity score with the index of its document: (similarity_i, index_i)
tuple_list = [(similarity[pos], similar_places[pos]) for pos in range(len(similar_places))]

In [None]:
#heapq only implements a Min-Heap, so to simulate a Max-Heap we take the opposite of each similarity value:
#the most similar document then has the smallest key and is the first one to be popped
tuple_inv = [(-1 * e[0], e[1]) for e in tuple_list]

Start Heap processing:

In [None]:
#Heapify -> now tuple_inv IS my heap!
heapq.heapify(tuple_inv)

Define a function that pops out of the heap the tuple with max (technically min, as we took it negative) similarity value for $k$ times:

In [None]:
def top_k_idx(h, k): #takes a heap as input
  """Pop the k smallest entries out of the heap h and return their document indices.

  The heap is expected to hold tuples (key, index); here the key is the negated similarity,
  so the smallest key corresponds to the most similar document.

  Args:
    - h (list) : A list already arranged as a heap (see heapq.heapify). It is consumed:
                 the popped entries are removed from it.
    - k (int) : How many entries to extract; if k exceeds the heap size the whole heap is returned.

  Returns:
    - final_docs_list (list) : The indices (second tuple element) of the popped entries, best first.
  """
  final_docs_list = []

  #Work on the heap received as argument (not on a global) and never pop more than it contains
  for _ in range(min(k, len(h))):
    final_docs_list.append(heapq.heappop(h)[1]) #heappop returns a tuple, save the index

  return final_docs_list

Finally, apply the function and get results:

In [None]:
final_l = top_k_idx(tuple_inv, k)

In [None]:
final_l

[5322, 2419, 3691, 4453, 59]

In [None]:
df.iloc[final_l, [0, 5, 14, 17]]

Unnamed: 0,placeName,placeShortDesc,placeURL,Similarity
5322,Smithsonian Sushi Collection,Seemingly unremarkable items like empty sushi ...,https://www.atlasobscura.com/places/smithsonia...,0.999086
2419,"Basilica of Saint Lawrence, Asheville",This rare basilica nestled right in downtown A...,https://www.atlasobscura.com/places/basilica-o...,0.994969
3691,Jesse James Home Museum,The house where the famed outlaw was shot by a...,https://www.atlasobscura.com/places/jesse-jame...,0.994969
4453,Dighton Rock,A coastal rock outcrop with mysterious inscrip...,https://www.atlasobscura.com/places/dighton-rock,0.994969
59,The Witch House of Salem,The only structure left with direct ties to th...,https://www.atlasobscura.com/places/witch-hous...,0.994969


In [None]:
#NOTE(review): this call looks out of place. At this point of the notebook the name `similarity` is bound
#to the list of scores built in section 2.2.2 (the function called similarity is only defined in section 3)
#and df already has a 'Similarity' column, so a top-to-bottom run presumably fails here - confirm and remove.
similarity(df,'city',inverted_idx,word_dict,tfidf_df)

# 3 - Define a new score!

We decided to build our metric to rank places by the means of 6 ingredients:


1.   **Similarity** (Cosine Similarity) ∈ [0,1]
2.   **nWant** (Number of people who want to visit the place, divided by the maximum) ∈ [0,1]
3.   **nVisited** (Number of people who visited the place, divided by the maximum) ∈ [0,1]
4.   **wInQuery** (Share of the query words that appear in the name of the place) ∈ [0,1]
5.   **wInLocation** (Number of query words that appear in the location, divided by the maximum) ∈ [0,1]
6.   **ntags** (Bonus based on the number of tags of the place) ∈ {0, 0.025, 0.05, 0.06, 0.08}

We combined them in the following way:

$$Score= \frac{5}{10}*Similarity + \frac{3}{10}*(0.7*wInQuery + 0.3*wInLocation)+\frac{1}{10}(nWant+nVisited)+ntags$$

First of all, note that in the formula we weight every term of the sum, except the one related to tags, so that the weighted part of the result falls between 0 and 1.
We gave the highest weight to similarity since it is our starting point; a large weight is also given to the case where the words of the query appear in the name or in the location of the place. Finally, we decided to give a weight of 0.10 each to the number of people who have been there and to the number of people who would like to go there.


The reason for not including the number of tags in the weighting is that, the way the function was constructed, this value can only take small values (at most 0.08). In our idea this number only serves as a tie-breaker: if, for a given query, two places have a very similar score in the other metrics of the formula, the number of tags decides which of the two appears first, since the more tags a place has, the more things there should be to visit/do.
In any case, to keep the result between 0 and 1 we have inserted an upper limit of 1.



## Functions

To arrive at the formula, we first defined some functions that would capture the needed values and create a column in the dataset with the corresponding values (`peopleWant()`,`peopleVisited()`,`wordsInQuery()`,`wordsInLocation()`,`countTags()` and `similarity()`).

In [None]:
def peopleWant(df: pd.DataFrame)-> pd.DataFrame:
  """This function adds the column 'nWant' to the dataframe passed as input: the number of people
     who want to see the place ('numPeopleWant') divided by the maximum of that column.

  The column is inserted in place at position 3, so the dataframe received is modified.
  When the maximum is 0 (or the dataframe has no rows) every value is set to 0 instead of
  dividing by zero.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must contain 'numPeopleWant' and
                          must not already contain 'nWant'.

  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  wanted = df['numPeopleWant']
  max_W = wanted.max()
  # nothing to normalise by: avoid x/0 (inf/NaN) and the failure on an empty column
  if len(wanted) == 0 or pd.isna(max_W) or max_W == 0:
    normWant = [0.0] * len(wanted)
  else:
    # vectorised division; to_numpy() keeps the values positional (no index alignment)
    normWant = (wanted / max_W).to_numpy()
  df.insert(3,'nWant', normWant)
  return   df

In [None]:
def peopleVisited(df: pd.DataFrame)-> pd.DataFrame:
  """This function adds the column 'nVisited' to the dataframe passed as input: the number of people
     who visited the place ('numPeopleVisited') divided by the maximum of that column.

  The column is inserted in place at position 3, so the dataframe received is modified.
  When the maximum is 0 (or the dataframe has no rows) every value is set to 0 instead of
  dividing by zero.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must contain 'numPeopleVisited' and
                          must not already contain 'nVisited'.

  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  visited = df['numPeopleVisited']
  max_V = visited.max()
  # nothing to normalise by: avoid x/0 (inf/NaN) and the failure on an empty column
  if len(visited) == 0 or pd.isna(max_V) or max_V == 0:
    normVisited = [0.0] * len(visited)
  else:
    # vectorised division; to_numpy() keeps the values positional (no index alignment)
    normVisited = (visited / max_V).to_numpy()
  df.insert(3,'nVisited', normVisited)
  return   df

In [None]:
def wordsInQuery(df: pd.DataFrame,query: str)-> pd.DataFrame:
  """This function adds the column 'wInQuery': for every place, the share of the processed
     query words that appear among the processed words of the place name.

  Steps:

  1. the query is processed once with allOperationsText().

  2. every place name is processed with allOperationsText(); a missing name (NaN) has no words.

  3. for every place we count how many query words are found in the name and divide the count
     by the number of words of the processed query (a word repeated in the query counts each time).

  4. the scores are inserted in place in the dataframe, at position 3.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must not already contain 'wInQuery'.
    - query (str) : The query the filters the dataset.
  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  query_tokens = allOperationsText(query)
  n = len(query_tokens)

  wordsinqueryPoints = []
  for name in df.placeName:
    # pandas stores a missing name as a float NaN: it cannot match any word
    name_tokens = [] if isinstance(name, float) else allOperationsText(name)
    hits = sum(1 for w in query_tokens if w in name_tokens)
    # a query with no usable word scores 0 everywhere instead of failing
    wordsinqueryPoints.append(hits / n if n else 0.0)

  df.insert(3,'wInQuery', wordsinqueryPoints)
  return df


In [None]:
def wordsInLocation(df: pd.DataFrame,query: str)-> pd.DataFrame:
  """This function adds the column 'wInLocation': for every place, how many processed query
     words appear among the processed words of its location, divided by the highest such count
     found in the dataframe (so the best matching locations score 1).

  Steps:

  1. the query is processed once with allOperationsText().

  2. every location is processed with allOperationsText(); a missing location (NaN) has no words.

  3. for every place we count how many query words are found in the location
     (a word repeated in the query counts each time).

  4. the counts are divided by their maximum; if no location contains any query word
     every score is 0 (no division by zero).

  5. the scores are inserted in place in the dataframe, at position 3.

  NOTE(review): wordsInQuery divides by the number of query words, while this function
  divides by the maximum count; the existing normalisation is kept here - confirm it is intended.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must not already contain 'wInLocation'.
    - query (str) : The query the filters the dataset.
  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  query_tokens = allOperationsText(query)

  wordsinloc = []
  for location in df.placeLocation:
    # pandas stores a missing location as a float NaN: it cannot match any word
    location_tokens = [] if isinstance(location, float) else allOperationsText(location)
    wordsinloc.append(sum(1 for w in query_tokens if w in location_tokens))

  # default=0 covers an empty dataframe; a maximum of 0 means that nothing matched
  max_hits = max(wordsinloc, default=0)
  wordsinlocPoints = [x / max_hits if max_hits else 0.0 for x in wordsinloc]

  df.insert(3,'wInLocation', wordsinlocPoints)
  return df


In [None]:
def countTags(df: pd.DataFrame)-> pd.DataFrame:
 """This function adds the column 'ntags', a small bonus that rewards the places with more tags:
    the more tags a place has, the higher the bonus.

    The tags of a place are read from 'placeTags', the text form of a list (e.g. "['a', 'b']"),
    and counted by splitting that text on ', '. The bonus is:

      - 1 or 2 tags   -> 0
      - 3 to 5 tags   -> 0.025
      - 6 to 8 tags   -> 0.05
      - 9 to 12 tags  -> 0.06
      - more than 12  -> 0.08

    A missing value (NaN) in 'placeTags' counts as no tags and gets 0.
    The column is inserted in place at position 3.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must not already contain 'ntags'.

  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
 """
 ok=[]
 for tags in df['placeTags']:
   # a missing cell is not a string and cannot be split
   if not isinstance(tags, str):
     ok.append(0)
     continue

   # note: the text "[]" still splits into one (empty) piece, which falls in the 0 bonus class
   n_tags = len(tags.strip('][').split(', '))

   if n_tags > 12:
     ok.append(0.08)
   elif n_tags >= 9:
     ok.append(0.06)
   elif n_tags >= 6:
     ok.append(0.05)
   elif n_tags >= 3:
     ok.append(0.025)
   else:
     ok.append(0)

 df.insert(3,'ntags', ok)
 return   df

In [None]:
def similarity(df, query, inverted_idx, word_dict, tfidf_df)-> pd.DataFrame:

  """This function adds to df the column 'Similarity': the cosine similarity between the query and
     every document that contains at least one word of the query (0 for all the other documents).

  The query is treated as an independent document: its tfidf vector is built on the same words
  (and in the same order) as the document vectors, so the two always have the same length.
  For a single document every idf is equal, hence the vector is just the L2-normalised count
  of each word in the processed query.

  The column is inserted in place at position 3.
  Assumes the document ids stored in inverted_idx are 0-based row positions valid for both
  df and tfidf_df - TODO confirm.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified. Must not already contain 'Similarity'.
    - query (str) : The query the filters the dataset
    - word_dict (dict) : The vocabulary that contains all the term id
    - inverted_idx (dict) : The set of documents containing each term; the keys may be the term ids
                            as strings (as loaded from JSON) or as integers
    - tfidf_df (pd.DataFrame) : The tfidf dataframe
  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  query_tokens = allOperationsText(query)

  # union of the documents containing at least one query word; unknown words match nothing
  doc_idx_union = set()
  for word in query_tokens:
    if word not in word_dict:
      continue
    term_id = word_dict[word]
    postings = inverted_idx.get(str(term_id))
    if postings is None:
      postings = inverted_idx.get(term_id, [])
    doc_idx_union.update(postings)
  similar_places = sorted(doc_idx_union)

  # query words available for ranking: each one once, in query order, only if it has a tfidf column
  found_words = [w for w in dict.fromkeys(query_tokens) if w in tfidf_df.columns]

  # one score per row of df (no hard-coded number of rows)
  scores = [0.0] * len(df)

  if similar_places and found_words:
    vec_matrix = tfidf_df.iloc[similar_places][found_words].to_numpy(dtype=float)

    query_vec = np.array([query_tokens.count(w) for w in found_words], dtype=float)
    query_vec /= np.linalg.norm(query_vec)

    # cosine = (d . q) / (||d|| * ||q||) with ||q|| = 1; a document whose query words are all
    # missing from the tfidf table is a zero vector and keeps similarity 0 (no 0/0)
    doc_norms = np.linalg.norm(vec_matrix, axis=1)
    dots = vec_matrix @ query_vec
    cosines = np.divide(dots, doc_norms, out=np.zeros_like(dots), where=doc_norms > 0)

    for place, cosine in zip(similar_places, cosines):
      scores[place] = float(cosine)

  df.insert(3,'Similarity', scores)
  return df

Then we defined the function `getMetrics()` to create the metrics by combining the values from the various columns.

In [None]:
def getMetrics(df: pd.DataFrame, query: str, inverted_idx:dict, word_dict:dict, tfidf_df:pd.core.frame.DataFrame)-> pd.DataFrame:
  """This function enriches the dataframe with all the ingredients of the new score, by calling
     similarity(), countTags(), peopleWant(), peopleVisited(), wordsInQuery() and wordsInLocation(),
     and then combines them in the column 'new_score':

       new_score = 0.5*Similarity + 0.3*(0.7*wInQuery + 0.3*wInLocation) + 0.1*(nWant + nVisited) + ntags

     The value is not capped here.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified.
    - query (str) : The query the filters the dataset
    - word_dict (dict) : The vocabulary that contains all the term id
    - inverted_idx (dict) : The set of documents containing each term
    - tfidf_df (pd.DataFrame) : The tfidf dataframe
  Returns:
    - df (pd.DataFrame) : The Dataframe modified.
  """
  # the helpers are applied in this fixed order
  df = similarity(df, query, inverted_idx, word_dict, tfidf_df)
  for add_column in (countTags, peopleWant, peopleVisited):
    df = add_column(df)
  for add_query_column in (wordsInQuery, wordsInLocation):
    df = add_query_column(df, query)

  text_relevance = (5/10) * df['Similarity']
  name_and_location = (3/10) * (0.7 * (df['wInQuery']) + 0.3 * (df['wInLocation']))
  popularity = (1/10) * (df['nWant'] + df['nVisited'])
  df['new_score'] = text_relevance + name_and_location + popularity + df['ntags']

  return df

Next `printByNewScore()` takes care of creating the dataset through the query and printing it sorted by the defined score.

In [None]:
def printByNewScore(df: pd.DataFrame, query: str, word_dict:dict, inverted_idx:dict, tfidf_df:pd.core.frame.DataFrame)-> pd.DataFrame:
  """This function takes as input a dataframe, uses the function getMetrics() to add the 'new_score'
     column (capped at 1), keeps only the documents that contain all the words of the query
     and returns them sorted by 'new_score', best first.

  The dataframe received is modified in place by getMetrics() (metric columns are added to it).
  Assumes the document ids stored in inverted_idx are 0-based row positions of df - TODO confirm.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified.
    - query (str) : The query the filters the dataset
    - word_dict (dict) : The vocabulary that contains all the term id
    - inverted_idx (dict) : The set of documents containing each term (keys are term ids as strings)
    - tfidf_df (pd.DataFrame) : The tfidf dataframe
  Returns:
    - df (pd.DataFrame) : The Dataframe filtered by the query and sorted by the column 'new_score'.
  """
  # conjunctive query: documents that contain every processed word of the query
  results = [set(inverted_idx[str(word_dict[word])]) for word in allOperationsText(query)]
  doc_idx = sorted(reduce(lambda x, y: x.intersection(y), results))

  df = getMetrics(df, query, inverted_idx, word_dict, tfidf_df)

  # in order not to have the value of 1 exceeded, cap the score itself
  # (not a separate column, which would leave 'new_score' uncapped)
  df['new_score'] = df['new_score'].clip(upper=1)

  # apply the query filter that was computed above, then rank what is left
  ranked = df.iloc[doc_idx].sort_values(by=['new_score'], ascending=False)

  return ranked[['placeName','placeDesc', 'placeURL', 'new_score', 'placeLat', 'placeLong', 'numPeopleVisited', 'placeAddress']]

Finally, for completeness, we built the `cleanDf()` function which allows us to clean the dataset from the columns passed in as input to start the next query without having to load the dataset again.

In [None]:
def cleanDf(df: pd.DataFrame, col1:str, col2:str, col3:str, col4:str, col5:str, col6:str, col7:str)-> pd.DataFrame:
  """Remove from the dataset, in place, all the columns requested as input.

  Every requested column that exists is removed, even when some of the other
  requested columns are missing. A message is printed if at least one of the
  requested columns was not found.

  Args:
    - df (pd.DataFrame) : The Dataframe to be modified.
    - col1,...,col7 (str) : The names of columns to remove

  Returns:
    - df (pd.DataFrame) : The same Dataframe object, cleaned.
  """
  missing = False
  for col in (col1, col2, col3, col4, col5, col6, col7):
    if col in df.columns:
      del df[col]
    else:
      # keep going: one absent column must not prevent deleting the others
      missing = True

  if missing:
    print("The columns dosen't exist or are already deleted")
  return df

## Results

We proposed three different comparisons between similarity and our new score.

First we have to load from our drive folder the data:

In [None]:
# Places dataset, stored as a tab-separated file
df = pd.read_csv('/content/drive/MyDrive/hw3_aris/data_places.tsv', sep='\t')

In [None]:
# TF-IDF table, read from the tab-separated tfidf.tsv file
tfidf_df = pd.read_csv("/content/drive/MyDrive/hw3_aris/additionalcontent/tfidf.tsv", sep = '\t')

In [None]:
# Inverted index, stored as JSON text; the context manager closes the file
# even if reading fails
with open('/content/drive/MyDrive/hw3_aris/additionalcontent/inverted_idx.txt', 'r') as f:
  data = f.read()
inverted_idx = json.loads(data)

In [None]:
# Vocabulary (word_dict), stored as JSON text; the context manager closes the
# file even if reading fails
with open('/content/drive/MyDrive/hw3_aris/additionalcontent/word_dict.txt', 'r') as f:
  data = f.read()
word_dict = json.loads(data)

### First comparison

In the first comparison we are challenging the search engine with a query that does not refer to any particular place, but only generically to museums in New York:

In [None]:
# Rank every place for the query using only the 'Similarity' column, best match first
g = similarity(df, 'New york museum', inverted_idx, word_dict, tfidf_df)
(
  g.sort_values(by='Similarity', ascending=False)
   .loc[:, ['placeName', 'placeDesc', 'placeURL', 'Similarity']]
)

Unnamed: 0,placeName,placeDesc,placeURL,Similarity
2220,The Edward Gorey House,When the prolific illustrator and author Edwar...,https://www.atlasobscura.com/places/edward-gor...,0.998514
3572,Museum at Eldridge Street,"Between 1881 and 1924, over 2.5 million Easter...",https://www.atlasobscura.com/places/museum-at-...,0.998514
2339,Paris Sewer Museum,“Paris has another Paris under herself; a Pari...,https://www.atlasobscura.com/places/paris-sewe...,0.990018
6210,Sweet Home Cafe,Thomas Downing was the oyster king. In 19th-ce...,https://www.atlasobscura.com/places/sweet-home...,0.990018
4331,The Troll Museum,In a sixth floor walk-up in New York City’s Lo...,https://www.atlasobscura.com/places/the-troll-...,0.990018
...,...,...,...,...
3187,Pygmy Forest Trail,California’s Van Damme State Park has a strang...,https://www.atlasobscura.com/places/pygmy-fore...,0.000000
3186,Temple Bar Memorial Dragon,"In the Temple Bar area of London, just outside...",https://www.atlasobscura.com/places/temple-bar...,0.000000
3185,Pekin Noodle Parlor,"According to the U.S. Census, Chinese communit...",https://www.atlasobscura.com/places/pekin-nood...,0.000000
3182,Mosquito Bay,"At night, an eerie ethereal light lights up Mo...",https://www.atlasobscura.com/places/mosquito-bay,0.000000


The results are not very good. Although the first 5 elements are all museums, only two of them are located in New York (Museum at Eldridge Street and The Troll Museum), so a user submitting this query would not find the results useful, even though the first 5 results all have a score higher than 0.99.


In [None]:
# Same query as above, this time ranked by new_score instead of Similarity
h = printByNewScore(df,'New york museum',word_dict,inverted_idx,tfidf_df)
h

Unnamed: 0,placeName,placeDesc,placeURL,new_score
253,New York Transit Museum,The New York Transit Museum is operated by the...,https://www.atlasobscura.com/places/new-york-t...,0.862065
6686,New York City Fire Museum,With people packed like sardines into Manhatta...,https://www.atlasobscura.com/places/new-york-c...,0.809766
1831,Panorama of the City of New York,"Constructed for the 1964 World’s Fair, the Pan...",https://www.atlasobscura.com/places/a-panorama...,0.790113
2854,New York City Police Museum,"Since 1857, the NYPD have been keeping the cit...",https://www.atlasobscura.com/places/new-york-c...,0.756524
0,City Hall Station,The first New York City subway was built and o...,https://www.atlasobscura.com/places/city-hall-...,0.744326
...,...,...,...,...
6383,Horizontal Falls,In what may be a unique phenomenon across the ...,https://www.atlasobscura.com/places/horizontal...,0.005723
7190,Casa Goofy International,"Since 1986, Casa Goofy has been a haven for tr...",https://www.atlasobscura.com/places/casa-goofy...,0.005649
7194,Taman Festival,"Eerie and overrun with lush greenery, this aba...",https://www.atlasobscura.com/places/taman-fest...,0.005572
6456,Mühlenplatz,"Spread across a bucolic German garden gallery,...",https://www.atlasobscura.com/places/muhlenplatz,0.005505


Referring to our score, we can see that all of the first results are museums and are located in New York, so it succeeds very well in its task.
Because our metric is built from many components, it is very difficult for it to reach a very high score (above 0.9), but what matters is the order of the results, and in this it succeeds well.

In [None]:
# Remove the columns added for this query so the next query starts from the original columns
df=cleanDf(df,"nWant","nVisited","ntags","Similarity","new_score","wInQuery","wInLocation")

### Second comparison

In the second comparison we use a query that refers to a particular place.

In [None]:
g=similarity(df, 'Shangri La Botanical Gardens & Nature Center', inverted_idx, word_dict, tfidf_df)
g=g.sort_values(by=['Similarity'], ascending=False)[['placeName','placeDesc', 'placeURL', 'Similarity']]
# 1-based rank of each place after sorting; the length comes from the frame
# itself so the cell keeps working if the number of places changes
positions=range(1, len(g)+1)
g.insert(1,'position',positions)
g

Unnamed: 0,placeName,position,placeDesc,placeURL,Similarity
3340,Estufa Fria,1,"At the start of the 20th century, what is now ...",https://www.atlasobscura.com/places/estufa-fri...,0.718216
4066,Braden Castle Ruins,2,"In the early 1840s, two brothers, Dr. Joseph A...",https://www.atlasobscura.com/places/braden-cas...,0.705405
1575,The Lodge at Hot Lake Springs,3,This turn-of-the-century resort attracted visi...,https://www.atlasobscura.com/places/the-lodge-...,0.705405
6350,Wat Pha Sorn Kaew (Temple on the Glass Cliff),4,Hidden away high up in the hills of north cent...,https://www.atlasobscura.com/places/wat-phar-s...,0.705405
1180,Habitat 67,5,"When you go to Montreal, there’s a strange bui...",https://www.atlasobscura.com/places/habitat-67,0.705405
...,...,...,...,...,...
3824,V&A Museum of Childhood,7196,Originally founded as the Bethnal Green Museum...,https://www.atlasobscura.com/places/v-and-a-ch...,0.000000
3823,Power Plant IM,7197,In a small neighborhood known as Monceau-sur-S...,https://www.atlasobscura.com/places/power-plan...,0.000000
3822,Ruins of the Chapman Beverley Mill,7198,This mill goes back–way back. Before the Revol...,https://www.atlasobscura.com/places/ruins-of-t...,0.000000
3821,The Hawks Nest,7199,New York State Route 97 traces the serpentine ...,https://www.atlasobscura.com/places/the-hawks-...,0.000000


In [None]:
# Rank and similarity of the place the query names explicitly
g.loc[g['placeName'] == 'Shangri La Botanical Gardens & Nature Center', ['position', 'Similarity']]

Unnamed: 0,position,Similarity
6043,15,0.691921


The similarity results are bad: the place in question does not appear in the top 5 results, and furthermore these all have a very high score.
The right place only appears in 15th position, with a score not too far from those of the top 5.

In [None]:
# Same query as above, this time ranked by new_score instead of Similarity
h= printByNewScore(df,'Shangri La Botanical Gardens & Nature Center',word_dict,inverted_idx,tfidf_df)
h

Unnamed: 0,placeName,placeDesc,placeURL,new_score
6043,Shangri La Botanical Gardens & Nature Center,“…I determined to gather together all things o...,https://www.atlasobscura.com/places/shangri-la...,0.555984
1635,Ethel M Botanical Cactus Garden,"Ethel M is Ethel Mars, the matriarch of the fa...",https://www.atlasobscura.com/places/ethel-m-bo...,0.489157
1575,The Lodge at Hot Lake Springs,This turn-of-the-century resort attracted visi...,https://www.atlasobscura.com/places/the-lodge-...,0.483552
1092,Los Angeles Zoo Botanical Gardens,Most zoos have some sort of native botanical l...,https://www.atlasobscura.com/places/los-angele...,0.479705
5267,Setenil De Las Bodegas,Setenil de Las Bodegas seamlessly marries man ...,https://www.atlasobscura.com/places/setenil-de...,0.469958
...,...,...,...,...
6708,Musee Robert Tatin,Robert Tatin was the third in a line of French...,https://www.atlasobscura.com/places/musee-robe...,0.005695
7190,Casa Goofy International,"Since 1986, Casa Goofy has been a haven for tr...",https://www.atlasobscura.com/places/casa-goofy...,0.005649
6995,Metropolitan Pit Stop,Metropolitan Pit Stop was founded by Jimmy Val...,https://www.atlasobscura.com/places/metropolit...,0.005609
7151,Draugasetrid,Draugasetrið is located in the small village o...,https://www.atlasobscura.com/places/draugasetrid,0.005544


With our metric, instead, the right place appears immediately in first position. Despite having a lower value than in the similarity case, our score accomplishes its task of ordering the results correctly.

In [None]:
# Remove the columns added for this query so the next query starts from the original columns
df=cleanDf(df,"nWant","nVisited","ntags","Similarity","new_score","wInQuery","wInLocation")

### Third comparison

In the last comparison we want to see how the engine behaves with a very simple query like 'beach' in which no particular place appears

In [None]:
# Rank every place for the query using only the 'Similarity' column, best match first
g = similarity(df, 'beach', inverted_idx, word_dict, tfidf_df)
(
  g.sort_values(by='Similarity', ascending=False)
   .loc[:, ['placeName', 'placeDesc', 'placeURL', 'Similarity']]
)

Unnamed: 0,placeName,placeDesc,placeURL,Similarity
1891,National World War II Museum,"Perhaps once thought too narrowly focused, thi...",https://www.atlasobscura.com/places/national-w...,1.0
6836,Playland Not At the Beach,"Housed in a former grocery store, a modest ext...",https://www.atlasobscura.com/places/playland-n...,1.0
6888,Gajumaru Treehouse Diner,"When the organic, happenstance beauty of natur...",https://www.atlasobscura.com/places/gajumaru-t...,1.0
6889,Styx Valley Forest,Near the Tasmanian Wilderness World Heritage S...,https://www.atlasobscura.com/places/styx-valle...,1.0
2149,Fort Matanzas National Monument,Florida was settled much earlier by Europeans ...,https://www.atlasobscura.com/places/fort-matan...,1.0
...,...,...,...,...
2426,Van Sant Crybaby Bridge,"As the legend goes, many years ago a young wom...",https://www.atlasobscura.com/places/van-sant-c...,0.0
2425,Antelope Island on the Great Salt Lake,Named for the species of Pronghorn Antelope th...,https://www.atlasobscura.com/places/antelope-i...,0.0
2424,Chocolate Waterfall,Located directly inside the front door of the ...,https://www.atlasobscura.com/places/chocolate-...,0.0
2423,Mount Greylock,Visitors to the mountain can either drive to t...,https://www.atlasobscura.com/places/mount-grey...,0.0


Since 'beach' is very generic, there is no particular place that should appear first; nevertheless, the similarity puts in first position a museum about the Second World War located in Louisiana, so it gets it wrong again.

In [None]:
# Same query as above, this time ranked by new_score instead of Similarity
h= printByNewScore(df,'beach',word_dict,inverted_idx,tfidf_df)
h

Unnamed: 0,placeName,placeDesc,placeURL,new_score
3369,Crescent Beach,"Hidden behind two regal viewing points, Cresce...",https://www.atlasobscura.com/places/crescent-b...,0.878856
5952,Bombay Beach,"In 2010, the United States Census Bureau measu...",https://www.atlasobscura.com/places/bombay-beach,0.843112
361,Victoria Beach's Pirate Tower,This enigmatic seaside tower on California’s V...,https://www.atlasobscura.com/places/victoria-b...,0.830159
374,Hidden Beach,A gaping hole in the surface of the lush green...,https://www.atlasobscura.com/places/hidden-bea...,0.787664
975,Driftwood Beach,Jekyll Island—Georgia state’s smallest barrier...,https://www.atlasobscura.com/places/driftwood-...,0.787650
...,...,...,...,...
7151,Draugasetrid,Draugasetrið is located in the small village o...,https://www.atlasobscura.com/places/draugasetrid,0.005544
6456,Mühlenplatz,"Spread across a bucolic German garden gallery,...",https://www.atlasobscura.com/places/muhlenplatz,0.005505
7024,Museum of Human Anatomy,"Tucked all over Italy, housed in universities,...",https://www.atlasobscura.com/places/museum-hum...,0.005475
6802,Kummakivi Balancing Rock,The Kummakivi (Finnish for “strange rock”) can...,https://www.atlasobscura.com/places/kummakivi-...,0.005472


In our case instead, all the first 5 results contain the word in the query

In [None]:
# Remove the columns added for this query so the next query starts from the original columns
df=cleanDf(df,"nWant","nVisited","ntags","Similarity","new_score","wInQuery","wInLocation")

Taking into account all the comparisons and what has already been said above, our score performed particularly well in the various proposed situations, so our additions on the metric calculation were important, especially the wInQuery result rank.

# 4 - Visualizing the most relevant places.
For this question, we visualize the top 30 places for the query "beach", ranked by the new score defined in question 3.

In [None]:
# rank all places for the query 'beach' by new_score, best match first
h = printByNewScore(df,'beach',word_dict,inverted_idx,tfidf_df)

In [None]:
# select first 30 rows of the dataset with query beach
# .copy() makes df_places an independent frame, so the renaming and the new
# columns added below do not write into a slice of h (SettingWithCopyWarning)
df_places = h.iloc[0:30].copy()
df_places

Unnamed: 0,placeName,placeDesc,placeURL,new_score,placeLat,placeLong,numPeopleVisited,placeAddress
3369,Crescent Beach,"Hidden behind two regal viewing points, Cresce...",https://www.atlasobscura.com/places/crescent-b...,0.878856,45.9134,-123.9695,693,"Cannon Beach, Oregon United States\n"
5952,Bombay Beach,"In 2010, the United States Census Bureau measu...",https://www.atlasobscura.com/places/bombay-beach,0.843112,33.3506,-115.7296,834,"Bombay Beach, California, 92257 United States\n"
361,Victoria Beach's Pirate Tower,This enigmatic seaside tower on California’s V...,https://www.atlasobscura.com/places/victoria-b...,0.830159,33.5215,-117.7645,335,"2713 Victoria Dr Laguna Beach, California, 926..."
374,Hidden Beach,A gaping hole in the surface of the lush green...,https://www.atlasobscura.com/places/hidden-bea...,0.787664,20.704,-105.5649,195,Islas Marietas Mexico\n
975,Driftwood Beach,Jekyll Island—Georgia state’s smallest barrier...,https://www.atlasobscura.com/places/driftwood-...,0.78765,31.1034,-81.404,765,"1198 Riverview Drive Jekyll Island, Georgia, 3..."
5735,Muscle Beach,"If you have ever watched TV, you have seen a c...",https://www.atlasobscura.com/places/muscle-bea...,0.784099,33.9854,-118.4727,2886,"1800 Ocean Front Walk Los Angeles, California,..."
1330,Rialto Beach Tree Graveyard,At the mouth of the Quillayute River in La Pus...,https://www.atlasobscura.com/places/tree-grave...,0.78304,47.9173,-124.6394,596,"Rialto Beach La Push, Washington, 98331 Unite..."
194,Glass Beach,"In the early 20th century, Fort Bragg resident...",https://www.atlasobscura.com/places/glass-beach,0.782368,39.453,-123.8136,1088,"Glass Beach Trail Fort Bragg, California, 9543..."
950,Clam Pass Beach,"On the coast of Naples, Florida, where a fores...",https://www.atlasobscura.com/places/clam-pass-...,0.780276,26.2173,-81.8176,256,"Seagate Dr and Crayton Road Naples, Florida Un..."
2874,Papakōlea Beach,About six miles from the South Point is Papakō...,https://www.atlasobscura.com/places/green-beac...,0.776163,18.9361,-155.6465,458,"93-1206 S Point Rd Naalehu, Hawaii, 96772 Unit..."


In [None]:
# adjust the names for visualization in the map of world
# rename() without inplace returns a new frame, which avoids modifying a slice
# of another DataFrame in place (SettingWithCopyWarning)
df_places = df_places.rename(columns={'numPeopleVisited': 'Number people visited', 'placeAddress': 'Address', 'placeLat' : 'lat', 'placeLong' : 'lon', 'new_score' : 'Similarity'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


The cells below extract the city and country from the lat and lon coordinates. The city and the country are added to the dataframe as separate columns, which will be used for the visualization.

In [None]:
# FUNCTIONS TO DETERMINE CITY AND COUNTRY BASED ON LAT, LON

# mapping function to map the new column
def determine_city(lon, lat):
  """Return the city of a coordinate pair via reverse geocoding.

  Uses the module-level `geolocator`, which must be defined before the call.

  Args:
    - lon : The longitude of the place.
    - lat : The latitude of the place.
  Returns:
    - (str) : The 'city' field of the geocoded address, or '' when the
      geocoder returns no result or the address has no 'city' field.
  """
  stringcoor = str(lat)+", " +str(lon)
  location = geolocator.reverse(stringcoor)
  # reverse() gives None when nothing is found for the coordinates
  if location is None:
    return ''
  return location.raw.get('address', {}).get('city', '')

  # mapping function to map the new column
def determine_country(lon, lat):
  """Return the country of a coordinate pair via reverse geocoding.

  Uses the module-level `geolocator`, which must be defined before the call.

  Args:
    - lon : The longitude of the place.
    - lat : The latitude of the place.
  Returns:
    - (str) : The 'country' field of the geocoded address, or '' when the
      geocoder returns no result or the address has no 'country' field.
  """
  stringcoor = str(lat)+", " +str(lon)
  location = geolocator.reverse(stringcoor)
  # reverse() gives None when nothing is found for the coordinates
  if location is None:
    return ''
  return location.raw.get('address', {}).get('country', '')

In [None]:
# initialize Nominatim API
geolocator = Nominatim(user_agent="geoapiExercises")

# assign() returns a new frame holding both new columns, so nothing is written
# into a slice of another DataFrame (this is what raised SettingWithCopyWarning)
df_places = df_places.assign(
  City=df_places.apply(lambda x: determine_city(lon = x['lon'], lat = x['lat']), axis=1),
  Country=df_places.apply(lambda x: determine_country(lon = x['lon'], lat = x['lat']), axis=1),
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


**Visualizing the places**

With the map below, the places of the search are visualized. It is possible to zoom in using the mouse. If the place does not show a city, it is because of the fact that the place is not located in a city, but somewhere outside a city (e.g. nature area).

In [None]:
# Website with functions
# https://plotly.com/python-api-reference/generated/plotly.express.scatter_geo.html


# For our visualization:
# color = similarity
# size = number people visited
# hover_name = name
# hover_data = [country, city, address]
# projection="robinson"


# plot df_places directly: rebinding the global `df` here would replace the
# full places dataset that the search cells above take as input
fig = px.scatter_geo(df_places, lon = 'lon', lat = 'lat', color="Similarity",
                     hover_name="placeName", size="Number people visited",
                     hover_data  = ['Country', 'City', 'Address'],
                     size_max = 18,   # sets maximum size of the points
                     projection="robinson", # type of map
                     title = ' Top 30 places based on the search. The colour indicates the similarity. The size indicates how many people have visited')

fig.update_traces(textposition="top center",
                  mode='markers')

fig.show()

# 6 - Command Line Question

**Data Preparation**

Before we start processing the dataset via the command line, we need to clean the main textual fields of newlines and other special characters that may affect the file format. So we create a copy of our dataframe with clean text in the fields Description and Short Description, and save it to a file. This is the file we are going to work on in the bash script.

In [None]:
# Load the places dataset (tab-separated) from the drive folder
df_to_clean = pd.read_csv('/content/drive/MyDrive/hw3_aris/data_places.tsv', sep='\t')

In [None]:
# Work on a copy so the loaded dataframe (df_to_clean) is left untouched
df_clean = df_to_clean.copy()

In [None]:
#Clean description and short description text:
# trim surrounding whitespace, then remove newlines and non-breaking spaces.
# The vectorised .str methods leave missing values (NaN) untouched, whereas
# calling str methods inside apply() raises on them.
df_clean.placeDesc = df_clean.placeDesc.str.strip().str.replace("\n", "", regex=False).str.replace("\xa0", "", regex=False)
df_clean.placeShortDesc = df_clean.placeShortDesc.str.strip().str.replace("\n", "", regex=False).str.replace("\xa0", "", regex=False)

In [None]:
#Save the clean dataframe
# NOTE: the file is tab-separated despite its .csv extension, and the path is
# relative to the current working directory
df_clean.to_csv("./additionalcontent/data_places_new.csv", sep = '\t', index = False)