In [1]:
import pandas as pd
import re
import haversine

In [2]:
df = pd.read_csv('Dataset/airbnb.csv')
df.city.unique()

array(['Chicago', 'NYC', 'LA', 'SF', 'DC', 'Boston'], dtype=object)

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_wikipedia_table(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f'Failed to retrieve the page, status code: {response.status_code}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})

    if len(tables) == 0:
        print('No "wikitable" found on this page.')
        return None
    table = tables[0]
    headers = []
    for th in table.find_all('th'):
        headers.append(th.get_text(strip=True))
    rows = []
    for tr in table.find_all('tr')[1:]:
        cols = tr.find_all('td')
        row = [col.get_text(strip=True) for col in cols]
        if row: 
            rows.append(row)
    df = pd.DataFrame(rows, columns=headers)
    return df


In [4]:
regex = r'(\d{1,3}°\d{1,2}′\d{1,2}″[NS])\s+(\d{1,3}°\d{1,2}′\d{1,2}″[EW])'


In [5]:
boston = pd.read_csv('Dataset/landmarks/Boston.csv')
boston[['latitude', 'longitude']] = boston['Location'].str.extract(regex)
print(boston.shape)
boston = boston.dropna(subset=['latitude', 'longitude'])
print(boston.shape)
print(boston[['latitude', 'longitude']].head())

(57, 8)
(57, 8)
     latitude   longitude
0  42°21′36″N  71°03′56″W
1  42°21′25″N  71°03′59″W
2  42°17′52″N  71°07′22″W
3  42°20′57″N  71°05′25″W
4  42°21′30″N  71°03′58″W


In [6]:
chicago = pd.read_csv('Dataset/landmarks/Chicago.csv')
chicago[['latitude', 'longitude']] = chicago['Location'].str.extract(regex)
print(chicago.shape)
chicago = chicago.dropna(subset=['latitude', 'longitude'])
print(chicago.shape)
print(chicago[['latitude', 'longitude']].head())

(450, 10)
(132, 10)
     latitude   longitude
1  41°52′47″N  87°38′09″W
2  41°53′16″N  87°37′27″W
3  41°53′11″N  87°37′36″W
4  41°56′51″N  87°39′00″W
5  41°46′53″N  87°34′26″W


In [7]:
DC = pd.read_csv('Dataset/landmarks/DC.csv')
DC[['latitude', 'longitude']] = DC['Location'].str.extract(regex)
print(DC.shape)
DC = DC.dropna(subset=['latitude', 'longitude'])
print(DC.shape)
print(DC[['latitude', 'longitude']].head())

(76, 8)
(76, 8)
     latitude   longitude
0  38°54′05″N  77°02′46″W
1  38°54′39″N  77°02′07″W
2  38°54′12″N  77°01′28″W
3  38°53′58″N  77°02′19″W
4  38°54′39″N  77°02′53″W


In [8]:
LA = pd.read_csv('Dataset/landmarks/LA.csv')
LA[['latitude', 'longitude']] = LA['Location'].str.extract(regex)
print(LA.shape)
LA = LA.dropna(subset=['latitude', 'longitude'])
print(LA.shape)
print(LA[['latitude', 'longitude']].head())

(106, 8)
(105, 8)
     latitude    longitude
0  34°02′04″N  118°40′51″W
1  34°05′26″N  117°44′35″W
2  34°16′11″N  118°10′06″W
3  34°03′26″N  118°14′16″W
4  33°47′23″N  118°15′32″W


In [9]:
NYC = pd.read_csv('Dataset/landmarks/NYC.csv')
NYC[['latitude', 'longitude']] = NYC['Location'].str.extract(regex)
print(NYC.shape)
NYC = NYC.dropna(subset=['latitude', 'longitude'])
print(NYC.shape)
print(NYC[['latitude', 'longitude']].head())

(116, 9)
(116, 9)
     latitude   longitude
0  40°44′30″N  73°59′01″W
1  40°53′32″N  73°51′57″W
2  40°42′52″N  74°00′16″W
3  40°42′17″N  74°00′09″W
4  40°42′32″N  74°00′45″W


In [10]:
SF = pd.read_csv('Dataset/landmarks/SF.csv')
SF[['latitude', 'longitude']] = SF['Description'].str.extract(regex)
print(SF.shape)
SF = SF.dropna(subset=['latitude', 'longitude'])
print(SF.shape)
print(SF[['latitude', 'longitude']].head())

(309, 8)
(87, 8)
     latitude    longitude
0  37°45′51″N  122°25′36″W
1  37°47′34″N  122°24′21″W
2  37°47′36″N  122°24′06″W
3  37°47′08″N  122°24′13″W
4  37°47′56″N  122°24′28″W


In [11]:
def dms_to_decimal(dms):
    match = re.match(r'(\d+)°(\d+)′(\d+)″([NSWE])', dms)
    
    if match:
        degrees, minutes, seconds, direction = match.groups()
        decimal = int(degrees) + int(minutes)/60 + int(seconds)/3600
        if direction in ['S', 'W']:
            decimal = -decimal
        return decimal
    else:
        raise ValueError(f'Invalid DMS format: {dms}')

In [12]:
boston['latitude_decimal'] = boston['latitude'].apply(dms_to_decimal)
boston['longitude_decimal'] = boston['longitude'].apply(dms_to_decimal)

chicago['latitude_decimal'] = chicago['latitude'].apply(dms_to_decimal)
chicago['longitude_decimal'] = chicago['longitude'].apply(dms_to_decimal)

DC['latitude_decimal'] = DC['latitude'].apply(dms_to_decimal)
DC['longitude_decimal'] = DC['longitude'].apply(dms_to_decimal)

LA['latitude_decimal'] = LA['latitude'].apply(dms_to_decimal)
LA['longitude_decimal'] = LA['longitude'].apply(dms_to_decimal)

LA['latitude_decimal'] = LA['latitude'].apply(dms_to_decimal)
LA['longitude_decimal'] = LA['longitude'].apply(dms_to_decimal)

NYC['latitude_decimal'] = NYC['latitude'].apply(dms_to_decimal)
NYC['longitude_decimal'] = NYC['longitude'].apply(dms_to_decimal)

SF['latitude_decimal'] = SF['latitude'].apply(dms_to_decimal)
SF['longitude_decimal'] = SF['longitude'].apply(dms_to_decimal)

In [None]:
def average_distance_from_landmark(lat, lon, landmark_df: pd.DataFrame):
    distances = landmark_df.apply(lambda row: haversine((row['latitude_decimal'], row['longitude_decimal']), (lat, lon)), axis=1)
    return distances.mean()

In [24]:
df[df['city'] == 'Boston'][['longitude', 'latitude']]


Unnamed: 0,longitude,latitude
57,-71.094858,42.344638
77,-71.068762,42.344462
85,-71.097457,42.330617
93,-71.098365,42.344687
140,-71.114651,42.322172
...,...,...
4933,-71.058864,42.281319
4940,-71.069405,42.359137
4942,-71.065372,42.349997
4965,-71.102395,42.350122


In [27]:
df[df['city'] == 'Boston']['average_distance_to_landmarks'] = df[df['city'] == 'Boston'].apply(calculate_avg_distance, landmarks=boston, axis=1)

TypeError: 'module' object is not callable