## 1. Importing the requisite packages

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import geoplot as gplt
import contextily

In [None]:
# Setting the data path
# Replace the placeholder with the folder that holds the input files. Later
# cells build file names as `data_path + '<file name>'`, so the value must end
# with a path separator.
data_path = r'YOUR FILE PATH'

## 2. Creating Summary Spatial Features - One Dataset

In [None]:
# Load the store locations table from CSV
locs_pdf = pd.read_csv(data_path + 'OSM_DollarGeneralLocs.csv')

# Turn the X / Y columns into point geometries, tagged as EPSG:4326
locs_gdf = gpd.GeoDataFrame(
    locs_pdf,
    geometry=gpd.points_from_xy(locs_pdf["X"], locs_pdf["Y"]),
    crs="EPSG:4326",
)

# Promote the row index to a regular column and use it as a synthetic store ID
locs_gdf = locs_gdf.reset_index().rename(columns={'index': 'ID'})

In [None]:
# Buffer distances are expressed in CRS units, so move from the geographic CRS
# to a projected one before buffering.
# NOTE(review): EPSG:3005 is the BC Albers projection - confirm it is an
# acceptable choice for where these stores are located.
locs_gdf = locs_gdf.to_crs(3005)

# Radius of the aggregation area around each store, in miles and in meters
buffer_size_mi = 5
buffer_size_m = buffer_size_mi * 1609.344 # meters in a mile

# Work on a copy of the stores table that carries the buffer polygon as an
# extra column; the original point geometry column is left as it was
locs_gdf_buffer = locs_gdf.assign(buffer_5mi=locs_gdf.buffer(buffer_size_m))

# Preview the point and buffer columns side by side
locs_gdf_buffer[['ID','geometry','X','Y','buffer_5mi']].head()

In [None]:
# Spatially join the store points to the store buffers.
# The result has one row per (store, buffer) pair where the store point lies
# within the buffer polygon; the shared `ID` column comes back as `ID_left`
# (the store point) and `ID_right` (the buffer's store).
joined = gpd.sjoin(
    # Left table: the raw store locations (points)
    left_df=locs_gdf,
    # Right table: the buffers around the stores, with the buffer column set
    # as the active geometry
    right_df=locs_gdf_buffer.set_geometry("buffer_5mi")[["ID", "buffer_5mi"]],
    # Spatial predicate: keep pairs where the left geometry is within the right
    predicate="within",
)

In [None]:
# Per store point (`ID_left`), count the non-null values in every joined
# column; the `ID_right` count is the number of buffers containing that store
# (presumably including the store's own buffer).
store_count = joined.groupby("ID_left").count()

# Flatten to a plain two-column table: store ID and its count
store_count_df = (
    store_count
    .reset_index()
    .loc[:, ['ID_left', 'ID_right']]
    .rename(columns={'ID_left': 'ID', 'ID_right': 'Store_Count'})
)

store_count_df.head()

In [None]:
# Changing CRS to make mapping cleaner: keep only the ID and buffer polygon,
# make the buffer the active geometry, and move both layers to EPSG:4326
locs_gdf_buffer = locs_gdf_buffer.set_geometry("buffer_5mi")[["ID", "buffer_5mi"]]
locs_gdf_buffer = locs_gdf_buffer.to_crs(4326)
locs_gdf = locs_gdf.to_crs(4326)

# Store whose buffer is highlighted on the map
highlight_store_id = 45

# Set up figure and axis
f, ax = plt.subplots(1, figsize=(12, 12))

# Plot the buffer around the highlighted store in green
locs_gdf_buffer[locs_gdf_buffer['ID'] == highlight_store_id].plot(ax=ax, color="g")

# Plot all stores in red
locs_gdf.plot(ax=ax, color="r")

# Plot the highlighted store itself in blue, on top of the red points
locs_gdf[locs_gdf['ID'] == highlight_store_id].plot(ax=ax, color="b")

# Add a light basemap. The Stamen providers are no longer shipped with
# xyzservices (the provider registry behind `contextily.providers`), so a
# CartoDB basemap that needs no API key is used instead.
contextily.add_basemap(
    ax,
    crs=locs_gdf.crs.to_string(),
    source=contextily.providers.CartoDB.Positron,
)
# Remove axes
ax.set_axis_off()
# Display
plt.show()

## 3. Creating Summary Spatial Features - Two Datasets

In [None]:
# Load the competitor store locations table from CSV
c_pdf = pd.read_csv(data_path + 'OSM_FamilyDollarLocs.csv')

# Build point geometries from the X / Y columns (tagged EPSG:4326) and
# reproject straight away to the projected CRS used for the buffers
c_gdf = gpd.GeoDataFrame(
    c_pdf,
    geometry=gpd.points_from_xy(c_pdf["X"], c_pdf["Y"]),
    crs="EPSG:4326",
).to_crs(3005)

# Keep only rows whose geometry is both present and non-empty
c_gdf = c_gdf[~c_gdf['geometry'].is_empty & ~c_gdf['geometry'].isna()]

In [None]:
# Promote the row index to a regular column and use it as a synthetic ID
c_gdf = c_gdf.reset_index().rename(columns={'index': 'ID'})

# Download the Census TIGER 2021 state boundaries, keep the Ohio row and put
# it in the same projected CRS as the competitor stores
Ohio = (
    gpd.read_file("https://www2.census.gov/geo/tiger/TIGER2021/STATE/tl_2021_us_state.zip")
    .loc[lambda states: states['STUSPS'] == "OH"]
    .to_crs(3005)
)

# Restrict the competitor stores to those that intersect the Ohio boundary
c_gdf = gpd.overlay(c_gdf, Ohio, how='intersection')

In [None]:
# Put the buffer table back into the projected CRS used by the competitor stores
locs_gdf_buffer = locs_gdf_buffer.to_crs(3005)

# Spatially join the competitor store points to the primary-store buffers.
# The shared `ID` column comes back as `ID_left` (competitor store) and
# `ID_right` (the primary store that owns the buffer).
joined = gpd.sjoin(
    # Left table: the competitor stores (points)
    left_df=c_gdf,
    # Right table: the buffers around the primary company's stores
    right_df=locs_gdf_buffer.set_geometry("buffer_5mi")[["ID", "buffer_5mi"]],
    # Spatial predicate: keep pairs where the left geometry is within the right
    predicate="within",
)

In [None]:
# Competitor store count per primary-store buffer.
# In `joined`, the left table is the competitor stores (`ID_left`) and the
# right table is the buffers (`ID_right`, the primary store's ID). Grouping by
# `ID_right` therefore gives one row per primary store, and the `ID_left`
# count is the number of competitor stores that fall within its buffer.
store_count = (
    joined.groupby(
        "ID_right"
    )
    .count()
)
# Converting to a dataframe and cleaning up: keep the primary store ID and the
# number of competitor stores matched to it
store_count_df = store_count.reset_index()
store_count_df = store_count_df[['ID_right','ID_left']]
store_count_df.columns=['ID','Comp_Store_Count']
# Displaying the data
# Note: the join is an inner join, so primary stores with no competitor inside
# their buffer do not appear in this table.
store_count_df.head()

## 4. Creating Proximity Spatial Features - NY Airbnb Dataset
We'll now turn our attention back to the NYC Airbnb dataset to calculate proximity-based spatial features.

In [None]:
# Load the gzipped listings CSV
# REMINDER - The listings data must be downloaded from Inside Airbnb
# NOTE(review): the relative path uses a backslash separator - confirm this
# resolves on the operating system the notebook is run on.
listings = pd.read_csv(
    data_path + r'NY Airbnb June 2020\listings.csv.gz',
    compression='gzip',
    low_memory=False,
)

# Wrap the table in a GeoDataFrame, building points from the
# longitude / latitude columns and tagging them as EPSG:4326
listings_gpdf = gpd.GeoDataFrame(
    listings,
    geometry=gpd.points_from_xy(
        x=listings['longitude'],
        y=listings['latitude'],
        crs="EPSG:4326",
    ),
)

In [None]:
# The analysis focuses on Manhattan, so load the borough boundaries and keep
# only the Manhattan row, reprojected to EPSG:4326 to match the listings
boroughs = gpd.read_file(data_path + r"NYC Boroughs\nybb_22a\nybb.shp")
manhattan = (
    boroughs
    .loc[boroughs['BoroName'] == 'Manhattan']
    .to_crs('EPSG:4326')
)

In [None]:
# Creating a mask: True for listings whose point lies within the Manhattan
# polygon. `manhattan` has already been filtered down to the Manhattan row, so
# take its geometry by position rather than by a hard-coded index label, which
# would depend on the row order of the shapefile.
listings_mask = listings_gpdf.within(manhattan['geometry'].iloc[0])
# Using the mask to filter the data
listings_manhattan = listings_gpdf.loc[listings_mask]
listings_manhattan.head()

In [None]:
# Set up figure and axis
f, ax = plt.subplots(1, figsize=(10, 10))

# Plot all Manhattan Airbnb locations in green
listings_manhattan.plot(ax=ax, color="g")

# Add a basemap. The Stamen providers are no longer shipped with xyzservices
# (the provider registry behind `contextily.providers`), so a CartoDB basemap
# that needs no API key is used instead.
contextily.add_basemap(
    ax,
    crs=listings_manhattan.crs.to_string(),
    source=contextily.providers.CartoDB.Voyager
)

# Remove axes
ax.set_axis_off()

# Display
plt.show()

In [None]:
# Load the table of popular NYC attractions
nyc_attr = pd.read_csv(data_path + 'NYC Attractions\\NYC Attractions.csv')

# Wrap it in a GeoDataFrame, building points from the Longitude / Latitude
# columns and tagging them as EPSG:4326
nyc_attr_gpdf = gpd.GeoDataFrame(
    nyc_attr,
    geometry=gpd.points_from_xy(
        x=nyc_attr['Longitude'],
        y=nyc_attr['Latitude'],
        crs="EPSG:4326",
    ),
)

# Show the first 5 rows of the table
nyc_attr_gpdf.head()

In [None]:
# Set up figure and axis
f, ax = plt.subplots(1, figsize=(10, 10))
# Plot all attractions in blue
nyc_attr_gpdf.plot(ax=ax, color="b")
# Add a basemap. The Stamen providers are no longer shipped with xyzservices
# (the provider registry behind `contextily.providers`), so a CartoDB basemap
# that needs no API key is used instead.
contextily.add_basemap(
    ax,
    crs=nyc_attr_gpdf.crs.to_string(),
    source=contextily.providers.CartoDB.Voyager
)
# Remove axes
ax.set_axis_off()
# Display the plot
plt.show()

In [None]:
from matplotlib.lines import Line2D
# Set up figure and axis
f, ax = plt.subplots(1, figsize=(10, 10))
# Plot all Manhattan Airbnb locations in green
listings_manhattan.plot(ax=ax, color="g")
# Plot all attractions in blue, drawn on top of the listings
nyc_attr_gpdf.plot(ax=ax, color="b")
# Add a basemap. The Stamen providers are no longer shipped with xyzservices
# (the provider registry behind `contextily.providers`), so a CartoDB basemap
# that needs no API key is used instead.
contextily.add_basemap(
    ax,
    crs=nyc_attr_gpdf.crs.to_string(),
    source=contextily.providers.CartoDB.Voyager
)

# Remove axes
ax.set_axis_off()
# Manually creating a legend to orient the audience: the layers are plotted
# without labels, so proxy markers in the matching colors stand in for them
green_circle = Line2D([0], [0], marker='o', color='w', label='Airbnbs',
                        markerfacecolor='g', markersize=8)
blue_circle = Line2D([0], [0], marker='o', color='w', label='Attractions',
                        markerfacecolor='b', markersize=8)
ax.legend(handles=[green_circle, blue_circle])
# Display
plt.show()

In [None]:
# Calculate the distance to each attraction per airbnb
# Attraction names, used below to label the distance columns
attractions = nyc_attr_gpdf.Attraction.unique()
# Converting both layers to a projected coordinate system so that distances
# are planar lengths rather than differences in degrees
nyc_attr_gpdf_p = nyc_attr_gpdf.to_crs('EPSG:2263')
listings_manhattan_p = listings_manhattan.to_crs('EPSG:2263')
# For every listing geometry, compute the distance to every attraction; the
# result has one row per listing and one column per attraction.
# The full table is kept here (no `.head()` on the assignment), otherwise only
# the first 5 listings would carry distance features downstream.
distances = listings_manhattan_p.geometry.apply(lambda g: nyc_attr_gpdf_p.distance(g))
# Renaming the columns based on the attraction for which the distance is calculated
# (assumes attraction names are unique, so there is one name per column)
distances.columns = attractions
# Displaying the top 5 rows of the dataframe
distances.head()

In [None]:
# To understand what the distance unit is, we look up the unit name of the
# first axis of the projected listings' CRS
listings_manhattan_p.crs.axis_info[0].unit_name

In [None]:
# Rescale every distance from 'US survey foot' to miles (5280 feet per mile).
# This overwrites `distances`, so run the cell only once per fresh table.
distances = distances / 5280
distances.head()

In [None]:
# For each listing, count how many attractions lie within each distance band
# (at most 1, 2, ... 6 units of `distances`, converted to miles above).
band_limits_mi = range(1, 7)

# One column per band: a row-wise count of attractions whose distance is
# less than or equal to the band limit
distance_df = pd.concat(
    [(distances <= limit).sum(axis=1) for limit in band_limits_mi],
    axis=1,
)
distance_df.columns = [f'Attr_{limit}mi' for limit in band_limits_mi]
distance_df.head()

In [None]:
# Attach the per-attraction distances and the distance-band counts to the
# Manhattan listings, matching rows on the index (inner joins).
# This overwrites `listings_manhattan`, so run the cell only once.
listings_manhattan = (
    listings_manhattan
    .merge(distances, left_index=True, right_index=True)
    .merge(distance_df, left_index=True, right_index=True)
)

In [None]:
# Preview the first rows of the Manhattan listings table
listings_manhattan.head()