https://opendata.vancouver.ca/explore/?disjunctive.features&disjunctive.theme&disjunctive.keyword&disjunctive.data-owner&disjunctive.data-team&sort=modified

https://opendata.vancouver.ca/api/v2/console

In [1]:
#Import packages

import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import seaborn as sns
import matplotlib.pyplot as plt

## Working With Building Permits Data

In [2]:


url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/issued-building-permits/exports/geojson?limit=-1&offset=0&timezone=UTC'
data = requests.get(url)
permits_df = gpd.GeoDataFrame().from_features(data.json())
permits_df = permits_df.set_crs('EPSG:4326').to_crs('EPSG:3857') # Set all geopandas dataframes to the same coordinate reference system

In [3]:
permits_df.head()

Unnamed: 0,geometry,permitnumber,permitnumbercreateddate,issuedate,permitelapseddays,projectvalue,typeofwork,address,projectdescription,permitcategory,...,applicantaddress,propertyuse,specificusecategory,buildingcontractor,buildingcontractoraddress,issueyear,geolocalarea,yearmonth,geo_point_2d,geom
0,POINT (-13703478.999 6317938.424),BP-2017-00906,2017-02-16,2017-11-22,279,8470000.0,New Building,"209 E 20TH AVENUE, Vancouver, BC V1V 1V1",Certified Professional Program - New Building ...,,...,"2479 Kingsway\r\nVancouver, BC V5R 5G8",[Dwelling Uses],[Multiple Dwelling],,,2017,Riley Park,2017-11,"{'lon': -123.1004463, 'lat': 49.2532268}",
1,POINT (-13705149.292 6311700.168),BP-2018-02868,2018-05-30,2018-09-12,105,0.0,New Building,"7527 YUKON STREET, Vancouver, BC V1V 1V1",Certified Professional Program - New Building ...,,...,"2479 Kingsway\r\nVancouver, BC V5R 5G8",[Dwelling Uses],[Multiple Dwelling],,,2018,Marpole,2018-09,"{'lon': -123.1154508, 'lat': 49.2166355}",
2,POINT (-13705255.123 6311745.602),BP-2018-02864,2018-05-30,2018-09-12,105,0.0,New Building,"478 W 59TH AVENUE, Vancouver, BC V1V 1V1",Certified Professional Program - New Building ...,,...,"2479 Kingsway\r\nVancouver, BC V5R 5G8",[Dwelling Uses],[Multiple Dwelling],,,2018,Marpole,2018-09,"{'lon': -123.1164015, 'lat': 49.2169021}",
3,POINT (-13698193.338 6315946.354),BP-2019-05356,2019-11-26,2020-04-14,140,0.0,New Building,"4795 SLOCAN STREET, Vancouver, BC V5R 2A2",Certified Professional Program - New Building ...,,...,"2479 Kingsway\r\nVancouver, BC V5R 5G8",[Dwelling Uses],[Multiple Dwelling],,,2020,Renfrew-Collingwood,2020-04,"{'lon': -123.0529644, 'lat': 49.241545}",
4,POINT (-13706737.832 6311648.292),BP-2019-05460,2019-12-02,2020-09-18,291,0.0,New Building,"7563 OAK STREET, Vancouver, BC V6P 4A4",Certified Professional Program - New Building ...,,...,"2479 Kingsway\r\nVancouver, BC V5R 5G8","[Dwelling Uses, Parking Uses]","[Multiple Dwelling, Parking Garage]",,,2020,Marpole,2020-09,"{'lon': -123.1297209, 'lat': 49.2163311}",


### Data Cleaning & Preparation

In [4]:
#Check data types

permits_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 33961 entries, 0 to 33960
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   33617 non-null  geometry
 1   permitnumber               33961 non-null  object  
 2   permitnumbercreateddate    33961 non-null  object  
 3   issuedate                  33961 non-null  object  
 4   permitelapseddays          33961 non-null  int64   
 5   projectvalue               33961 non-null  float64 
 6   typeofwork                 33961 non-null  object  
 7   address                    33896 non-null  object  
 8   projectdescription         33961 non-null  object  
 9   permitcategory             19372 non-null  object  
 10  applicant                  33961 non-null  object  
 11  applicantaddress           33875 non-null  object  
 12  propertyuse                33956 non-null  object  
 13  specificusecategory    

In [5]:
#Remove unnecessary columns

cols_to_keep = ['geometry','projectvalue','typeofwork','propertyuse','issueyear','geolocalarea']

permits_df = permits_df[cols_to_keep]

permits_df.head()

Unnamed: 0,geometry,projectvalue,typeofwork,propertyuse,issueyear,geolocalarea
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,[Dwelling Uses],2017,Riley Park
1,POINT (-13705149.292 6311700.168),0.0,New Building,[Dwelling Uses],2018,Marpole
2,POINT (-13705255.123 6311745.602),0.0,New Building,[Dwelling Uses],2018,Marpole
3,POINT (-13698193.338 6315946.354),0.0,New Building,[Dwelling Uses],2020,Renfrew-Collingwood
4,POINT (-13706737.832 6311648.292),0.0,New Building,"[Dwelling Uses, Parking Uses]",2020,Marpole


In [6]:
#Rename columns to more appropriate names

new_columns = {'projectvalue':'ProjectValue',
               'typeofwork':'WorkType',
               'propertyuse':'PropertyUse',
               'issueyear':'Year',
               'geolocalarea':'Area'
               }

permits_df.rename(columns=new_columns, inplace=True)

In [7]:
permits_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 33961 entries, 0 to 33960
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   geometry      33617 non-null  geometry
 1   ProjectValue  33961 non-null  float64 
 2   WorkType      33961 non-null  object  
 3   PropertyUse   33956 non-null  object  
 4   Year          33961 non-null  object  
 5   Area          33610 non-null  object  
dtypes: float64(1), geometry(1), object(4)
memory usage: 1.6+ MB


In [8]:
#Convert year column to integer

permits_df['Year'] = permits_df['Year'].astype(str).astype(int)
permits_df.dtypes

geometry        geometry
ProjectValue     float64
WorkType          object
PropertyUse       object
Year               int32
Area              object
dtype: object

In [9]:
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,PropertyUse,Year,Area
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,[Dwelling Uses],2017,Riley Park
1,POINT (-13705149.292 6311700.168),0.0,New Building,[Dwelling Uses],2018,Marpole
2,POINT (-13705255.123 6311745.602),0.0,New Building,[Dwelling Uses],2018,Marpole
3,POINT (-13698193.338 6315946.354),0.0,New Building,[Dwelling Uses],2020,Renfrew-Collingwood
4,POINT (-13706737.832 6311648.292),0.0,New Building,"[Dwelling Uses, Parking Uses]",2020,Marpole


#### Data Cleaning & Preparation - Property Use

In [10]:
#Check value counts for property use

permits_df.PropertyUse.value_counts()

[Dwelling Uses]                                                                                     23858
[Office Uses]                                                                                        4214
[Retail Uses]                                                                                        1686
[Service Uses]                                                                                       1335
[Institutional Uses]                                                                                  668
                                                                                                    ...  
[Cultural/Recreational Uses, Parking Uses]                                                              1
[Deposition or Extraction Uses]                                                                         1
[Live-Work Uses, Manufacturing Uses, Office Uses, Service Uses, Transportation and Storage Uses]        1
[Cultural/Recreational Uses, Institutional Use

In [11]:
# Create a number of uses column by counting the number of items in the property use column which is a list of values

permits_df["NumberOfUses"] = permits_df.PropertyUse.str.len()

#Remove the rows where the number of uses is blank

permits_df = permits_df[permits_df['NumberOfUses'].notna()]

#Convert number of uses to an integer

permits_df["NumberOfUses"] = permits_df["NumberOfUses"].astype(int)

In [12]:
permits_df.NumberOfUses.value_counts()

1    32963
2      734
3      186
4       57
5       16
Name: NumberOfUses, dtype: int64

In [13]:
#Create a new revised property use column that converts the original column from a list of values into a string of values

permits_df['PropertyUseRevised'] = permits_df.PropertyUse.apply(lambda x: ', '.join([str(i) for i in x]))

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,PropertyUse,Year,Area,NumberOfUses,PropertyUseRevised
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,[Dwelling Uses],2017,Riley Park,1,Dwelling Uses
1,POINT (-13705149.292 6311700.168),0.0,New Building,[Dwelling Uses],2018,Marpole,1,Dwelling Uses
2,POINT (-13705255.123 6311745.602),0.0,New Building,[Dwelling Uses],2018,Marpole,1,Dwelling Uses
3,POINT (-13698193.338 6315946.354),0.0,New Building,[Dwelling Uses],2020,Renfrew-Collingwood,1,Dwelling Uses
4,POINT (-13706737.832 6311648.292),0.0,New Building,"[Dwelling Uses, Parking Uses]",2020,Marpole,2,"Dwelling Uses, Parking Uses"


In [14]:
#Change property use revised column base on the number of uses column. If the number of uses is greater than 1, change the value of property use revised to mixed use, otherwise keep it the same.

permits_df['PropertyUseRevised'] = np.where(permits_df.NumberOfUses > 1, 'Mixed Uses', permits_df.PropertyUseRevised)

In [15]:
permits_df.PropertyUseRevised.value_counts()

Dwelling Uses                      23858
Office Uses                         4214
Retail Uses                         1686
Service Uses                        1335
Mixed Uses                           993
Institutional Uses                   668
Cultural/Recreational Uses           541
Manufacturing Uses                   250
Wholesale Uses                       126
Transportation and Storage Uses      112
Parking Uses                          93
Utility and Communication Uses        56
Live-Work Uses                        19
Agricultural Uses                      3
Mural                                  1
Deposition or Extraction Uses          1
Name: PropertyUseRevised, dtype: int64

In [16]:
#Remove number of uses and original property use columns now that they are redundant

permits_df = permits_df[['geometry','ProjectValue','WorkType','Year','Area','PropertyUseRevised']]

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,2017,Riley Park,Dwelling Uses
1,POINT (-13705149.292 6311700.168),0.0,New Building,2018,Marpole,Dwelling Uses
2,POINT (-13705255.123 6311745.602),0.0,New Building,2018,Marpole,Dwelling Uses
3,POINT (-13698193.338 6315946.354),0.0,New Building,2020,Renfrew-Collingwood,Dwelling Uses
4,POINT (-13706737.832 6311648.292),0.0,New Building,2020,Marpole,Mixed Uses


#### Data Cleaning & Preparation - Work Type

In [17]:
permits_df.WorkType.value_counts()

Addition / Alteration                   16475
New Building                             7745
Salvage and Abatement                    4900
Demolition / Deconstruction              4472
Temporary Building / Structure            319
Outdoor Uses (No Buildings Proposed)       45
Name: WorkType, dtype: int64

In [18]:
#Rename some of the values in the work type field

permits_df['WorkType'] = permits_df['WorkType'].replace({'Addition / Alteration':'Addition',
                                                         'Salvage and Abatement':'Salvage',
                                                         'Demolition / Deconstruction':'Demolition',
                                                         'Temporary Building / Structure':'Temporary Building',
                                                         'Outdoor Uses (No Buildings Proposed)':'Outdoor Use'
                                                         })

permits_df.WorkType.value_counts()

Addition              16475
New Building           7745
Salvage                4900
Demolition             4472
Temporary Building      319
Outdoor Use              45
Name: WorkType, dtype: int64

#### Data Cleaning & Preparation - Year

In [19]:
permits_df.Year.value_counts()

2018    6758
2017    6729
2019    5572
2022    5455
2021    5052
2020    4390
Name: Year, dtype: int64

In [20]:
#Filter the data down to permits that were submitted between the years 2017 and 2019.

permits_df = permits_df[permits_df['Year'].between(2017, 2019)]
permits_df.Year.value_counts()

2018    6758
2017    6729
2019    5572
Name: Year, dtype: int64

### Check For Missing Values

In [21]:
#Check the number of null values in each column

total_missing = permits_df.isnull().sum().sort_values(ascending=False)
percent = (permits_df.isnull().sum()/permits_df.isnull().count()).sort_values(ascending=False)*100
missing_data = pd.concat([total_missing, percent], axis=1, keys=['Total Missing', 'Percent'])
missing_data.head()

Unnamed: 0,Total Missing,Percent
Area,151,0.792277
geometry,147,0.771289
ProjectValue,0,0.0
WorkType,0,0.0
Year,0,0.0


In [22]:
#Since there is such a small amount of missing data, drop the rows where there are missing values

permits_df = permits_df.dropna()
permits_df.isnull().sum()

geometry              0
ProjectValue          0
WorkType              0
Year                  0
Area                  0
PropertyUseRevised    0
dtype: int64

# Working With Other Geospatial Datasets

In [23]:
# A function that drops all of the columns in a geopandas dataframe except for the geometry column. This is used on the datasets where you want to return a boolean value.

def geo_column_only(url, col_name):
    r = requests.get(url)
    gdf = gpd.GeoDataFrame().from_features(r.json())
    gdf = gdf.set_crs('EPSG:4326').to_crs('EPSG:3857') #Sets the CRS to the same as the permits data set
    final_table_columns = ['geometry']
    geo_df_with_cols_removed = gdf.drop(columns=[col for col in gdf if col not in final_table_columns]) #Drop a column if its not geometry
    geo_df_with_cols_removed[col_name] = 1 #Assign a value of 1 to the newly created column
    return geo_df_with_cols_removed

#Function to perform a left spatial join on two geopanadas dataframes and drop the right_index column after the join has been performed

def join_reducer(left, right):
    sjoin = gpd.sjoin(left, right, how='left')
    sjoin.drop('index_right', axis=1, inplace=True)
    return sjoin

## Business Improvement Areas

In [24]:
bia_df = geo_column_only('https://opendata.vancouver.ca/api/v2/catalog/datasets/business-improvement-areas-bia/exports/geojson?limit=-1&offset=0&timezone=UTC', 'BusinessImprovementArea')
bia_df.head()

Unnamed: 0,geometry,BusinessImprovementArea
0,"POLYGON ((-13700143.889 6321936.969, -13700128...",1
1,"MULTIPOLYGON (((-13695742.940 6322624.315, -13...",1
2,"POLYGON ((-13709782.421 6314843.080, -13709781...",1
3,"POLYGON ((-13707842.306 6311277.467, -13707919...",1
4,"POLYGON ((-13707708.568 6320234.155, -13707788...",1


In [25]:
#Spatial join BIA data to the permits data. If a permit is located within a BIA it will be labeled as 1, otherwise it will be labeled as 0.

permits_df = join_reducer(permits_df,bia_df)
permits_df['BusinessImprovementArea'].fillna(0, inplace = True) #Where a permit is not located in a BIA assign it a value of zero
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised,BusinessImprovementArea
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,2017,Riley Park,Dwelling Uses,0.0
1,POINT (-13705149.292 6311700.168),0.0,New Building,2018,Marpole,Dwelling Uses,0.0
2,POINT (-13705255.123 6311745.602),0.0,New Building,2018,Marpole,Dwelling Uses,0.0
6,POINT (-13699196.794 6316274.553),45000000.0,New Building,2019,Kensington-Cedar Cottage,Dwelling Uses,0.0
7,POINT (-13698581.486 6322818.274),3400000.0,New Building,2019,Hastings-Sunrise,Dwelling Uses,0.0


In [26]:
permits_df.BusinessImprovementArea.value_counts()

0.0    14762
1.0     4146
Name: BusinessImprovementArea, dtype: int64

## Flood Plain Areas

In [27]:
fp_df = geo_column_only('https://opendata.vancouver.ca/api/v2/catalog/datasets/designated-floodplain/exports/geojson?limit=-1&offset=0&timezone=UTC','FloodPlain')
fp_df.head()

Unnamed: 0,geometry,FloodPlain
0,"POLYGON ((-13695468.072 6319454.037, -13695467...",1
1,"POLYGON ((-13696427.537 6318776.760, -13696642...",1
2,"POLYGON ((-13695858.168 6319254.615, -13695858...",1
3,"POLYGON ((-13695468.072 6319454.037, -13695468...",1
4,"MULTIPOLYGON (((-13711762.162 6321091.018, -13...",1


In [28]:
permits_df = join_reducer(permits_df,fp_df)
permits_df['FloodPlain'].fillna(0, inplace = True) #Where a permit is not located in a flood plain assign it a value of zero
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised,BusinessImprovementArea,FloodPlain
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,2017,Riley Park,Dwelling Uses,0.0,0.0
1,POINT (-13705149.292 6311700.168),0.0,New Building,2018,Marpole,Dwelling Uses,0.0,0.0
2,POINT (-13705255.123 6311745.602),0.0,New Building,2018,Marpole,Dwelling Uses,0.0,0.0
6,POINT (-13699196.794 6316274.553),45000000.0,New Building,2019,Kensington-Cedar Cottage,Dwelling Uses,0.0,0.0
7,POINT (-13698581.486 6322818.274),3400000.0,New Building,2019,Hastings-Sunrise,Dwelling Uses,0.0,0.0


In [29]:
permits_df.FloodPlain.value_counts()

0.0    18457
1.0      451
Name: FloodPlain, dtype: int64

## Transit Stations

In [30]:
transit_url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/rapid-transit-stations/exports/geojson?limit=-1&offset=0&timezone=UTC'
transit_data = requests.get(transit_url)
transit_df = gpd.GeoDataFrame().from_features(transit_data.json())
transit_df = transit_df.set_crs('EPSG:4326').to_crs('EPSG:3857')
transit_df.head()

Unnamed: 0,geometry,station,geo_local_area
0,POINT (-13705329.627 6310492.370),MARINE DRIVE,Marpole
1,POINT (-13701087.405 6320080.163),VCC - CLARK,Strathcona
2,POINT (-13698516.956 6317093.472),NANAIMO,Renfrew-Collingwood
3,POINT (-13703496.877 6321341.731),MAIN ST. - SCIENCE WORLD,Downtown
4,POINT (-13704494.065 6322410.505),STADIUM - CHINATOWN,Downtown


In [31]:
#Perform a nearest spatial join to calculate the distance between each permit and the nearest transit station in meters
permits_df = permits_df.sjoin_nearest(transit_df, how = 'left', distance_col='DistanceToNearestTransitStation')

#Remove unnecessary columns after calculating distance to nearest transit station via spatial join
cols_to_keep_final = ['geometry','ProjectValue','WorkType','Area','PropertyUseRevised','BusinessImprovementArea','FloodPlain','DistanceToNearestTransitStation']
permits_df = permits_df[cols_to_keep_final]

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Area,PropertyUseRevised,BusinessImprovementArea,FloodPlain,DistanceToNearestTransitStation
0,POINT (-13703478.999 6317938.424),8470000.0,New Building,Riley Park,Dwelling Uses,0.0,0.0,1799.230623
1,POINT (-13705149.292 6311700.168),0.0,New Building,Marpole,Dwelling Uses,0.0,0.0,1221.186297
2,POINT (-13705255.123 6311745.602),0.0,New Building,Marpole,Dwelling Uses,0.0,0.0,1255.44484
6,POINT (-13699196.794 6316274.553),45000000.0,New Building,Kensington-Cedar Cottage,Dwelling Uses,0.0,0.0,1064.333954
7,POINT (-13698581.486 6322818.274),3400000.0,New Building,Hastings-Sunrise,Dwelling Uses,0.0,0.0,3489.680863


## Exploratory Data Analysis