Data source (City of Vancouver Open Data Portal catalogue): https://opendata.vancouver.ca/explore/?disjunctive.features&disjunctive.theme&disjunctive.keyword&disjunctive.data-owner&disjunctive.data-team&sort=modified

API console for the portal: https://opendata.vancouver.ca/api/v2/console

In [None]:
#Import packages

import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import seaborn as sns
import matplotlib.pyplot as plt

## Working With Building Permits Data

In [None]:
# Download the issued-building-permits dataset as GeoJSON from the
# City of Vancouver open data portal.
url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/issued-building-permits/exports/geojson?limit=-1&offset=0&timezone=UTC'
data = requests.get(url)
# Fail loudly on an HTTP error instead of handing an error payload to the
# GeoJSON parser, where it would surface as a confusing parsing exception.
data.raise_for_status()

# Build the GeoDataFrame, declare the coordinates as EPSG:4326 and
# reproject them to EPSG:3857.
permits_df = gpd.GeoDataFrame.from_features(data.json())
permits_df = permits_df.set_crs('EPSG:4326').to_crs('EPSG:3857')

In [None]:
# Preview the first rows of the downloaded, reprojected permits frame.
permits_df.head()

Unnamed: 0,geometry,permitnumber,permitnumbercreateddate,issuedate,permitelapseddays,projectvalue,typeofwork,address,projectdescription,permitcategory,...,applicantaddress,propertyuse,specificusecategory,buildingcontractor,buildingcontractoraddress,issueyear,geolocalarea,yearmonth,geo_point_2d,geom
0,POINT (-13706263.533 6316085.421),BP-2021-05126,2021-09-29,2022-08-12,317,7977669.0,New Building,"918 W 32ND AVENUE, Vancouver, BC",Certified Professional Program - New Building ...,,...,"470 East 10th Avenue\r\nVancouver, BC V5T 2A1",[Dwelling Uses],[Multiple Dwelling],LOUISAN PACIFIC DEVELOPMENTS INC,,2022,South Cambie,2022-08,"{'lon': -123.1254602, 'lat': 49.2423606}",
1,POINT (-13706059.228 6318634.168),BP-2017-05215,2017-10-04,2017-11-03,30,24050.0,Addition / Alteration,"863 W 16TH AVENUE #302, Vancouver, BC V5Z 1S9",Field Review - Addition / Alteration - #302\r\...,Renovation - Residential - Lower Complexity,...,"9049 161A street\r\nSurrey, BC V4N 3E5",[Dwelling Uses],[Multiple Dwelling],Fine Living Construction Corp,,2017,Fairview,2017-11,"{'lon': -123.1236249, 'lat': 49.2573061}",
2,POINT (-13706859.515 6323887.222),BP-2017-05020,2017-09-25,2018-02-21,149,11020000.0,Demolition / Deconstruction,"1400 ROBSON STREET, Vancouver, BC V6G 1B9",Enquiry Centre - Demolition / Deconstruction -...,,...,"1555-200 Burrard St\r\nVancouver, BC V6C 3L6",[Service Uses],[Hotel],JMX Contracting Inc,"997 SEYMOUR ST \r\nUnit 250\r\nVancouver, BC ...",2018,West End,2018-02,"{'lon': -123.130814, 'lat': 49.288095}",
3,POINT (-13706771.150 6323802.144),BP-2018-01702,2018-03-28,2018-06-27,91,321000.0,Demolition / Deconstruction,"1394 ROBSON STREET, Vancouver, BC V6E 1C5",Enquiry Centre - Demolition / Deconstruction -...,,...,"769 Roslyn Blvd\r\nNorth Vancouver, BC V7G 1G7",[Service Uses],[Hotel],JMX Contracting Inc,"997 SEYMOUR ST \r\nUnit 250\r\nVancouver, BC ...",2018,West End,2018-06,"{'lon': -123.1300202, 'lat': 49.2875965}",
4,POINT (-13695628.548 6313160.040),DB-2016-04585,2016-12-22,2017-10-06,288,229030.0,New Building,"3482 E 48TH AVENUE, Vancouver, BC V5S 1H7",Low Density Housing - New Building - To constr...,New Build - Low Density Housing,...,"14098 75A Ave \r\nSurrey, BC V3W6T5",[Dwelling Uses],[Laneway House],SCH Samby Custom Homes Ltd,,2017,Killarney,2017-10,"{'lon': -123.0299245, 'lat': 49.225201}",


### Data Cleaning & Preparation

In [None]:
# Inspect each column's dtype and non-null count.

permits_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 33941 entries, 0 to 33940
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   33597 non-null  geometry
 1   permitnumber               33941 non-null  object  
 2   permitnumbercreateddate    33941 non-null  object  
 3   issuedate                  33941 non-null  object  
 4   permitelapseddays          33941 non-null  int64   
 5   projectvalue               33941 non-null  float64 
 6   typeofwork                 33941 non-null  object  
 7   address                    33876 non-null  object  
 8   projectdescription         33941 non-null  object  
 9   permitcategory             19367 non-null  object  
 10  applicant                  33941 non-null  object  
 11  applicantaddress           33855 non-null  object  
 12  propertyuse                33936 non-null  object  
 13  specificusecategory    

In [None]:
# Reduce the frame to the six columns listed below.
cols_to_keep = ['geometry', 'projectvalue', 'typeofwork', 'propertyuse', 'issueyear', 'geolocalarea']

# Take an explicit copy: the column subset is assigned to in later cells, and
# assigning into a slice of the original frame raises SettingWithCopyWarning.
permits_df = permits_df[cols_to_keep].copy()

permits_df.head()

Unnamed: 0,geometry,projectvalue,typeofwork,propertyuse,issueyear,geolocalarea
0,POINT (-13706263.533 6316085.421),7977669.0,New Building,[Dwelling Uses],2022,South Cambie
1,POINT (-13706059.228 6318634.168),24050.0,Addition / Alteration,[Dwelling Uses],2017,Fairview
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition / Deconstruction,[Service Uses],2018,West End
3,POINT (-13706771.150 6323802.144),321000.0,Demolition / Deconstruction,[Service Uses],2018,West End
4,POINT (-13695628.548 6313160.040),229030.0,New Building,[Dwelling Uses],2017,Killarney


In [None]:
# Map the portal's lower-case field names to the names used from here on.
new_columns = {
    'projectvalue': 'ProjectValue',
    'typeofwork': 'WorkType',
    'propertyuse': 'PropertyUse',
    'issueyear': 'Year',
    'geolocalarea': 'Area',
}

# Rebind the name to the renamed frame instead of mutating in place, so the
# cell does not depend on in-place modification of a previously sliced frame.
permits_df = permits_df.rename(columns=new_columns)

In [None]:
# Re-check dtypes and non-null counts after the column subset and rename.
permits_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 33941 entries, 0 to 33940
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   geometry      33597 non-null  geometry
 1   ProjectValue  33941 non-null  float64 
 2   WorkType      33941 non-null  object  
 3   PropertyUse   33936 non-null  object  
 4   Year          33941 non-null  object  
 5   Area          33590 non-null  object  
dtypes: float64(1), geometry(1), object(4)
memory usage: 1.6+ MB


In [None]:
# Year arrives as an object column; go through its string form to get integers.

year_as_text = permits_df['Year'].astype(str)
permits_df['Year'] = year_as_text.astype(int)
permits_df.dtypes

geometry        geometry
ProjectValue     float64
WorkType          object
PropertyUse       object
Year               int32
Area              object
dtype: object

In [None]:
# Preview the frame after the Year conversion.
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,PropertyUse,Year,Area
0,POINT (-13706263.533 6316085.421),7977669.0,New Building,[Dwelling Uses],2022,South Cambie
1,POINT (-13706059.228 6318634.168),24050.0,Addition / Alteration,[Dwelling Uses],2017,Fairview
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition / Deconstruction,[Service Uses],2018,West End
3,POINT (-13706771.150 6323802.144),321000.0,Demolition / Deconstruction,[Service Uses],2018,West End
4,POINT (-13695628.548 6313160.040),229030.0,New Building,[Dwelling Uses],2017,Killarney


#### Data Cleaning & Preparation - Property Use

In [None]:
# How often does each property-use combination occur?

permits_df['PropertyUse'].value_counts()

[Dwelling Uses]                                                                 23841
[Office Uses]                                                                    4214
[Retail Uses]                                                                    1685
[Service Uses]                                                                   1334
[Institutional Uses]                                                              668
                                                                                ...  
[Institutional Uses, Office Uses, Parking Uses, Retail Uses]                        1
[Cultural/Recreational Uses, Dwelling Uses, Institutional Uses, Retail Uses]        1
[Cultural/Recreational Uses, Office Uses, Parking Uses, Service Uses]               1
[Office Uses, Utility and Communication Uses]                                       1
[Manufacturing Uses, Office Uses, Retail Uses]                                      1
Name: PropertyUse, Length: 130, dtype: int64

In [None]:
# Count how many uses each permit lists (PropertyUse holds a list per row).

permits_df["NumberOfUses"] = permits_df.PropertyUse.str.len()

# Drop rows with no property use recorded (their count is NaN). Copy the
# filtered result so the assignment below targets an independent frame
# rather than a slice, which would raise SettingWithCopyWarning.

permits_df = permits_df[permits_df['NumberOfUses'].notna()].copy()

# With the NaN rows gone, the counts can be stored as integers.

permits_df["NumberOfUses"] = permits_df["NumberOfUses"].astype(int)

In [None]:
# Distribution of the number of uses per permit.
permits_df['NumberOfUses'].value_counts()

1    32944
2      733
3      186
4       57
5       16
Name: NumberOfUses, dtype: int64

In [None]:
# Flatten each row's list of uses into a single comma-separated string,
# stored in a new PropertyUseRevised column.

permits_df['PropertyUseRevised'] = permits_df['PropertyUse'].apply(
    lambda uses: ', '.join(map(str, uses))
)

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,PropertyUse,Year,Area,NumberOfUses,PropertyUseRevised
0,POINT (-13706263.533 6316085.421),7977669.0,New Building,[Dwelling Uses],2022,South Cambie,1,Dwelling Uses
1,POINT (-13706059.228 6318634.168),24050.0,Addition / Alteration,[Dwelling Uses],2017,Fairview,1,Dwelling Uses
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition / Deconstruction,[Service Uses],2018,West End,1,Service Uses
3,POINT (-13706771.150 6323802.144),321000.0,Demolition / Deconstruction,[Service Uses],2018,West End,1,Service Uses
4,POINT (-13695628.548 6313160.040),229030.0,New Building,[Dwelling Uses],2017,Killarney,1,Dwelling Uses


In [None]:
# Relabel PropertyUseRevised based on NumberOfUses: permits listing more than
# one use become 'Mixed Uses'; single-use permits keep their existing label.

has_multiple_uses = permits_df['NumberOfUses'] > 1
permits_df['PropertyUseRevised'] = np.where(
    has_multiple_uses,
    'Mixed Uses',
    permits_df['PropertyUseRevised'],
)

In [None]:
# Check the consolidated property-use categories.
permits_df['PropertyUseRevised'].value_counts()

Dwelling Uses                      23841
Office Uses                         4214
Retail Uses                         1685
Service Uses                        1334
Mixed Uses                           992
Institutional Uses                   668
Cultural/Recreational Uses           541
Manufacturing Uses                   250
Wholesale Uses                       126
Transportation and Storage Uses      112
Parking Uses                          93
Utility and Communication Uses        56
Live-Work Uses                        19
Agricultural Uses                      3
Mural                                  1
Deposition or Extraction Uses          1
Name: PropertyUseRevised, dtype: int64

In [None]:
# NumberOfUses and the original PropertyUse are now redundant, so keep only
# the remaining columns. Copy the subset: WorkType is reassigned in a later
# cell, and assigning into a slice raises SettingWithCopyWarning.

remaining_columns = ['geometry', 'ProjectValue', 'WorkType', 'Year', 'Area', 'PropertyUseRevised']
permits_df = permits_df[remaining_columns].copy()

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised
0,POINT (-13706263.533 6316085.421),7977669.0,New Building,2022,South Cambie,Dwelling Uses
1,POINT (-13706059.228 6318634.168),24050.0,Addition / Alteration,2017,Fairview,Dwelling Uses
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition / Deconstruction,2018,West End,Service Uses
3,POINT (-13706771.150 6323802.144),321000.0,Demolition / Deconstruction,2018,West End,Service Uses
4,POINT (-13695628.548 6313160.040),229030.0,New Building,2017,Killarney,Dwelling Uses


#### Data Cleaning & Preparation - Work Type

In [None]:
# Which work types occur, and how often?
permits_df['WorkType'].value_counts()

Addition / Alteration                   16471
New Building                             7739
Salvage and Abatement                    4894
Demolition / Deconstruction              4469
Temporary Building / Structure            318
Outdoor Uses (No Buildings Proposed)       45
Name: WorkType, dtype: int64

In [None]:
# Shorten the longer work-type labels; 'New Building' is left as it is.

work_type_labels = {
    'Addition / Alteration': 'Addition',
    'Salvage and Abatement': 'Salvage',
    'Demolition / Deconstruction': 'Demolition',
    'Temporary Building / Structure': 'Temporary Building',
    'Outdoor Uses (No Buildings Proposed)': 'Outdoor Use',
}
permits_df['WorkType'] = permits_df['WorkType'].replace(work_type_labels)

permits_df['WorkType'].value_counts()

Addition              16471
New Building           7739
Salvage                4894
Demolition             4469
Temporary Building      318
Outdoor Use              45
Name: WorkType, dtype: int64

#### Data Cleaning & Preparation - Year

In [None]:
# Number of permits per issue year.
permits_df['Year'].value_counts()

2018    6758
2017    6729
2019    5572
2022    5435
2021    5052
2020    4390
Name: Year, dtype: int64

In [None]:
# Keep only permits whose issue year (Year, renamed from issueyear) falls
# between 2017 and 2019, both ends included.

issued_2017_to_2019 = permits_df['Year'].between(2017, 2019)
permits_df = permits_df[issued_2017_to_2019]
permits_df['Year'].value_counts()

2018    6758
2017    6729
2019    5572
Name: Year, dtype: int64

### Check For Missing Values

In [None]:
# Summarise missing values per column: absolute count and share of rows (%).

null_mask = permits_df.isnull()
null_counts = null_mask.sum()

total_missing = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False) * 100
missing_data = pd.concat({'Total Missing': total_missing, 'Percent': percent}, axis=1)
missing_data.head()

Unnamed: 0,Total Missing,Percent
Area,151,0.792277
geometry,147,0.771289
ProjectValue,0,0.0
WorkType,0,0.0
Year,0,0.0


In [None]:
# Under 1% of rows have a missing value, so drop any row containing one,
# then confirm that no nulls remain.

permits_df = permits_df.dropna(how='any')
permits_df.isna().sum()

geometry              0
ProjectValue          0
WorkType              0
Year                  0
Area                  0
PropertyUseRevised    0
dtype: int64

# Working With Other Geospatial Datasets

In [None]:
def geo_column_only(url, col_name):
    """Download a GeoJSON layer and reduce it to its geometry plus a flag column.

    Every attribute column of the downloaded layer is discarded. The result
    holds only ``geometry`` and a column named ``col_name`` that is 1 on every
    row, so a later spatial join can use it as a yes/no indicator.

    Parameters
    ----------
    url : str
        Address of a GeoJSON resource, fetched with ``requests.get``.
    col_name : str
        Name of the constant indicator column added to the result.

    Returns
    -------
    geopandas.GeoDataFrame
        Geometry reprojected from EPSG:4326 to EPSG:3857, plus ``col_name``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(url)
    # Surface HTTP failures here rather than as an obscure JSON/feature
    # parsing error further down.
    r.raise_for_status()
    gdf = gpd.GeoDataFrame.from_features(r.json())
    gdf = gdf.set_crs('EPSG:4326').to_crs('EPSG:3857')
    # Select the geometry column directly; copy so that adding the flag column
    # does not write into a slice of gdf.
    geo_df_with_cols_removed = gdf[['geometry']].copy()
    geo_df_with_cols_removed[col_name] = 1
    return geo_df_with_cols_removed

def join_reducer(left, right):
    """Left spatial join of ``right`` onto ``left`` without the ``index_right`` column.

    All rows of ``left`` are kept; columns of ``right`` are attached where the
    geometries match, and the ``index_right`` bookkeeping column added by
    ``gpd.sjoin`` is removed before returning.

    NOTE(review): a left row matching several right geometries would presumably
    appear once per match - confirm the right-hand layer does not overlap if
    the row count must be preserved.
    """
    joined = gpd.sjoin(left, right, how='left')
    return joined.drop(columns='index_right')

## Business Improvement Areas

In [None]:
# Load the business-improvement-area layer as geometry plus a constant
# BusinessImprovementArea = 1 flag.
bia_url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/business-improvement-areas-bia/exports/geojson?limit=-1&offset=0&timezone=UTC'
bia_df = geo_column_only(bia_url, 'BusinessImprovementArea')
bia_df.head()

Unnamed: 0,geometry,Business_Improvement_Area
0,"POLYGON ((-13700143.889 6321936.969, -13700128...",1
1,"MULTIPOLYGON (((-13695742.940 6322624.315, -13...",1
2,"POLYGON ((-13709782.421 6314843.080, -13709781...",1
3,"POLYGON ((-13707842.306 6311277.467, -13707919...",1
4,"POLYGON ((-13707708.568 6320234.155, -13707788...",1


In [None]:
# Spatially join the BIA layer onto the permits. A permit located within a BIA
# is labelled 1, otherwise 0.

permits_df = join_reducer(permits_df, bia_df)
# Permits outside every BIA have no match after the left join; give them 0.
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which
# does not update the frame under Copy-on-Write.
permits_df['BusinessImprovementArea'] = permits_df['BusinessImprovementArea'].fillna(0)
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised,Business_Improvement_Area
1,POINT (-13706059.228 6318634.168),24050.0,Addition,2017,Fairview,Dwelling Uses,0.0
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition,2018,West End,Service Uses,1.0
3,POINT (-13706771.150 6323802.144),321000.0,Demolition,2018,West End,Service Uses,1.0
4,POINT (-13695628.548 6313160.040),229030.0,New Building,2017,Killarney,Dwelling Uses,0.0
5,POINT (-13695639.680 6313113.166),1099810.0,New Building,2017,Killarney,Dwelling Uses,0.0


In [None]:
# Number of permits outside (0) and inside (1) a business improvement area.
permits_df['BusinessImprovementArea'].value_counts()

0.0    14762
1.0     4146
Name: Business_Improvement_Area, dtype: int64

## Flood Plain Areas

In [None]:
# Load the designated-floodplain layer as geometry plus a constant
# FloodPlain = 1 flag.
floodplain_url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/designated-floodplain/exports/geojson?limit=-1&offset=0&timezone=UTC'
fp_df = geo_column_only(floodplain_url, 'FloodPlain')
fp_df.head()

Unnamed: 0,geometry,Flood_Plain
0,"POLYGON ((-13695468.072 6319454.037, -13695467...",1
1,"POLYGON ((-13696427.537 6318776.760, -13696642...",1
2,"POLYGON ((-13695858.168 6319254.615, -13695858...",1
3,"POLYGON ((-13695468.072 6319454.037, -13695468...",1
4,"MULTIPOLYGON (((-13711762.162 6321091.018, -13...",1


In [None]:
# Spatially join the floodplain layer onto the permits.
permits_df = join_reducer(permits_df, fp_df)
# Permits outside the floodplain have no match after the left join; give them 0.
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which
# does not update the frame under Copy-on-Write.
permits_df['FloodPlain'] = permits_df['FloodPlain'].fillna(0)
permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised,Business_Improvement_Area,Flood_Plain
1,POINT (-13706059.228 6318634.168),24050.0,Addition,2017,Fairview,Dwelling Uses,0.0,0.0
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition,2018,West End,Service Uses,1.0,0.0
3,POINT (-13706771.150 6323802.144),321000.0,Demolition,2018,West End,Service Uses,1.0,0.0
4,POINT (-13695628.548 6313160.040),229030.0,New Building,2017,Killarney,Dwelling Uses,0.0,0.0
5,POINT (-13695639.680 6313113.166),1099810.0,New Building,2017,Killarney,Dwelling Uses,0.0,0.0


In [None]:
# Number of permits outside (0) and inside (1) the designated floodplain.
permits_df['FloodPlain'].value_counts()

0.0    18457
1.0      451
Name: Flood_Plain, dtype: int64

## Transit Stations

In [None]:
# Download the rapid-transit-stations dataset as GeoJSON.
transit_url = 'https://opendata.vancouver.ca/api/v2/catalog/datasets/rapid-transit-stations/exports/geojson?limit=-1&offset=0&timezone=UTC'
transit_data = requests.get(transit_url)
# Fail loudly on an HTTP error instead of handing an error payload to the
# GeoJSON parser, where it would surface as a confusing parsing exception.
transit_data.raise_for_status()

# Same CRS handling as the permits: declare EPSG:4326, reproject to EPSG:3857.
transit_df = gpd.GeoDataFrame.from_features(transit_data.json())
transit_df = transit_df.set_crs('EPSG:4326').to_crs('EPSG:3857')
transit_df.head()

Unnamed: 0,geometry,station,geo_local_area
0,POINT (-13705329.627 6310492.370),MARINE DRIVE,Marpole
1,POINT (-13701087.405 6320080.163),VCC - CLARK,Strathcona
2,POINT (-13698516.956 6317093.472),NANAIMO,Renfrew-Collingwood
3,POINT (-13703496.877 6321341.731),MAIN ST. - SCIENCE WORLD,Downtown
4,POINT (-13704494.065 6322410.505),STADIUM - CHINATOWN,Downtown


In [None]:
# Nearest spatial join: distance from each permit to its closest transit station, in metres.
# The frames are in Web Mercator (EPSG:3857), which stretches distances by roughly
# 1/cos(latitude) (about 1.5x at Vancouver's ~49.2 degrees north). The distance is therefore
# measured in NAD83 / UTM zone 10N (EPSG:26910), a metre-based projection for this area,
# and the result is reprojected back so the geometry column keeps its original CRS.
metric_crs = 'EPSG:26910'
original_crs = permits_df.crs

permits_df = (
    permits_df.to_crs(metric_crs)
    .sjoin_nearest(
        transit_df.to_crs(metric_crs),
        how='left',
        distance_col='DistanceToNearestTransitStation',
    )
    .to_crs(original_crs)
)

# Keep only the columns listed below; this discards the columns the join added
# (index_right, station, geo_local_area).
# NOTE(review): 'Year' is not in this list and is dropped here as well - confirm that is intended.
cols_to_keep_final = ['geometry','ProjectValue','WorkType','Area','PropertyUseRevised','BusinessImprovementArea','FloodPlain','DistanceToNearestTransitStation']
permits_df = permits_df[cols_to_keep_final]

permits_df.head()

Unnamed: 0,geometry,ProjectValue,WorkType,Year,Area,PropertyUseRevised,Business_Improvement_Area,Flood_Plain,index_right,station,geo_local_area,DistanceToNearestTransitStation
1,POINT (-13706059.228 6318634.168),24050.0,Addition,2017,Fairview,Dwelling Uses,0.0,0.0,9,BROADWAY-\nCITY HALL,Mount Pleasant,1355.978344
2,POINT (-13706859.515 6323887.222),11020000.0,Demolition,2018,West End,Service Uses,1.0,0.0,6,BURRARD,Downtown,1265.722394
3,POINT (-13706771.150 6323802.144),321000.0,Demolition,2018,West End,Service Uses,1.0,0.0,6,BURRARD,Downtown,1157.111254
4,POINT (-13695628.548 6313160.040),229030.0,New Building,2017,Killarney,Dwelling Uses,0.0,0.0,11,JOYCE - COLLINGWOOD,Renfrew-Collingwood,2258.767424
5,POINT (-13695639.680 6313113.166),1099810.0,New Building,2017,Killarney,Dwelling Uses,0.0,0.0,11,JOYCE - COLLINGWOOD,Renfrew-Collingwood,2304.458290
...,...,...,...,...,...,...,...,...,...,...,...,...
33930,POINT (-13705756.874 6323046.193),15000.0,Temporary Building,2019,Downtown,Cultural/Recreational Uses,1.0,0.0,20,VANCOUVER\nCITY\nCENTRE,Downtown,304.045153
33932,POINT (-13700989.494 6316830.874),3000.0,Addition,2019,Kensington-Cedar Cottage,Dwelling Uses,0.0,0.0,2,NANAIMO,Renfrew-Collingwood,2486.443242
33933,POINT (-13705292.304 6323172.151),150000.0,Addition,2019,Downtown,Retail Uses,1.0,0.0,5,GRANVILLE,Downtown,58.673277
33935,POINT (-13705292.304 6323172.151),100000.0,Addition,2019,Downtown,Retail Uses,1.0,0.0,5,GRANVILLE,Downtown,58.673277
