In [38]:
import geopandas as gpd
import pandas as pd
from datetime import datetime
import json
import os

import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import folium

In [2]:
# Prevent warning messages from openpyxl
# Courtesy of https://stackoverflow.com/a/66015981/4361039

import warnings
warnings.simplefilter("ignore")

In [165]:
# Function to read census tract boundaries geopackage file
# Returns a geodataframe
def read_geo(year):
    """Load the census-tract boundaries for one census year.

    Reads ``./census-tracts/{year}-toronto-ct.gpkg``, reprojects it to
    EPSG:4326 and keeps just the tract identifier and the geometry.

    Args:
        year: Census year used to build the geopackage file name.

    Returns:
        A frame with exactly two columns, ``ct`` and ``geometry``. ``ct`` is
        a string formatted as a zero-padded, two-decimal number
        (e.g. ``'001.00'``).
    """
    boundaries = gpd.read_file(f'./census-tracts/{year}-toronto-ct.gpkg')
    boundaries = boundaries.to_crs('EPSG:4326')

    # The tract-name column is the first one whose name starts with "ct"
    # (case-insensitive) ...
    ct_like_columns = [
        name for name in boundaries.columns
        if str(name).lower().startswith('ct')
    ]
    first_ct_column = ct_like_columns[0]
    # ... except for 2011, where CTNAME is used explicitly
    # (presumably another ct-prefixed column comes first there - TODO confirm)
    tract_column = 'CTNAME' if year == 2011 else first_ct_column

    # Reduce to the two columns of interest and give them fixed names
    boundaries = boundaries.filter(items=[tract_column, 'geometry'])
    boundaries.columns = ['ct', 'geometry']

    # Normalise tract names to strings such as '012.50' instead of numbers
    boundaries.ct = boundaries.ct.apply(
        lambda value: format(float(value), '06.2f')
    )

    return boundaries


# Function to read a single year of Census data from Excel
# Returns a dataframe
def read_data(year):
    """Load one census year of tract-level data from its Excel workbook.

    Reads every sheet of ``./census-data/QC Completed {year}.xlsx`` (cells
    containing ``'no data'`` become NaN), stacks the sheets whose name starts
    with ``'CT Tab'`` and prints which sheets were used.

    Args:
        year: Census year used to build the workbook file name.

    Returns:
        A DataFrame with lower-cased column names, where ``'ct name'`` is
        renamed to ``'ct'`` and holds zero-padded, two-decimal strings
        (e.g. ``'001.00'``).
    """
    workbook = pd.read_excel(
        f'./census-data/QC Completed {year}.xlsx',
        sheet_name=None,          # None -> dict of every sheet in the file
        na_values=['no data'],
    )

    # Only the sheets named "CT Tab..." carry census tract data
    data_tab_names = [
        sheet for sheet in workbook.keys() if sheet.startswith('CT Tab')
    ]
    tab_list = ', '.join(data_tab_names)
    print(f"Census {year}: reading data from {tab_list}")

    # Stack the selected sheets on top of each other
    combined = pd.concat([workbook[sheet] for sheet in data_tab_names])

    # Lower-case every header, and shorten the tract column to 'ct'
    lowered_labels = (str(label).lower() for label in combined.columns)
    combined.columns = [
        'ct' if label == 'ct name' else label for label in lowered_labels
    ]

    # Normalise tract names to strings such as '012.50' instead of numbers
    combined.ct = combined.ct.apply(
        lambda value: format(float(value), '06.2f')
    )

    return combined


# Reads a single year of census
# Returns a geodataframe
def read_census(year):
    """Combine one census year's tract data with its tract boundaries.

    Args:
        year: Census year passed on to ``read_data`` and ``read_geo``.

    Returns:
        The result of right-merging the boundaries with the data on ``ct``:
        every data row is kept, including rows whose tract has no matching
        boundary.
    """
    tract_table = read_data(year)
    tract_shapes = read_geo(year)

    # how='right' -> the data table decides which tracts appear in the result
    merged = tract_shapes.merge(tract_table, on='ct', how='right')
    return merged

## Read all Census years

In [189]:
# Load every census year, in chronological order, into its own variable
c1951, c1961, c1971, c1981, c1991, c2001, c2011 = [
    read_census(census_year)
    for census_year in (1951, 1961, 1971, 1981, 1991, 2001, 2011)
]

Census 1951: reading data from CT Tab-Etobicoke Twshp, CT Tab-Long Branch, CT Tab-New Toronto, CT Tab-Mimico
Census 1961: reading data from CT Tab - Etobicoke Twshp, CT Tab - Long Branch, CT Tab - New Toronto, CT Tab - Mimico
Census 1971: reading data from CT Tab - Borough of Etobicoke
Census 1981: reading data from CT Tab - Borough of Etobicoke
Census 1991: reading data from CT Tab - City of Etobicoke
Census 2001: reading data from CT Tab - F-Etobicoke
Census 2011: reading data from CT Tabs F-Etobicoke


## Clean up and derive extra variables

In [None]:
# Create a new column with % value for a given numerator
# and denominator, & custom suffix
# Place it straight after the existing variable
def calc_perc(df, num, denom, suffix='%'):
    """Insert a percentage column for `num` relative to `denom`, in place.

    The new column holds ``df[num] / df[denom] * 100`` rounded to one decimal
    and is placed immediately to the right of the `num` column.

    Args:
        df: DataFrame to modify in place.
        num: Name of the numerator column.
        denom: Name of the denominator column.
        suffix: Label appended to `num` to name the new column. A bare label
            is wrapped in parentheses, so the default ``'%'`` yields
            ``"<num> (%)"``; a label that already starts with ``'('`` is used
            verbatim (e.g. ``'(% own. occup.)'``).

    Returns:
        None. `df` gains one column on success and is unchanged on failure.

    Raises:
        KeyError: If `num` is not a column of `df` (the position lookup
            happens before the guarded section). Any failure while computing
            or inserting the column is reported with ``print`` instead of
            being raised.
    """
    col_idx = df.columns.get_loc(num)

    # Previously `suffix` was accepted but ignored; honour it while keeping
    # the default column name "<num> (%)" unchanged.
    label = suffix if str(suffix).startswith('(') else f"({suffix})"

    try:
        df.insert(
            col_idx + 1,
            f"{num} {label}",
            # Element-wise round() is kept deliberately: Series.round() can
            # give different results for values near a rounding boundary.
            (df[num] / df[denom] * 100).apply(lambda x: round(x, 1))
        )
    except Exception as err:
        # Best-effort by design, but do not swallow KeyboardInterrupt /
        # SystemExit, and say why the column could not be created.
        print(
            f"Could not create a % variable for {num} "
            f"({type(err).__name__}: {err})"
        )

In [191]:
# Reload the 1951 census; the % columns below are added to this fresh frame
c1951 = read_census(1951)

# Variables expressed as a share of the tract population
# (origin groups plus one religion variable).
# NOTE(review): spellings presumably mirror the spreadsheet headers - do not
# "correct" them here without checking the workbook.
population_share_vars = (
    '22 british isles origine',
    '23 french',
    '24 german',
    '25 italian',
    '26 jewish',
    '27 netherlands',
    '28 polish',
    '29 russian',
    '30 scandinavian',
    '31 ukraininian',
    '32 other european',
    '33 asiatic',
    '34 other and not stated',
    '46 roman catholic',
)
for var_name in population_share_vars:
    calc_perc(c1951, var_name, '1 population, 1951')

# Variables expressed as a share of occupied dwellings (households)
dwelling_share_vars = (
    '12 [households] with lodgers',
    '29 [occupied dwellings] single detached',
    '30 [occupied dwellings] apartments and flats',
    '33 [occupied dwellings] owner occupied',
    '35 [occupied dwellings] tenant-occupied',
    '37 [year of occupancy:] before 1946',
    '38 [year of occupancy:] 1946-1949',
    '39 [year of occupancy:] 1950-51',
    '50 [occupied dwellings] passenger automobile',
)
for var_name in dwelling_share_vars:
    calc_perc(c1951, var_name, '1 occupied dwellings (households)')

# Mortgages are a share of owner-occupied dwellings, not of all dwellings
calc_perc(
    c1951,
    '34 [occupied dwellings, owner occupied] reporting a mortgage',
    '33 [occupied dwellings] owner occupied',
    suffix='(% own. occup.)'
)

Census 1951: reading data from CT Tab-Etobicoke Twshp, CT Tab-Long Branch, CT Tab-New Toronto, CT Tab-Mimico


## Save map data

In [192]:
# Census year -> frame for every year that gets exported
dfs = { 1951: c1951, 1961: c1961, 1971: c1971, 1981: c1981, 1991: c1991, 2001: c2001, 2011: c2011 }

# year -> variable -> min/max (as strings) for the choropleth.
# The first two columns (ct, geometry) are skipped.
data = {
    str(y): {
        str(v): {
            'min': str(df.loc[:, v].min()),
            'max': str(df.loc[:, v].max())
        } for v in df.columns[2:]
    } for y, df in dfs.items()
}

# Make sure the output folders exist before anything is written
os.makedirs('./map/geojson', exist_ok=True)

# Write the metadata directly instead of going through `echo '...' > file`:
# a single quote in any column name would break the shell command, and a
# failed shell call would go unnoticed. The trailing newline matches what
# echo produced.
with open('./map/metadata.js', 'w', encoding='utf-8') as metadata_file:
    metadata_file.write(f"const metadata = {json.dumps(data)}\n")

# One GeoJSON file per census year
for y, df in dfs.items():
    df.to_file(f'./map/geojson/{y}.geojson', driver='GeoJSON')

## Generate static maps

In [6]:
#fields = list(c1951.columns[2:])

#for field in fields:
#    fig = generate_map(c1951, field, 1951)
#    fig.savefig(f'output/1951-{field}.pdf', dpi=300)