In [1]:
import os 
import duckdb
import pandas as pd
import numpy as np

In [2]:
con = duckdb.connect(database=':memory:', read_only=False)
con.sql("INSTALL spatial;")
con.sql("INSTALL json;")

con.sql("LOAD spatial;")
con.sql("LOAD json;")

In [3]:
# create schema and set defaults 
con.sql("CREATE SCHEMA analytics;")

In [5]:
# create the neighborhood table

neighborhood_table = """
CREATE OR REPLACE TABLE analytics.neighborhood_geo_data AS
WITH raw_data AS (
    -- Reads the GeoJSON file and converts it into a nested table structure
    SELECT * FROM read_json_auto('source-files/Neighbourhoods.geojson')
),
unnested_features AS (
    -- Flattens the 'features' array into individual rows
    SELECT unnest(raw_data.features) AS feature_struct FROM raw_data
)
SELECT
    -- 1. typedefs 
    feature_struct.properties._id,
    feature_struct.properties.AREA_ID::BIGINT AS AREA_ID,
    feature_struct.properties.AREA_ATTR_ID::BIGINT AS AREA_ATTR_ID,
    feature_struct.properties.PARENT_AREA_ID,
    feature_struct.properties.AREA_SHORT_CODE::VARCHAR AS AREA_SHORT_CODE,
    feature_struct.properties.AREA_LONG_CODE::VARCHAR AS AREA_LONG_CODE,
    feature_struct.properties.AREA_NAME::VARCHAR AS AREA_NAME,
    feature_struct.properties.AREA_DESC,
    feature_struct.properties.CLASSIFICATION::VARCHAR AS CLASSIFICATION,
    feature_struct.properties.CLASSIFICATION_CODE::VARCHAR AS CLASSIFICATION_CODE,
    feature_struct.properties.OBJECTID::DOUBLE AS OBJECTID,
    
    -- 2. Convert the GeoJSON geometry into a WKT string
    --    FIX: Use to_json() to convert the STRUCT to a valid JSON string
    ST_AsText(ST_GeomFromGeoJSON(to_json(feature_struct.geometry))) AS geometry_wkt,
    
    -- 3. Keep the geometry type for reference (simplified access)
    feature_struct.geometry.type AS geometry_type
FROM unnested_features;
"""

con.sql(neighborhood_table)

In [6]:
# Check the table structure and data types
con.sql("PRAGMA table_info('analytics.neighborhood_geo_data')").show()

# Alternatively:
# con.sql("DESCRIBE analytics.neighborhood_geo_data").show()

┌───────┬─────────────────────┬─────────┬─────────┬────────────┬─────────┐
│  cid  │        name         │  type   │ notnull │ dflt_value │   pk    │
│ int32 │       varchar       │ varchar │ boolean │  varchar   │ boolean │
├───────┼─────────────────────┼─────────┼─────────┼────────────┼─────────┤
│     0 │ _id                 │ BIGINT  │ false   │ NULL       │ false   │
│     1 │ AREA_ID             │ BIGINT  │ false   │ NULL       │ false   │
│     2 │ AREA_ATTR_ID        │ BIGINT  │ false   │ NULL       │ false   │
│     3 │ PARENT_AREA_ID      │ JSON    │ false   │ NULL       │ false   │
│     4 │ AREA_SHORT_CODE     │ VARCHAR │ false   │ NULL       │ false   │
│     5 │ AREA_LONG_CODE      │ VARCHAR │ false   │ NULL       │ false   │
│     6 │ AREA_NAME           │ VARCHAR │ false   │ NULL       │ false   │
│     7 │ AREA_DESC           │ VARCHAR │ false   │ NULL       │ false   │
│     8 │ CLASSIFICATION      │ VARCHAR │ false   │ NULL       │ false   │
│     9 │ CLASSIFICATION_

In [7]:
# View the first 5 rows and focus on the WKT
con.sql("""
    SELECT 
        AREA_NAME, 
        geometry_type, 
        LEFT(geometry_wkt, 60) AS geometry_wkt_sample, 
        LENGTH(geometry_wkt) AS wkt_length
    FROM analytics.neighborhood_geo_data 
    LIMIT 5
""").show()

┌───────────────────────────┬───────────────┬──────────────────────────────────────────────────────────────┬────────────┐
│         AREA_NAME         │ geometry_type │                     geometry_wkt_sample                      │ wkt_length │
│          varchar          │    varchar    │                           varchar                            │   int64    │
├───────────────────────────┼───────────────┼──────────────────────────────────────────────────────────────┼────────────┤
│ South Eglinton-Davisville │ MultiPolygon  │ MULTIPOLYGON (((-79.3863510515018 43.6978312650188, -79.3862 │       1949 │
│ North Toronto             │ MultiPolygon  │ MULTIPOLYGON (((-79.3974366551459 43.7069299169967, -79.3983 │       1018 │
│ Dovercourt Village        │ MultiPolygon  │ MULTIPOLYGON (((-79.4341131654386 43.660145276359, -79.43536 │       2766 │
│ Junction-Wallace Emerson  │ MultiPolygon  │ MULTIPOLYGON (((-79.4387000029275 43.6676608052133, -79.4384 │       6104 │
│ Yonge-Bay Corridor    

In [8]:
con.sql("SELECT COUNT(*) FROM analytics.neighborhood_geo_data").show() # should be 158 rows 

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│          158 │
└──────────────┘



In [9]:
# make the PK using neighborhood lists 

# Trinity-Bellwoods, West Queen West Area (West-Central Core)
tor_c01 = ['Dufferin Grove', 'Palmerston-Little Italy', 'Trinity-Bellwoods', 'University', 
           'Kensington-Chinatown', 
           'West Queen West'] # **ADDED**
# NOTE: Wellington Place is moved to c08 for closer fit with core/waterfront

tor_c02 = ['Wychwood', 'Casa Loma', 'Annex', 'Yonge-St.Clair']
tor_c03 = ['Humewood-Cedarvale', 'Forest Hill South', 'Yonge-Eglinton']

# Uptown (Lawrence Park Area)
tor_c04 = ['Bedford Park-Nortown', 'Lawrence Park North', 'Lawrence Park South', 
           'Englemount-Lawrence', 'Forest Hill North',
           'North Toronto'] # **ADDED**
           
tor_c06 = ['Bathurst Manor', 'Clanton Park']
tor_c07 = ['Newtonbrook West', 'Newtoonbrook East', 'Willowdale West', 'Lansing-Westgate']

# Downtown Core, Waterfront, East Core (The main hub)
tor_c08 = ['North St.James Town', 'Cabbagetown-South St.James Town', 'Regent Park', 
           'Moss Park', 'Yonge-Bay Corridor', 
           'Bay-Cloverhill', 'Downtown Yonge East', 'Church-Wellesley', 
           'St Lawrence-East Bayfront-The Islands', 'Harbourfront-CityPlace', 
           'Wellington Place'] # **ADDED** (All Core/Waterfront)

# Rosedale Area
tor_c09 = ['Rosedale-Moore Park', 
           'Mount Pleasant East'] # **ADDED**

# Midtown North (Yonge/Eglinton Area)
tor_c10 = ['Mount Pleasant West', 
           'South Eglinton-Davisville'] # **ADDED**
           
tor_c11 = ['Flemingdon Park', 'Thorncliffe Park', 'Leaside-Bennington']
tor_c12 = ['St.Andrew-Windfields', 'Bridle Path-Sunnybrook-York Mills']
tor_c13 = ["Parkwoods-O'Connor Hills", "Banbury-Don Mills", "Victoria Village", "Fenside-Parkwoods"]

# North York Yonge Corridor (Closest to Avondale/Yonge-Doris)
tor_c14 = ['NewtonBrook East', 'East Willowdale', 
           'Avondale', 'Yonge-Doris'] # **ADDED**
           
tor_c15 = ['Bayview Village', 'Don Valley Village', 'Hillcrest Village', 'Bayview Woods-Steeles', 'Pleasant View', 'Henry Farm']
# missing islands and waterfront community 

# Far East Scarborough (Rouge/Malvern Area)
tor_e11 = ['West Rouge', 'Malvern East', 'Malvern West',
           'Morningside Heights'] # **ADDED**
           
tor_e10 = ['Highland Creek', 'Centennial Scarborough', 'West Hill']
tor_e09 = ['Morningside', 'Woburn North', 'Golfdale-Cedarbrae-Woburn', 'Bendale South', 'Bendale-Glen Andrew']
tor_e08 = ['Guildwood', 'Scarborough Village', 'Cliffcrest', 'Eglinton East']
tor_e07 = ['Milliken', 'Agincourt North', 'Agincourt South-Malvern West']

# Scarborough/North York East (L'Amoreaux Area)
tor_e05 = ["Steeles", "East L'Amoreaux", "L'Amoreaux West", "Tam O'Shanter-Sullivan",
           'Newtonbrook East'] # **ADDED**

tor_e06 = ['Oakridge', 'Birchcliffe-Cliffside']

# East York (O'Connor Area)
tor_e03 = ["O'Connor-Parkview", "Old East York", "Danforth East York", "Danforth", "Broadview North", 
           "Playter Estates-Danforth", "Woodbine-Lumsden",
           'Taylor-Massey'] # **ADDED**
           
tor_e02 = ['The Beaches', 'East End-Danforth', 'Woodbine Corridor', 'Greenwood-Coxwell']
tor_e01 = ['North Riverdale', 'Blake-Jones', 'South Riverdale']
tor_e04 = ['Wexford/Maryvale', 'Dorset Park', 'Ionview', 'Kennedy Park', 'Clairlea-Birchmount']

# Etobicoke North
tor_w10 = ['West Humber-Clairville', 'Mount Olive-Silverstone-Jamestown', 'Rexdale-Kipling', 'Elms-Old Rexdale',
           'Thistletown-Beaumond Heights'] # **ADDED**

# Etobicoke Centre
tor_w09 = ['Kingsview Village-The Westway', 'Humber Heights-Westmount', 
           'Wilowridge-Martingrove-Richview',
           'Willowridge-Martingrove-Richview'] # **ADDED**

# Etobicoke Centre/Lakeshore (Islington Area)
tor_w08 = ['Eringate-Centennial-West Deane', 'Markland Wood', 'Princess-Rosethorn', 
           'Edenbridge-Humber Valley', 'Kingsway South', 'Etobicoke West Mall', 
           'Etobicoke City Centre',
           'Islington'] # **ADDED**

tor_w07 = ['Stonegate-Queensway']

# Etobicoke Lakeshore (Humber Bay Area)
tor_w06 = ['Alderwood', 'Long Branch', 'New Toronto', 'Mimico-Queensway',
           'Humber Bay Shores'] # **ADDED**

# North York West (Jane/Finch & Downsview Areas)
tor_w05 = ['Humber Summit', 'Humbermede', 'Downsview', 
           'Glenfield-Jane Heights', 'Black Creek', 'Fort York-Liberty Village',
           'Oakdale-Beverley Heights', 'Westminster-Branson', 'York University Heights'] # **ADDED**

tor_w04 = ['Pelmo Park-Humberlea', 'Weston', 'Mount Dennis', 'Rustic', 
           'Maple Leaf', 'Yorkdale-Glen Park', 'Briar Hill-Belgravia', 
           'Brookhaven-Amesbury', 'Beechborough-Greenbrook']

# York South-Weston (Oakwood Area)
tor_w03 = ['Rockcliffe-Smythe', 'Keelesdale-Eglinton West', 'Caledonia-Fairbank', 
           'Corso Italia-Davenport', 'Weston-Pelham Park',
           'Oakwood Village'] # **ADDED**

# Parkdale-High Park (Junction Area)
tor_w02 = ['Lambton Baby Point', 'Runnymede-Bloor West Village', 'Junction Area', 
           'High Park North', 'Dovercourt Village',
           'Junction-Wallace Emerson'] # **ADDED**

# Parkdale-High Park (Little Portugal Area)
tor_w01 = ['High Park-Swansea', 'Roncesvalles', 'South Parkdale',
           'Little Portugal'] # **ADDED**

In [10]:
all_classif = [
    (tor_w01, 'Toronto W01'),
    (tor_w02, 'Toronto W02'),
    (tor_w03, 'Toronto W03'),
    (tor_w04, 'Toronto W04'),
    (tor_w05, 'Toronto W05'),
    (tor_w06, 'Toronto W06'),
    (tor_w07, 'Toronto W07'),
    (tor_w08, 'Toronto W08'),
    (tor_w09, 'Toronto W09'),
    (tor_w10, 'Toronto W10'),
    (tor_e01, 'Toronto E01'),
    (tor_e02, 'Toronto E02'),
    (tor_e03, 'Toronto E03'),
    (tor_e04, 'Toronto E04'),
    (tor_e05, 'Toronto E05'),
    (tor_e06, 'Toronto E06'),
    (tor_e07, 'Toronto E07'),
    (tor_e08, 'Toronto E08'),
    (tor_e09, 'Toronto E09'),
    (tor_e10, 'Toronto E10'),
    (tor_e11, 'Toronto E11'), 
    (tor_c01, 'Toronto C01'),
    (tor_c02, 'Toronto C02'),
    (tor_c03, 'Toronto C03'),
    (tor_c04, 'Toronto C04'),
    (tor_c06, 'Toronto C06'),
    (tor_c07, 'Toronto C07'),
    (tor_c08, 'Toronto C08'),
    (tor_c09, 'Toronto C09'),
    (tor_c10, 'Toronto C10'),
    (tor_c11, 'Toronto C11'),
    (tor_c12, 'Toronto C12'),
    (tor_c13, 'Toronto C13'),
    (tor_c14, 'Toronto C14'),
    (tor_c15, 'Toronto C15'),
]

print(all_classif)

[(['High Park-Swansea', 'Roncesvalles', 'South Parkdale', 'Little Portugal'], 'Toronto W01'), (['Lambton Baby Point', 'Runnymede-Bloor West Village', 'Junction Area', 'High Park North', 'Dovercourt Village', 'Junction-Wallace Emerson'], 'Toronto W02'), (['Rockcliffe-Smythe', 'Keelesdale-Eglinton West', 'Caledonia-Fairbank', 'Corso Italia-Davenport', 'Weston-Pelham Park', 'Oakwood Village'], 'Toronto W03'), (['Pelmo Park-Humberlea', 'Weston', 'Mount Dennis', 'Rustic', 'Maple Leaf', 'Yorkdale-Glen Park', 'Briar Hill-Belgravia', 'Brookhaven-Amesbury', 'Beechborough-Greenbrook'], 'Toronto W04'), (['Humber Summit', 'Humbermede', 'Downsview', 'Glenfield-Jane Heights', 'Black Creek', 'Fort York-Liberty Village', 'Oakdale-Beverley Heights', 'Westminster-Branson', 'York University Heights'], 'Toronto W05'), (['Alderwood', 'Long Branch', 'New Toronto', 'Mimico-Queensway', 'Humber Bay Shores'], 'Toronto W06'), (['Stonegate-Queensway'], 'Toronto W07'), (['Eringate-Centennial-West Deane', 'Markland

In [11]:
classif_df = pd.DataFrame(
    all_classif,
    columns=['AREA_NAME_KEY', 'Region_classif'] # none of the area name keys and regions are null so no checks needed 
)
classif_df.head()
con.register('classif_lookup', classif_df)

<_duckdb.DuckDBPyConnection at 0x7f312c7277f0>

In [12]:
# left join for analytics.neighborhood_geo_data
final_classification_query = """
CREATE OR REPLACE TABLE analytics.neighborhood_classified AS
SELECT
    t1.*, -- Select all columns from the main neighborhood table
    t2_flat.Region_classif -- Get the classification ID
FROM analytics.neighborhood_geo_data AS t1
LEFT JOIN 
(
    -- CTE: Flatten the lookup table by unnesting the array column
    SELECT 
        unnest(AREA_NAME_KEY) AS Single_Area_Name, -- UNNEST converts the array into many rows
        Region_classif
    FROM classif_lookup
) AS t2_flat 
    -- Now, join the single area name to the main table's area name
    ON t1.AREA_NAME = t2_flat.Single_Area_Name;
"""

con.sql(final_classification_query)

# Verification Check
con.sql("SELECT AREA_NAME, Region_classif FROM analytics.neighborhood_classified WHERE Region_classif IS NOT NULL").show()
# there are 158 rows, but 159 entries after joining. 

┌───────────────────────────────────┬────────────────┐
│             AREA_NAME             │ Region_classif │
│              varchar              │    varchar     │
├───────────────────────────────────┼────────────────┤
│ South Eglinton-Davisville         │ Toronto C10    │
│ North Toronto                     │ Toronto C04    │
│ Dovercourt Village                │ Toronto W02    │
│ Junction-Wallace Emerson          │ Toronto W02    │
│ Yonge-Bay Corridor                │ Toronto C08    │
│ Bay-Cloverhill                    │ Toronto C08    │
│ Bendale-Glen Andrew               │ Toronto E09    │
│ Downsview                         │ Toronto W05    │
│ Oakdale-Beverley Heights          │ Toronto W05    │
│ Avondale                          │ Toronto C14    │
│    ·                              │      ·         │
│    ·                              │      ·         │
│    ·                              │      ·         │
│ Kingsview Village-The Westway     │ Toronto W09    │
│ Elms-Old

In [13]:
# check for duplicates 
query = f"""
SELECT
    AREA_NAME,
    COUNT(AREA_NAME) AS duplicate_count
FROM analytics.neighborhood_classified
GROUP BY AREA_NAME
HAVING COUNT(AREA_NAME) > 1
ORDER BY duplicate_count DESC;
"""

con.sql(query).show()

┌───────────┬─────────────────┐
│ AREA_NAME │ duplicate_count │
│  varchar  │      int64      │
├───────────┴─────────────────┤
│           0 rows            │
└─────────────────────────────┘



In [14]:
# strategy: use the classif table and do left joins again with the target variable 

# load the rentals first
csv_load_query = """
CREATE OR REPLACE TABLE analytics.rental_table AS
SELECT * FROM read_csv_auto('source-files/rental_data_extraction.csv');
""" 

con.sql(csv_load_query)

join_query = """
CREATE OR REPLACE TABLE analytics.neighborhoods AS
SELECT
    t1.*, -- all the rows of t1 
    t2.*
FROM analytics.neighborhood_classified AS t1
LEFT JOIN 
    analytics.rental_table AS t2 
    ON t1.Region_classif = t2.Area
"""

con.sql(join_query)
con.sql("PRAGMA table_info('analytics.neighborhoods')").show()

┌───────┬───────────────────────────┬─────────┬─────────┬────────────┬─────────┐
│  cid  │           name            │  type   │ notnull │ dflt_value │   pk    │
│ int32 │          varchar          │ varchar │ boolean │  varchar   │ boolean │
├───────┼───────────────────────────┼─────────┼─────────┼────────────┼─────────┤
│     0 │ _id                       │ BIGINT  │ false   │ NULL       │ false   │
│     1 │ AREA_ID                   │ BIGINT  │ false   │ NULL       │ false   │
│     2 │ AREA_ATTR_ID              │ BIGINT  │ false   │ NULL       │ false   │
│     3 │ PARENT_AREA_ID            │ JSON    │ false   │ NULL       │ false   │
│     4 │ AREA_SHORT_CODE           │ VARCHAR │ false   │ NULL       │ false   │
│     5 │ AREA_LONG_CODE            │ VARCHAR │ false   │ NULL       │ false   │
│     6 │ AREA_NAME                 │ VARCHAR │ false   │ NULL       │ false   │
│     7 │ AREA_DESC                 │ VARCHAR │ false   │ NULL       │ false   │
│     8 │ CLASSIFICATION    

In [None]:
# create the tables for transit, crime and parks 
