# Master Dataset Pipeline
## Processing 14 Years of Toronto Open Data (2010-2024)

This notebook processes all CSV files from `data/raw_data` into a unified longitudinal dataset optimized for XGBoost time-series forecasting.


In [3]:
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set paths (notebook is in data/ directory)
RAW_DATA_DIR = Path('raw_data')
PROCESSED_DATA_DIR = Path('processed_data')
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

print("Setup complete!")


Setup complete!


## Step 1: Data Ingestion & Schema Alignment


In [4]:
# Step 1: Loop through all CSV files and standardize
all_dataframes = []
years_processed = []

# Get all CSV files matching the pattern 20XX.csv
csv_files = sorted([f for f in RAW_DATA_DIR.glob('20*.csv')])
print(f"Found {len(csv_files)} CSV files to process")

for csv_file in csv_files:
    # Extract year from filename
    year_match = re.search(r'(\d{4})\.csv$', csv_file.name)
    if not year_match:
        print(f"Warning: Could not extract year from {csv_file.name}, skipping")
        continue
    
    year = int(year_match.group(1))
    print(f"\nProcessing {year}...")
    
    try:
        df = pd.read_csv(csv_file, low_memory=False)
        
        # Standardize AREA_NAME column
        # Find the area name column (could be AREA_NAME, Area, or similar)
        area_col = None
        for col in df.columns:
            if 'area' in col.lower() and 'name' in col.lower():
                area_col = col
                break
        
        if area_col and area_col != 'AREA_NAME':
            df = df.rename(columns={area_col: 'AREA_NAME'})
        
        if 'AREA_NAME' not in df.columns:
            print(f"  Warning: No AREA_NAME column found in {year}, using first column")
            df = df.rename(columns={df.columns[0]: 'AREA_NAME'})
        
        # Clean AREA_NAME: strip whitespace and convert to consistent casing
        df['AREA_NAME'] = df['AREA_NAME'].astype(str).str.strip().str.title()
        
        # Remove garbage rows (rows containing '=======' or similar metadata)
        mask = df['AREA_NAME'].str.contains('={3,}', na=False, regex=True)
        df = df[~mask]
        
        # Remove rows where AREA_NAME is empty or NaN
        df = df[df['AREA_NAME'].notna() & (df['AREA_NAME'] != '')]
        
        # Add YEAR column
        df['YEAR'] = year
        
        all_dataframes.append(df)
        years_processed.append(year)
        print(f"  Loaded {len(df)} rows")
        
    except Exception as e:
        print(f"  Error processing {year}: {e}")
        continue

print(f"\n\nTotal years processed: {len(years_processed)}")
print(f"Years: {sorted(years_processed)}")


Found 15 CSV files to process

Processing 2010...
  Loaded 158 rows

Processing 2011...
  Loaded 158 rows

Processing 2012...
  Loaded 158 rows

Processing 2013...
  Loaded 158 rows

Processing 2014...
  Loaded 158 rows

Processing 2015...
  Loaded 158 rows

Processing 2016...
  Loaded 158 rows

Processing 2017...
  Loaded 317 rows

Processing 2018...
  Loaded 158 rows

Processing 2019...
  Loaded 158 rows

Processing 2020...
  Loaded 158 rows

Processing 2021...
  Loaded 158 rows

Processing 2022...
  Loaded 158 rows

Processing 2023...
  Loaded 158 rows

Processing 2024...
  Loaded 158 rows


Total years processed: 15
Years: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


In [5]:
# Step 2: Strip year suffixes from columns and normalize
normalized_dfs = []

for df in all_dataframes:
    year = df['YEAR'].iloc[0]
    df_normalized = df.copy()
    
    # Create mapping for column renaming
    rename_dict = {}
    
    for col in df.columns:
        # Skip YEAR and AREA_NAME columns
        if col in ['YEAR', 'AREA_NAME']:
            continue
        
        # Check if column ends with _20XX pattern
        year_pattern = re.search(r'_(\d{4})$', col)
        if year_pattern:
            # Strip the year suffix
            new_col = col[:-5]  # Remove '_YYYY'
            rename_dict[col] = new_col
    
    # Apply renaming
    df_normalized = df_normalized.rename(columns=rename_dict)
    
    normalized_dfs.append(df_normalized)

print(f"Normalized {len(normalized_dfs)} dataframes")


Normalized 15 dataframes


## Step 3: Advanced Rent Feature Engineering


In [6]:
# Step 3: Rent feature engineering
rent_engineered_dfs = []

for df in normalized_dfs:
    df_rent = df.copy()
    
    # Find 1-BR rent columns that might be split by quarter
    # Look for columns like: 1_bedroom_avg_lease_rate_q1, q2, q3, q4
    q1_cols = [col for col in df_rent.columns if '1' in col.lower() and 'bedroom' in col.lower() 
               and 'avg' in col.lower() and ('lease' in col.lower() or 'rent' in col.lower())
               and ('q1' in col.lower() or 'quarter' in col.lower())]
    
    # Also check for standard naming: 1_bed_room_avg_lease_rate, 1_bedrooms_avg_lease_rate
    standard_1br_cols = [col for col in df_rent.columns if 
                        (('1_bed' in col.lower() or '1bed' in col.lower()) 
                         and ('avg' in col.lower() or 'average' in col.lower())
                         and ('lease' in col.lower() or 'rent' in col.lower()))]
    
    # If we have quarterly columns, aggregate them
    if q1_cols:
        # Find all quarter columns
        quarter_cols = []
        for q in ['q1', 'q2', 'q3', 'q4']:
            q_cols = [col for col in df_rent.columns if q in col.lower() and '1' in col.lower() 
                     and 'bedroom' in col.lower() and 'avg' in col.lower()]
            quarter_cols.extend(q_cols)
        
        if quarter_cols:
            # Calculate row-wise mean
            df_rent['avg_rent_1br'] = df_rent[quarter_cols].mean(axis=1)
        else:
            # Use standard column if available
            if standard_1br_cols:
                df_rent['avg_rent_1br'] = df_rent[standard_1br_cols[0]]
            else:
                df_rent['avg_rent_1br'] = np.nan
    else:
        # Use standard column naming
        if standard_1br_cols:
            df_rent['avg_rent_1br'] = df_rent[standard_1br_cols[0]]
        else:
            # Try alternative naming patterns
            alt_cols = [col for col in df_rent.columns if '1' in str(col) and 'bed' in str(col).lower() 
                       and ('rate' in str(col).lower() or 'rent' in str(col).lower())]
            if alt_cols:
                df_rent['avg_rent_1br'] = df_rent[alt_cols[0]]
            else:
                df_rent['avg_rent_1br'] = np.nan
    
    # Keep other rent types
    # Bachelor rent
    bachelor_cols = [col for col in df_rent.columns if 'bachelor' in col.lower() 
                    and ('avg' in col.lower() or 'average' in col.lower())
                    and ('lease' in col.lower() or 'rent' in col.lower())]
    if bachelor_cols:
        df_rent['bachelor_avg_lease_rate'] = df_rent[bachelor_cols[0]]
    
    # 2-bedroom rent
    two_br_cols = [col for col in df_rent.columns if '2' in col and 'bedroom' in col.lower() 
                  and ('avg' in col.lower() or 'average' in col.lower())
                  and ('lease' in col.lower() or 'rent' in col.lower())]
    if two_br_cols:
        df_rent['2_bedrooms_avg_lease_rate'] = df_rent[two_br_cols[0]]
    
    # Volume tracking: 1_bedrooms_leased
    leased_cols = [col for col in df_rent.columns if '1' in col and 'bedroom' in col.lower() 
                  and 'leased' in col.lower()]
    if leased_cols:
        df_rent['1_bedrooms_leased'] = df_rent[leased_cols[0]]
    
    rent_engineered_dfs.append(df_rent)

print(f"\nAfter Step 3, column count check:")
if rent_engineered_dfs:
    print(f"Sample dataframe has {len(rent_engineered_dfs[0].columns)} columns")
    print(f"Columns: {list(rent_engineered_dfs[0].columns)[:15]}...")



After Step 3, column count check:
Sample dataframe has 43 columns
Columns: ['AREA_NAME', 'CLASSIFICATION', 'CLASSIFICATION_CODE', 'geometry_wkt', 'geometry_type', 'Area', 'bachelor_avg_lease_rate', '1_bedrooms_leased', '1_bed_room_avg_lease_rate', '2_bedrooms_leased', '2_bedrooms_avg_lease_rate', '3_bedrooms_leased', '3_bedrooms_avg_lease_rate', 'area_sq_meters', 'perimeter_meters']...


## Step 4: Population and Geography Proxying


In [7]:
# Step 4: Use 2024 as source of truth for static geographic data
# Find 2024 dataframe
df_2024 = None
for df in rent_engineered_dfs:
    if df['YEAR'].iloc[0] == 2024:
        df_2024 = df.copy()
        break

if df_2024 is None:
    raise ValueError("2024 dataset not found!")

# Extract static geographic columns from 2024
static_cols = ['POPULATION', 'area_sq_meters', 'transit_line_density', 
               'avg_stop_frequency', 'distinct_route_count']

# Find actual column names (may have variations)
static_mapping = {}
for target_col in static_cols:
    # Try exact match first
    if target_col in df_2024.columns:
        static_mapping[target_col] = target_col
    else:
        # Try case-insensitive and partial matches
        matches = [col for col in df_2024.columns 
                 if target_col.lower() in col.lower() or col.lower() in target_col.lower()]
        if matches:
            static_mapping[target_col] = matches[0]
        else:
            print(f"Warning: Could not find column matching '{target_col}'")

print(f"Static columns mapping: {static_mapping}")

# Create lookup dictionary: AREA_NAME -> static values
static_lookup = {}
for idx, row in df_2024.iterrows():
    area_name = row['AREA_NAME']
    static_lookup[area_name] = {}
    for target_col, source_col in static_mapping.items():
        static_lookup[area_name][target_col] = row[source_col]

# Broadcast 2024 values to all previous years
geography_proxied_dfs = []
for df in rent_engineered_dfs:
    df_geo = df.copy()
    
    # Add static columns
    for target_col in static_cols:
        if target_col not in df_geo.columns:
            df_geo[target_col] = df_geo['AREA_NAME'].map(
                lambda x: static_lookup.get(x, {}).get(target_col, np.nan) if pd.notna(x) else np.nan
            )
    
    geography_proxied_dfs.append(df_geo)

print(f"\nBroadcasted static geography data to {len(geography_proxied_dfs)} dataframes")


Static columns mapping: {'POPULATION': 'POPULATION', 'area_sq_meters': 'area_sq_meters', 'transit_line_density': 'transit_line_density', 'avg_stop_frequency': 'avg_stop_frequency', 'distinct_route_count': 'distinct_route_count'}

Broadcasted static geography data to 15 dataframes


## Step 5: Temporal Feature Engineering (The "Memory")


In [8]:
# Step 5: Combine all dataframes and create temporal features
# Combine all dataframes
combined_df = pd.concat(geography_proxied_dfs, ignore_index=True)

# Sort by AREA_NAME and YEAR
combined_df = combined_df.sort_values(['AREA_NAME', 'YEAR']).reset_index(drop=True)

print(f"Combined dataframe shape: {combined_df.shape}")
print(f"Unique neighborhoods: {combined_df['AREA_NAME'].nunique()}")
print(f"Year range: {combined_df['YEAR'].min()} - {combined_df['YEAR'].max()}")

# Create lag features grouped by AREA_NAME
combined_df['rent_lag_1'] = combined_df.groupby('AREA_NAME')['avg_rent_1br'].shift(1)
combined_df['rent_lag_2'] = combined_df.groupby('AREA_NAME')['avg_rent_1br'].shift(2)

# Calculate rent growth rate
combined_df['rent_growth_rate'] = (
    (combined_df['avg_rent_1br'] - combined_df['rent_lag_1']) / combined_df['rent_lag_1'] * 100
)
# Handle division by zero
combined_df['rent_growth_rate'] = combined_df['rent_growth_rate'].replace([np.inf, -np.inf], np.nan)

print(f"\nTemporal features created:")
print(f"  - rent_lag_1: {combined_df['rent_lag_1'].notna().sum()} non-null values")
print(f"  - rent_lag_2: {combined_df['rent_lag_2'].notna().sum()} non-null values")
print(f"  - rent_growth_rate: {combined_df['rent_growth_rate'].notna().sum()} non-null values")


Combined dataframe shape: (2529, 93)
Unique neighborhoods: 159
Year range: 2010 - 2024

Temporal features created:
  - rent_lag_1: 2370 non-null values
  - rent_lag_2: 2212 non-null values
  - rent_growth_rate: 2370 non-null values


## Step 6: Creating the Machine Learning Label (The "Answer")


In [9]:
# Step 6: Create TARGET_RENT_5YR column
# For each row, find the avg_rent_1br value from 5 years later

# Create a lookup: (AREA_NAME, YEAR) -> avg_rent_1br
rent_lookup = {}
for idx, row in combined_df.iterrows():
    key = (row['AREA_NAME'], row['YEAR'])
    rent_lookup[key] = row['avg_rent_1br']

# Create TARGET_RENT_5YR
combined_df['TARGET_RENT_5YR'] = combined_df.apply(
    lambda row: rent_lookup.get((row['AREA_NAME'], row['YEAR'] + 5), np.nan),
    axis=1
)

# Verification: Check a 2012 row shows 2017 rent
print("\nVerification - Sample 2012 rows:")
sample_2012 = combined_df[combined_df['YEAR'] == 2012][['AREA_NAME', 'YEAR', 'avg_rent_1br', 'TARGET_RENT_5YR']].head()
print(sample_2012)

print("\nVerification - Sample 2017 rows (should match TARGET_RENT_5YR from 2012):")
if len(sample_2012) > 0:
    test_area = sample_2012.iloc[0]['AREA_NAME']
    sample_2017 = combined_df[(combined_df['YEAR'] == 2017) & (combined_df['AREA_NAME'] == test_area)]
    if len(sample_2017) > 0:
        print(f"Area: {test_area}")
        print(f"2012 TARGET_RENT_5YR: {sample_2012.iloc[0]['TARGET_RENT_5YR']}")
        print(f"2017 avg_rent_1br: {sample_2017.iloc[0]['avg_rent_1br']}")
        print(f"Match: {abs(sample_2012.iloc[0]['TARGET_RENT_5YR'] - sample_2017.iloc[0]['avg_rent_1br']) < 0.01 if pd.notna(sample_2012.iloc[0]['TARGET_RENT_5YR']) else 'N/A (NaN)'}")

print(f"\nTARGET_RENT_5YR statistics:")
print(f"  Non-null values: {combined_df['TARGET_RENT_5YR'].notna().sum()}")
print(f"  Null values: {combined_df['TARGET_RENT_5YR'].isna().sum()}")



Verification - Sample 2012 rows:
                       AREA_NAME  YEAR  avg_rent_1br  TARGET_RENT_5YR
2                Agincourt North  2012       1272.75      1644.833333
18  Agincourt South-Malvern West  2012       1272.75      1644.833333
34                     Alderwood  2012       1520.50      2141.166667
50                         Annex  2012       2045.75      3742.000000
66                      Avondale  2012       1490.00      2175.000000

Verification - Sample 2017 rows (should match TARGET_RENT_5YR from 2012):
Area: Agincourt North
2012 TARGET_RENT_5YR: 1644.8333333333333
2017 avg_rent_1br: 1644.8333333333333
Match: True

TARGET_RENT_5YR statistics:
  Non-null values: 1738
  Null values: 791


## Step 7: Separation for GIS (Map Key)


In [10]:
# Step 7: Extract geometry columns for map key
geometry_cols = ['AREA_NAME', 'geometry_wkt', 'geometry_type', 'CLASSIFICATION']

# Get geometry data from 2024 (most complete)
map_key_df = df_2024[geometry_cols].copy() if all(col in df_2024.columns for col in geometry_cols) else pd.DataFrame()

# If 2024 doesn't have all columns, try to get from combined_df
if map_key_df.empty or len(map_key_df) == 0:
    available_geo_cols = [col for col in geometry_cols if col in combined_df.columns]
    if available_geo_cols:
        # Get unique combinations (prefer latest year)
        map_key_df = combined_df.sort_values('YEAR', ascending=False).groupby('AREA_NAME').first()[available_geo_cols].reset_index()

# Ensure AREA_NAME is in the result
if 'AREA_NAME' not in map_key_df.columns and len(map_key_df) > 0:
    map_key_df = map_key_df.reset_index()

print(f"Map key dataframe shape: {map_key_df.shape}")
print(f"Map key columns: {list(map_key_df.columns)}")

# Drop geometry columns from main training dataframe
geometry_cols_to_drop = ['geometry_wkt', 'geometry_type']
for col in geometry_cols_to_drop:
    if col in combined_df.columns:
        combined_df = combined_df.drop(columns=[col])

print(f"\nDropped geometry columns from main dataframe")
print(f"Main dataframe shape after dropping geometry: {combined_df.shape}")


Map key dataframe shape: (158, 4)
Map key columns: ['AREA_NAME', 'geometry_wkt', 'geometry_type', 'CLASSIFICATION']

Dropped geometry columns from main dataframe
Main dataframe shape after dropping geometry: (2529, 95)


## Step 8: Final Integrity & Export


In [12]:
# Step 8: Final filtering and export
# Remove rows where avg_rent_1br is missing
final_df = combined_df[combined_df['avg_rent_1br'].notna()].copy()

print(f"Final dataframe shape: {final_df.shape}")
print(f"Rows removed due to missing avg_rent_1br: {len(combined_df) - len(final_df)}")
print(f"Unique neighborhoods in final dataset: {final_df['AREA_NAME'].nunique()}")
print(f"Year range in final dataset: {final_df['YEAR'].min()} - {final_df['YEAR'].max()}")

# Verify we have all 158 neighborhoods (or close to it)
print(f"\nNeighborhood count: {final_df['AREA_NAME'].nunique()}")
print(f"Expected: ~158 neighborhoods")

# Export main training file
output_file = PROCESSED_DATA_DIR / 'toronto_master_2010_2024.csv'
final_df.to_csv(output_file, index=False)
print(f"\nExported main training file: {output_file}")
print(f"File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")

# Export map key file
if not map_key_df.empty:
    map_key_file = PROCESSED_DATA_DIR / 'toronto_map_key.csv'
    map_key_df.to_csv(map_key_file, index=False)
    print(f"Exported map key file: {map_key_file}")
    print(f"Map key shape: {map_key_df.shape}")
else:
    print("Warning: Map key dataframe is empty, skipping export")

print("\n✅ Pipeline complete!")


Final dataframe shape: (2528, 95)
Rows removed due to missing avg_rent_1br: 1
Unique neighborhoods in final dataset: 158
Year range in final dataset: 2010 - 2024

Neighborhood count: 158
Expected: ~158 neighborhoods

Exported main training file: processed_data/toronto_master_2010_2024.csv
File size: 6.13 MB
Exported map key file: processed_data/toronto_map_key.csv
Map key shape: (158, 4)

✅ Pipeline complete!


## Summary Statistics


In [13]:
# Display summary statistics
print("=== FINAL DATASET SUMMARY ===")
print(f"\nShape: {final_df.shape}")
print(f"\nColumns ({len(final_df.columns)}):")
for i, col in enumerate(final_df.columns, 1):
    print(f"  {i}. {col}")

print(f"\n\nYear distribution:")
print(final_df['YEAR'].value_counts().sort_index())

print(f"\n\nNeighborhoods per year:")
print(final_df.groupby('YEAR')['AREA_NAME'].nunique())

print(f"\n\nSample data (first 5 rows):")
print(final_df.head())

print(f"\n\nMissing values:")
missing = final_df.isnull().sum()
print(missing[missing > 0])


=== FINAL DATASET SUMMARY ===

Shape: (2528, 95)

Columns (95):
  1. AREA_NAME
  2. CLASSIFICATION
  3. CLASSIFICATION_CODE
  4. Area
  5. bachelor_avg_lease_rate
  6. 1_bedrooms_leased
  7. 1_bed_room_avg_lease_rate
  8. 2_bedrooms_leased
  9. 2_bedrooms_avg_lease_rate
  10. 3_bedrooms_leased
  11. 3_bedrooms_avg_lease_rate
  12. area_sq_meters
  13. perimeter_meters
  14. park_count
  15. ASSAULT
  16. ASSAULT_RATE
  17. AUTOTHEFT
  18. AUTOTHEFT_RATE
  19. BIKETHEFT_RATE
  20. BREAKENTER
  21. HOMICIDE
  22. HOMICIDE_RATE
  23. ROBBERY
  24. ROBBERY_RATE
  25. SHOOTING
  26. SHOOTING_RATE
  27. THEFTFROMMV
  28. THEFTFROMMV_RATE
  29. THEFTOVER
  30. THEFTOVER_RATE
  31. POPULATION
  32. total_stop_count
  33. avg_stop_frequency
  34. max_stop_frequency
  35. total_line_length_meters
  36. transit_line_density
  37. distinct_route_count
  38. AreaBachelor Leased
  39. Bachelor Leased
  40. YEAR
  41. avg_rent_1br
  42. bachelor_leased_q4
  43. bachelor_avg_lease_rate_q4
  44. 1_bedr