In [1]:
import pandas as pd
import numpy as np
import re
import os
import glob
import matplotlib.pyplot as plt

# Create output folder for processed files
output_dir = 'processed_data'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Environment Ready. Processed data will be saved to:", output_dir)

Environment Ready. Processed data will be saved to: processed_data


In [1]:
##LATEST ONE I PROCESSED:

import pandas as pd
import numpy as np
import re
import os
import glob

# 1. SETUP
output_dir = 'processed_data'
if not os.path.exists(output_dir): os.makedirs(output_dir)

# 2. MASTER REFERENCE (2024)
df_2024 = pd.read_csv('2024.csv')
df_2024['AREA_NAME'] = df_2024['AREA_NAME'].str.strip()
pop_lookup = df_2024[['AREA_NAME', 'POPULATION_2024', 'area_sq_meters']].copy()
pop_lookup.columns = ['AREA_NAME', 'MASTER_POP', 'MASTER_SQ_METERS']

# 3. EXPANDED FEATURE LIST
features_to_track = [
    'AREA_NAME', 'YEAR', 'avg_rent_1br', 'POPULATION',
    'ASSAULT_RATE', 'AUTOTHEFT_RATE', 'HOMICIDE_RATE', 
    'transit_line_density', 'avg_stop_frequency', 'distinct_route_count',
    '1_bedrooms_leased', 'bachelor_avg_lease_rate', '2_bedrooms_avg_lease_rate',
    'area_sq_meters'
]

master_list = []
all_csvs = sorted(glob.glob("20*.csv")) 

for f_path in all_csvs:
    year = int(re.findall(r'\d+', os.path.basename(f_path))[0])
    if year > 2024: continue
    
    df = pd.read_csv(f_path)
    df = df.rename(columns={'Area': 'AREA_NAME'})
    df['AREA_NAME'] = df['AREA_NAME'].str.strip()
    
    # Standardize Rent Logic
    if '1_bed_room_avg_lease_rate' in df.columns:
        df['avg_rent_1br'] = df['1_bed_room_avg_lease_rate']
    
    # Strip Year Suffixes
    df.columns = [re.sub(rf'_{year}$', '', col) for col in df.columns]
    df['YEAR'] = year
    
    # Apply Geography/Population Proxies
    df = pd.merge(df, pop_lookup, on='AREA_NAME', how='left')
    df['POPULATION'] = df['MASTER_POP']
    df['area_sq_meters'] = df['MASTER_SQ_METERS']

    available = [f for f in features_to_track if f in df.columns]
    master_list.append(df[available].copy())

# 4. TEMPORAL ENGINEERING
full_df = pd.concat(master_list, ignore_index=True).sort_values(['AREA_NAME', 'YEAR'])

# Lags & Target
full_df['rent_lag_1'] = full_df.groupby('AREA_NAME')['avg_rent_1br'].shift(1)
full_df['rent_lag_2'] = full_df.groupby('AREA_NAME')['avg_rent_1br'].shift(2)
rent_dict = full_df.set_index(['AREA_NAME', 'YEAR'])['avg_rent_1br'].to_dict()
full_df['TARGET_RENT_5YR'] = full_df.apply(lambda x: rent_dict.get((x['AREA_NAME'], x['YEAR'] + 5), np.nan), axis=1)

# 5. SAVE
full_df.to_csv('processed_data/toronto_master_2010_2024_no19.csv', index=False)


print("Updated Master CSV and Map Key Generated.")

KeyError: "['POPULATION'] not in index"

In [2]:
# Load 2024 reference for population stand-ins
df_24_ref = pd.read_csv('2024.csv')

# Standardize column name for the lookup
pop_proxy = df_24_ref[['AREA_NAME', 'POPULATION_2024']].copy()
pop_proxy.columns = ['AREA_NAME', 'POPULATION']

print(f"Population proxy created for {len(pop_proxy)} neighborhoods.")

Population proxy created for 158 neighborhoods.


In [3]:
# Features to track across the decade
features_to_track = [
    'AREA_NAME', 'YEAR', 'avg_rent_1br', 'POPULATION',
    'ASSAULT_RATE', 'AUTOTHEFT_RATE', 'BIKETHEFT_RATE', 
    'BREAKENTER_RATE', 'HOMICIDE_RATE', 'ROBBERY_RATE', 
    'SHOOTING_RATE', 'THEFTFROMMV_RATE', 'THEFTOVER_RATE',
    'park_count', 'transit_line_density', 'total_stop_count'
]

master_list = []

# List of files you provided
files = ['2010.csv', '2011.csv', '2012.csv', '2013.csv', '2014.csv', 
         '2015.csv', '2016.csv', '2017.csv', '2018.csv', '2022.csv', 
         '2023.csv', '2024.csv']

for f_path in files:
    if not os.path.exists(f_path):
        print(f"Skipping {f_path}: File not found.")
        continue
        
    year = int(re.findall(r'\d+', f_path)[0])
    df = pd.read_csv(f_path)
    
    # 1. Standardize Neighborhood column name
    if 'Area' in df.columns and 'AREA_NAME' not in df.columns:
        df = df.rename(columns={'Area': 'AREA_NAME'})
    
    # 2. Fix "1_bed_room" naming variation
    df.columns = [c.replace('1_bed_room', '1_bedroom') for c in df.columns]
    
    # 3. Handle Rent: Average quarters if they exist, otherwise use direct column
    q_cols = [c for c in df.columns if '1_bedroom_avg_lease_rate_q' in c]
    if q_cols:
        df['avg_rent_1br'] = df[q_cols].mean(axis=1)
    elif '1_bedroom_avg_lease_rate' in df.columns:
        df['avg_rent_1br'] = df['1_bedroom_avg_lease_rate']
    else:
        df['avg_rent_1br'] = np.nan

    # 4. Remove year suffixes (e.g., ASSAULT_RATE_2017 -> ASSAULT_RATE)
    df.columns = [re.sub(rf'_{year}$', '', col) for col in df.columns]
    df['YEAR'] = year

    # 5. Impute missing Population using proxy
    if 'POPULATION' not in df.columns:
        df = pd.merge(df, pop_proxy, on='AREA_NAME', how='left')

    # 6. Keep only desired features
    available = [f for f in features_to_track if f in df.columns]
    master_list.append(df[available].copy())
    print(f"Processed {year}: {len(available)} features captured.")

# Merge all years
master_df = pd.concat(master_list, ignore_index=True)
print("\nMaster Data successfully merged!")

Processed 2010: 15 features captured.
Processed 2011: 15 features captured.
Processed 2012: 15 features captured.
Processed 2013: 15 features captured.
Processed 2014: 15 features captured.
Processed 2015: 15 features captured.
Processed 2016: 15 features captured.
Processed 2017: 15 features captured.
Processed 2018: 15 features captured.
Processed 2022: 16 features captured.
Processed 2023: 16 features captured.
Processed 2024: 15 features captured.

Master Data successfully merged!


In [4]:
# 1. Sort the data to ensure the shift happens in the correct chronological order
master_df = master_df.sort_values(['AREA_NAME', 'YEAR'])

# 2. CREATE THE AI TARGET: Shift the rent by -5 years for each neighborhood.
# This puts the rent from 2015 into the row for 2010, so the AI can learn the pattern.
master_df['TARGET_RENT_5YR'] = master_df.groupby('AREA_NAME')['avg_rent_1br'].shift(-5)

# 3. SAVE THE MASTER CSV TO THE PROCESSED FOLDER
output_path = os.path.join(output_dir, 'master_training_data.csv')
master_df.to_csv(output_path, index=False)

print(f"File successfully created at: {output_path}")

# 4. PREVIEW THE DATA
# Look at the 'avg_rent_1br' vs 'TARGET_RENT_5YR' to see the shift in action
master_df[['AREA_NAME', 'YEAR', 'avg_rent_1br', 'TARGET_RENT_5YR']].head(10)

File successfully created at: processed_data/master_training_data.csv


Unnamed: 0,AREA_NAME,YEAR,avg_rent_1br,TARGET_RENT_5YR
1264,=======,2017,,
155,Agincourt North,2010,1113.333333,1342.25
313,Agincourt North,2011,1224.666667,1416.5
471,Agincourt North,2012,1272.75,1570.25
629,Agincourt North,2013,1318.25,1570.25
787,Agincourt North,2014,1301.5,1737.0
945,Agincourt North,2015,1342.25,2139.75
1103,Agincourt North,2016,1416.5,2380.75
1261,Agincourt North,2017,1570.25,2220.0
1420,Agincourt North,2017,1570.25,


In [5]:
import pandas as pd
import numpy as np
import os

# 1. Load the "wrong" file
df = pd.read_csv('processed_data/master_training_data.csv')

# 2. Remove the garbage "=======" row
df = df[df['AREA_NAME'].str.contains('^[a-zA-Z]', na=False)]

# 3. Handle Duplicates (e.g., the double 2017 issue)
# We group by neighborhood and year and take the first valid entry
df = df.groupby(['AREA_NAME', 'YEAR']).first().reset_index()

# 4. Fix Population/Crime Zeros
# Replace 0.0 with NaN so we can actually fill them
df = df.replace(0.0, np.nan)

# Use 'Forward Fill' then 'Backward Fill' per neighborhood to close the gaps
# This assumes that if Population was X in 2010 and we don't know 2011, it's still roughly X.
df = df.sort_values(['AREA_NAME', 'YEAR'])
cols_to_fill = [col for col in df.columns if col not in ['AREA_NAME', 'YEAR']]
df[cols_to_fill] = df.groupby('AREA_NAME')[cols_to_fill].ffill().bfill()

# 5. Re-Calculate the 5-Year Target
# Shifting again now that the duplicates are gone ensures the math is correct
df['TARGET_RENT_5YR'] = df.groupby('AREA_NAME')['avg_rent_1br'].shift(-5)

# 6. Save the FIXED file
fixed_path = 'processed_data/master_training_data_FIXED.csv'
df.to_csv(fixed_path, index=False)

print(f"Cleanup Complete! Use this file for the AI: {fixed_path}")
print(f"Total neighborhoods processed: {df['AREA_NAME'].nunique()}")
df.head(10)

Cleanup Complete! Use this file for the AI: processed_data/master_training_data_FIXED.csv
Total neighborhoods processed: 158


Unnamed: 0,AREA_NAME,YEAR,avg_rent_1br,POPULATION,ASSAULT_RATE,AUTOTHEFT_RATE,BIKETHEFT_RATE,HOMICIDE_RATE,ROBBERY_RATE,SHOOTING_RATE,THEFTFROMMV_RATE,THEFTOVER_RATE,park_count,transit_line_density,total_stop_count,BREAKENTER_RATE,TARGET_RENT_5YR
0,Agincourt North,2010,1113.333333,30426.0,221.209717,56.127838,13.206551,3.301638,118.858955,3.342581,208.003174,3.301638,10.0,1.753847,81.0,81.914055,1342.25
1,Agincourt North,2011,1224.666667,30426.0,221.209717,56.127838,13.206551,3.301638,118.858955,3.342581,208.003174,3.301638,10.0,1.753847,81.0,81.914055,1416.5
2,Agincourt North,2012,1272.75,30426.0,221.209717,56.127838,13.206551,3.301638,118.858955,3.342581,208.003174,3.301638,10.0,1.753847,81.0,81.914055,1570.25
3,Agincourt North,2013,1318.25,30426.0,221.209717,56.127838,13.206551,3.301638,118.858955,3.342581,208.003174,3.301638,10.0,1.753847,81.0,81.914055,1737.0
4,Agincourt North,2014,1301.5,30426.0,221.209717,56.127838,13.206551,3.301638,118.858955,3.342581,208.003174,3.301638,10.0,1.753847,81.0,81.914055,2139.75
5,Agincourt North,2015,1342.25,30426.0,257.275543,96.895988,23.388687,3.301638,123.625916,3.342581,157.03833,20.047445,10.0,1.753847,81.0,81.914055,2380.75
6,Agincourt North,2016,1416.5,30426.0,274.091644,60.166462,6.685162,3.301638,50.138718,3.342581,160.443893,16.712906,10.0,1.753847,81.0,81.914055,2220.0
7,Agincourt North,2017,1570.25,30426.0,252.993759,111.317253,6.685162,3.301638,74.211502,3.37325,323.832001,37.105751,10.0,1.753847,81.0,81.914055,
8,Agincourt North,2018,1737.0,30426.0,272.025574,142.813431,3.40032,3.301638,149.614059,3.37325,234.622055,13.601278,10.0,1.753847,81.0,81.914055,
9,Agincourt North,2022,2139.75,30426.0,266.220703,167.241196,3.40032,3.301638,37.543945,3.413086,153.588867,10.239257,10.0,1.753847,81.0,81.914055,


In [6]:
import pandas as pd

# 1. Load your fixed data
df = pd.read_csv('processed_data/master_training_data_FIXED.csv')
df = df.sort_values(['AREA_NAME', 'YEAR'])

# 2. CREATE LAG FEATURES
# This tells the model: "Here is what the rent was in the past"
df['rent_lag_1'] = df.groupby('AREA_NAME')['avg_rent_1br'].shift(1)
df['rent_lag_2'] = df.groupby('AREA_NAME')['avg_rent_1br'].shift(2)

# 3. CREATE TREND FEATURE
# This calculates the % change over the last year
df['rent_growth_rate'] = (df['avg_rent_1br'] - df['rent_lag_1']) / df['rent_lag_1']

# 4. SAVE THE FINAL VERSION
final_path = 'processed_data/master_training_data_FINAL.csv'
df.to_csv(final_path, index=False)

print("Lag Features Added!")
print("New columns: rent_lag_1, rent_lag_2, rent_growth_rate")
df[['AREA_NAME', 'YEAR', 'rent_lag_2', 'rent_lag_1', 'avg_rent_1br']].head(10)

Lag Features Added!
New columns: rent_lag_1, rent_lag_2, rent_growth_rate


Unnamed: 0,AREA_NAME,YEAR,rent_lag_2,rent_lag_1,avg_rent_1br
0,Agincourt North,2010,,,1113.333333
1,Agincourt North,2011,,1113.333333,1224.666667
2,Agincourt North,2012,1113.333333,1224.666667,1272.75
3,Agincourt North,2013,1224.666667,1272.75,1318.25
4,Agincourt North,2014,1272.75,1318.25,1301.5
5,Agincourt North,2015,1318.25,1301.5,1342.25
6,Agincourt North,2016,1301.5,1342.25,1416.5
7,Agincourt North,2017,1342.25,1416.5,1570.25
8,Agincourt North,2018,1416.5,1570.25,1737.0
9,Agincourt North,2022,1570.25,1737.0,2139.75


In [7]:
import pandas as pd
import numpy as np
import os

# 1. Load the current final file
df = pd.read_csv('processed_data/master_training_data_FINAL.csv')

# 2. TRASH CLEANUP: Remove garbage rows (like the '=======' row)
# We only keep rows where the AREA_NAME starts with a letter
df = df[df['AREA_NAME'].str.contains('^[a-zA-Z]', na=False)].copy()

# 3. DEDUPLICATION: Fix the "Double 2017" and other duplicate issues
# This ensures we have exactly one row per neighborhood per year
df = df.sort_values(['AREA_NAME', 'YEAR'])
df = df.groupby(['AREA_NAME', 'YEAR']).first().reset_index()

# 4. FIX ZEROS: Some years have 0.0 for population/crime instead of being empty
# We replace 0.0 with NaN so we can fill them properly from other years
df = df.replace(0.0, np.nan)

# 5. SMART FILL: Fill gaps for "Static" features (Population, Parks, Transit)
# If we know the population in 2024, we fill it backwards into 2010-2023 for that neighborhood
static_cols = ['POPULATION', 'park_count', 'transit_line_density', 'total_stop_count']
df[static_cols] = df.groupby('AREA_NAME')[static_cols].ffill().bfill()

# 6. RE-CALCULATE LAGS: Now that duplicates are gone, lags will be 100% accurate
df['rent_lag_1'] = df.groupby('AREA_NAME')['avg_rent_1br'].shift(1)
df['rent_lag_2'] = df.groupby('AREA_NAME')['avg_rent_1br'].shift(2)
df['rent_growth_rate'] = (df['avg_rent_1br'] - df['rent_lag_1']) / df['rent_lag_1']

# 7. TEMPORAL TARGET ALIGNMENT: Match Year with (Year + 5)
# This fixes the issue where 2014 was trying to predict 2022 because of the 2019-2021 gap
rent_lookup = df.set_index(['AREA_NAME', 'YEAR'])['avg_rent_1br'].to_dict()

def find_target(row):
    target_year = row['YEAR'] + 5
    return rent_lookup.get((row['AREA_NAME'], target_year), np.nan)

df['TARGET_RENT_5YR'] = df.apply(find_target, axis=1)

# 8. SAVE THE MASTERPIECE
final_output_path = 'processed_data/toronto_ai_ready_data.csv'
df.to_csv(final_output_path, index=False)

print(f"Cleanup Complete! Saved to: {final_output_path}")
print(f"Total rows: {len(df)}")
print(f"Neighborhoods: {df['AREA_NAME'].nunique()}")

# Quick check on the shift for a sample neighborhood
df[df['AREA_NAME'] == df['AREA_NAME'].unique()[0]][['YEAR', 'avg_rent_1br', 'rent_lag_1', 'TARGET_RENT_5YR']].head(12)

Cleanup Complete! Saved to: processed_data/toronto_ai_ready_data.csv
Total rows: 1896
Neighborhoods: 158


Unnamed: 0,YEAR,avg_rent_1br,rent_lag_1,TARGET_RENT_5YR
0,2010,1113.333333,,1342.25
1,2011,1224.666667,1113.333333,1416.5
2,2012,1272.75,1224.666667,1570.25
3,2013,1318.25,1272.75,1737.0
4,2014,1301.5,1318.25,
5,2015,1342.25,1301.5,
6,2016,1416.5,1342.25,
7,2017,1570.25,1416.5,2139.75
8,2018,1737.0,1570.25,2380.75
9,2022,2139.75,1737.0,
