# Data from the Toronto Regional Real Estate Board 
## We need to perform some data cleaning
Let's start by reading the CSV.

In [42]:
import pandas as pd

# Load the raw TRREB export; the path is resolved relative to the notebook's working directory
trreb_df = pd.read_csv(filepath_or_buffer="TRREB_data_raw.csv")



## Rename columns and drop nulls

In [43]:
# Normalise the headers: trim surrounding whitespace, lower-case, spaces -> underscores
trreb_df.columns = (
    trreb_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# Discard every row that contains at least one missing value
trreb_df = trreb_df.dropna(axis=0, how='any')

In [44]:
# Preview up to the first 20 rows after the header clean-up and null removal
trreb_df.head(n=20)

Unnamed: 0,address,beds,baths,type,price,sqft,listing#
0,50 Orchid Place Dr Room 1 Toronto,1,1,Condo Townhouse Multi-Level,1000,1000-1199,E9416947
1,330 Village Green Sq 8 Toronto,1,2,Condo Townhouse 3-Storey,1100,1200-1399,E9399099
2,3 Falaise Rd A2 Toronto,1,1,Condo Townhouse Stacked Townhse,1200,1400-1599,E10431570
3,10 Eddystone Ave 241 Toronto,1,2,Condo Apt 2-Storey,1200,1200-1399,W10407655
4,872 Browns Line 2 Toronto,1,1,Condo Apt Multi-Level,1250,1000-1199,W10423767
5,55 Cedarcroft Blvd 12 Toronto,1,1,Condo Townhouse 2-Storey,1250,1200-1399,C10441325
6,177 Linus Rd 1014 Toronto,1,1,Condo Apt Apartment,1280,900-999,C9295556
7,350 Alton Towers Circ 907 Toronto,1+1,1,Condo Apt Apartment,1300,700-799,E9381394
8,57 Finch Ave 27 Toronto,1,1,Condo Townhouse 2-Storey,1350,900-999,C10884899
9,11753 Sheppard Ave E 420 Toronto,1,1,Condo Apt Apartment,1350,1000-1199,E10422806


## Get sqft range and average

In [45]:
# 'sqft' holds a range written as "<low>-<high>" (e.g. "1000-1199").
# Break it on the dash into two integer bound columns.
trreb_df[['sqft_min', 'sqft_max']] = (
    trreb_df['sqft']
    .str.split(pat='-', expand=True)
    .astype(int)
)

# Midpoint of the range, used as a single representative size
trreb_df['sqft_avg'] = (trreb_df['sqft_max'] + trreb_df['sqft_min']) / 2

In [46]:
# Check the new sqft_min / sqft_max / sqft_avg columns on the first five rows
trreb_df.head(n=5)

Unnamed: 0,address,beds,baths,type,price,sqft,listing#,sqft_min,sqft_max,sqft_avg
0,50 Orchid Place Dr Room 1 Toronto,1,1,Condo Townhouse Multi-Level,1000,1000-1199,E9416947,1000,1199,1099.5
1,330 Village Green Sq 8 Toronto,1,2,Condo Townhouse 3-Storey,1100,1200-1399,E9399099,1200,1399,1299.5
2,3 Falaise Rd A2 Toronto,1,1,Condo Townhouse Stacked Townhse,1200,1400-1599,E10431570,1400,1599,1499.5
3,10 Eddystone Ave 241 Toronto,1,2,Condo Apt 2-Storey,1200,1200-1399,W10407655,1200,1399,1299.5
4,872 Browns Line 2 Toronto,1,1,Condo Apt Multi-Level,1250,1000-1199,W10423767,1000,1199,1099.5


## Let's see what unique values we have in "type" column and convert

In [47]:
# List the distinct raw 'type' strings so the mapping rules below can be checked against them
trreb_df.loc[:, 'type'].unique()

array(['Condo Townhouse Multi-Level', 'Condo Townhouse 3-Storey',
       'Condo Townhouse Stacked Townhse', 'Condo Apt 2-Storey',
       'Condo Apt Multi-Level', 'Condo Townhouse 2-Storey',
       'Condo Apt Apartment', 'Condo Apt Bachelor/Studio',
       'Room Apartment', 'Room 3-Storey', 'Shared Room Apartment',
       'Upper Level Apartment', 'Other Apartment', 'Co-Op Apt Apartment',
       'Comm Element Condo 2-Storey', 'Co-Ownership Apt Apartment',
       'Comm Element Condo Apartment',
       'Comm Element Condo Bachelor/Studio', 'Condo Apt Loft',
       'Other Multi-Level', 'Condo Apt Other',
       'Condo Townhouse Apartment', 'Comm Element Condo Stacked Townhse',
       'Comm Element Condo Multi-Level', 'Condo Apt Stacked Townhse',
       'Condo Townhouse Loft', 'Co-Op Apt Bachelor/Studio',
       'Condo Apt Industrial Loft', 'Comm Element Condo Other',
       'Comm Element Condo Loft', 'Condo Apt 3-Storey'], dtype=object)

In [48]:
# Mapping rules for 'Type'
# Keys are the prefixes that open the raw 'type' strings; values are the cleaned labels.
type_mapping = {
    'Condo Apt': 'Condo Apartment',
    'Condo Townhouse': 'Condo Townhouse',
    'Room': 'Room',
    'Shared Room': 'Shared Room',
    'Upper Level': 'Upper Level',
    'Co-Op Apt': 'Co-Op Apartment',
    'Co-Ownership Apt': 'Co-Ownership Apartment',
    'Comm Element Condo': 'Common Element Condo',
    'Other': 'Other'
}


def extract_type(value):
    """Return the cleaned property type for a raw 'type' string.

    The raw value is expected to start with one of the keys of
    ``type_mapping`` (e.g. ``'Condo Apt 2-Storey'``). Matching is done on the
    prefix, not on an arbitrary substring: ``'Room'`` is contained in
    ``'Shared Room Apartment'``, so a substring test would label shared rooms
    as ``'Room'``.

    Parameters
    ----------
    value : str
        Raw content of the 'type' column.

    Returns
    -------
    str
        The mapped label, or ``'Other'`` when no known prefix matches.
    """
    text = value.strip()
    # Longest key first, so a more specific prefix always wins over a shorter one
    for key in sorted(type_mapping, key=len, reverse=True):
        if text.startswith(key):
            return type_mapping[key]
    return 'Other'


# Extract 'Type'
trreb_df['Type'] = trreb_df['type'].apply(extract_type)

# Mapping rules for 'Style'
# Known style labels; each one appears at the end of the raw 'type' strings.
style_mapping = [
    '2-Storey', '3-Storey', 'Apartment', 'Bachelor/Studio', 'Multi-Level',
    'Stacked Townhse', 'Industrial Loft', 'Loft', 'Other'
]

# Extract 'Style'
def extract_style(value):
    """Return the building style found at the end of a raw 'type' string.

    Matching is done on the suffix and tries the longest label first.
    ``'Loft'`` is contained in ``'Industrial Loft'``, so a first-hit substring
    test would report ``'Condo Apt Industrial Loft'`` as ``'Loft'``. Likewise a
    leading ``'Other'`` (as in ``'Other Apartment'``) must not be taken for the
    style.

    Parameters
    ----------
    value : str
        Raw content of the 'type' column.

    Returns
    -------
    str
        The matching entry of ``style_mapping``, or ``'Other'`` when the
        string ends with none of them.
    """
    text = value.strip()
    # Sorting at call time keeps the result independent of the order of style_mapping
    for style in sorted(style_mapping, key=len, reverse=True):
        if text.endswith(style):
            return style
    return 'Other'

# Derive the 'Style' column by running extract_style over every raw 'type' value
trreb_df['Style'] = trreb_df['type'].map(extract_style)

# Show the first rows, now including the derived columns
trreb_df.head(n=5)

Unnamed: 0,address,beds,baths,type,price,sqft,listing#,sqft_min,sqft_max,sqft_avg,Type,Style
0,50 Orchid Place Dr Room 1 Toronto,1,1,Condo Townhouse Multi-Level,1000,1000-1199,E9416947,1000,1199,1099.5,Condo Townhouse,Multi-Level
1,330 Village Green Sq 8 Toronto,1,2,Condo Townhouse 3-Storey,1100,1200-1399,E9399099,1200,1399,1299.5,Condo Townhouse,3-Storey
2,3 Falaise Rd A2 Toronto,1,1,Condo Townhouse Stacked Townhse,1200,1400-1599,E10431570,1400,1599,1499.5,Condo Townhouse,Stacked Townhse
3,10 Eddystone Ave 241 Toronto,1,2,Condo Apt 2-Storey,1200,1200-1399,W10407655,1200,1399,1299.5,Condo Apartment,2-Storey
4,872 Browns Line 2 Toronto,1,1,Condo Apt Multi-Level,1250,1000-1199,W10423767,1000,1199,1099.5,Condo Apartment,Multi-Level


## The last step is to convert addresses to geometries (lat, long)

## Finally, export the csv as a cleaned file

In [49]:
# Write the cleaned listings to disk

# Destination file, relative to the notebook's working directory
output_file = "TRREB_data_cleaned.csv"

# index=False keeps the positional row index out of the file
trreb_df.to_csv(path_or_buf=output_file, index=False)

print(f"DataFrame successfully exported to {output_file}")


DataFrame successfully exported to TRREB_data_cleaned.csv


# The code that finally worked, through the Google API

Don't run, otherwise it will consume credits!

In [None]:
# THE ONE THAT FINALLY WORKED

# import pandas as pd
# from geopy.geocoders import GoogleV3

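# # Google Maps API key: read it from the environment, never hardcode it in the notebook
# # (the key that was previously written here must be treated as leaked and revoked)
# import os
# api_key = os.environ["GOOGLE_MAPS_API_KEY"]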

# # Initialize the geolocator with Google Maps API
# geolocator = GoogleV3(api_key=api_key)

# # Load the CSV file
# input_file = "TRREB_data_cleaned.csv"  # Update with your actual file path
# output_file = "TRREB_data_with_lat_long.csv"  # Output file path
# df = pd.read_csv(input_file)

# # Initialize columns for latitude and longitude
# df['LAT'] = None
# df['LONG'] = None

# # Geocode each address
# for index, row in df.iterrows():
#     try:
#         # Geocode the address
#         location = geolocator.geocode(row['address'], timeout=10)
#         # Extract latitude and longitude
#         if location:
#             df.loc[index, 'LAT'] = location.latitude
#             df.loc[index, 'LONG'] = location.longitude
#             print(f"Geocoded: {row['address']} -> LAT: {location.latitude}, LONG: {location.longitude}")
#         else:
#             print(f"Address not found: {row['address']}")
#             df.loc[index, 'LAT'] = None
#             df.loc[index, 'LONG'] = None
#     except Exception as e:
#         print(f"Error geocoding address: {row['address']} - {e}")
#         df.loc[index, 'LAT'] = None
#         df.loc[index, 'LONG'] = None

# # Save the DataFrame with LAT and LONG columns
# df.to_csv(output_file, index=False)
# print(f"Geocoding completed. Results saved to {output_file}")
