#  Real Estate Appraisal System - Data Cleaning Notebook


--- 




## Cell 1: Import Required Libraries


In [107]:
import pandas as pd
import re  # For pattern matching in text
from dateutil import parser  # For parsing various date formats
from datetime import datetime
from rapidfuzz import process, fuzz



## Cell 2: Load Cleaned Data

We'll start by loading our cleaned subjects, comps, and candidates tables that were saved during data cleaning.

In [108]:
# Load cleaned data from CSV files
subjects_df = pd.read_csv('../data/raw/subjects_raw.csv')
comps_df = pd.read_csv('../data/raw/comps_raw.csv')
candidates_df = pd.read_csv('../data/raw/candidates_raw.csv')

print("✅ Data loaded!")
print(f"Subjects:   {len(subjects_df)}")
print(f"Comps:      {len(comps_df)}")
print(f"Candidates: {len(candidates_df)}")


✅ Data loaded!
Subjects:   88
Comps:      264
Candidates: 9820


## Cell 3 - 7: Summarize Unique Values for Selected Features

This function helps you **quickly explore all unique values** for just the columns you care about, in any DataFrame (subjects, comps, candidates).

**How it works:**
- Pass your DataFrame and a list of column names you want to inspect.
- Returns a summary table showing:
  - The feature (column) name
  - All unique values found in that column
  - The total number of unique values
- If a column isn’t found, it displays "(Column missing in DataFrame)" for clarity.

---

**Example usage:**
```python
subjects_uni_df = summarize_selected_unique_features(subjects_df, subject_cols)
comps_uni_df = summarize_selected_unique_features(comps_df, comp_cols)
candidates_uni_df = summarize_selected_unique_features(candidates_df, candidate_cols)


In [109]:
def summarize_selected_unique_features(df, columns):
    """
    Shows all unique values for selected columns in a DataFrame.
    Args:
        df (pd.DataFrame): The DataFrame to summarize.
        columns (list): List of column names to check.
    Returns:
        pd.DataFrame: Summary table of unique values per selected feature.
    """
    feature_names = []
    unique_vals = []
    num_uniques = []
    
    for col in columns:
        if col in df.columns:
            uniques = df[col].unique()
            feature_names.append(col)
            unique_vals.append(list(uniques))
            num_uniques.append(len(uniques))
        else:
            feature_names.append(col)
            unique_vals.append(["(Column missing in DataFrame)"])
            num_uniques.append(0)
    
    summary_df = pd.DataFrame({
        "Feature": feature_names,
        "Unique_Values": unique_vals,
        "Num_Unique": num_uniques
    })
    
    pd.set_option('display.max_colwidth', None)
    return summary_df


In [110]:
# List of columns you want to see (from your message above)
subject_cols = [
    "effective_date", "lot_size_sf", "structure_type", "style", "room_total",
    "num_beds", "gla", "num_baths", "condition"
]

comp_cols = [
    "distance_to_subject", "prop_type", "stories", "sale_date", "lot_size",
    "condition", "gla", "room_count", "bed_count", "bath_count"
]

candidate_cols = [
    "bedrooms", "gla", "property_sub_type", "structure_type", "style",
    "levels", "room_count", "full_baths", "half_baths", "lot_size_sf",
    "close_date", "latitude", "longitude"
]

# Run the function for each DataFrame
subjects_uni_df = summarize_selected_unique_features(subjects_df, subject_cols)
comps_uni_df = summarize_selected_unique_features(comps_df, comp_cols)
candidates_uni_df = summarize_selected_unique_features(candidates_df, candidate_cols)


In [111]:
subjects_uni_df

Unnamed: 0,Feature,Unique_Values,Num_Unique
0,effective_date,"[Apr/11/2025, Apr/17/2025, May/01/2025, Apr/15/2025, Apr/16/2025, Apr/14/2025, Apr/10/2025, Apr/09/2025, Mar/24/2025, Apr/07/2025, Apr/04/2025, Apr/08/2025, Apr/23/2025, Apr/21/2025, Apr/22/2025, Apr/24/2025, Apr/29/2025, Apr/30/2025, Apr/25/2025, Apr/28/2025, May/02/2025, May/06/2025, May/05/2025, Feb/13/2025]",24
1,lot_size_sf,"[nan, 72745+/-SqFt, 20174 SqFt, 1.25 Acres, 6825 SqFt, 3694 SqFt, 3358 SqFt, 549 SqM, 42495 SqFt, 2619 SqFt, 26700 SqFt, 5,663 SqFt, 33943+/-SqFt, 5987 SqFt, 10.057 Acres, 1742 SqFt, 3250 SqFt, 377 SqM, 431 SqM, 605 SqM, 3601 SqFt, 330 SqM, 1868 SqFt, 626 SqM, 761 SqM, SqFt, 12600 SqFt, N/A-CONDO LAND, 3253 SqFt, 33661 SqFt, 458 SqM, 37374 SqFt, 15936 SqFt, 3137 SqFt, 6300 SqFt, 14375 SqFt, 3500 SqFt, 14232 SqFt, 6000 SqFt, 911 SqFt, 8398 SqFt, 6452 SqFt, 7541 SqFt, 7,007 SqFt, 10570 SqFt, 3,685 SqFt, 6,864 SqFt, 2899 SqFt, 2780 SqFt, 1.22 Acres, 7059 SqFt, 6480 SqFt, N/A Condominium, 416 SqM, 255 SqM, 332 SqM, 381 SqM, n/a-condo land, 344 SqM, 0.5ac, 931 SqM, 5 Acres, 23046 SqFt, 10,014 SqFt, 878 SqM, 699 SqM, 4768 SqFt, 4919 sqft, 580 SqM, 5791 SqFt, 3625 SqFt, na, 759 SqM, 541 SqM, 10 Acres, 14 Acres]",76
2,structure_type,"[Townhouse, Detached, Condominium, Semi Detached, High Rise Apartment, Low Rise Apartment, Duplex, Triplex, nan, Fourplex]",10
3,style,"[2 Storey, 1.5 Storey, Bungalow, 1 Storey, 4 Level Split, 2 strorey, Bungalow Raised, 2.5 Storey, one level, 3 Plus Stories, Split Level, 3 Storey, Bi-Level]",13
4,room_total,"[6.0, 9.0, 5.0, 7.0, 10.0, 8.0, 4.0, 15.0, 12.0, nan]",10
5,num_beds,"[3, 4, 2, 5, 1, 7, 3+1, 2+2, nan, 8]",10
6,gla,"[1044, 1500 SqFt, 3000 SqFt, 1283 SqFt, 1530 SqFt, 1746 SqFt, 1602 SqFt, 1751 SqFt, 1523 SqFt, 2456 SqFt, 3332+/-SqFt, 1525 SqFt, 2895 SqFt, 1,678 SqFt, 2264 SqFt, 2530 SqFt, 2149 SqFt, 1840 SqFt, 1354 SqFt, 1573.40 SqFt, 78 SqM, 2974 SqFt, 2956 SqFt, 1109 SqFt, 1580 SqFt, 1587 SqFt, 90 SqM, 1039 SqFt, 261 SqM, 789 SqFt, 1094 SqFt, 1176 SqFt, 1334 SqFt, 2010 SqFt, 246 SqM, 1400 SqFt, 864 SqFt, 1774 SqFt, 1795 SqFt, 3137 SqFt, 1132 SqFt, 1788 SqFt, 1637 SqFt, 1822 SqFt, 1386 SqFt, 522 SqFt, 1607 SqFt, 2755 SqFt, 1634 SqFt, 1170 SqFt, 1,840 SqFt, 1845 SqFt, 1488 SqFt, 2131 SqFt, 1504 SqFt, 1643 SqFt, 1236 SqFt, 1700 SqFt, 1992 SqFt, 2242 SqFt, 758 SqFt, 2231.68 SqFt, 2115 SqFt, 1149 SqFt, 1372 SqFt, 1178 SqFt, 2203 SqFt, 1675 SqFt, 1952 SqFt, 2773 SqFt, 2304 SqFt, 1050 SqFt, 2061 SqFt, 3543.10 SqFt, 2352 SqFt, 2385 SqFt, 186 SqM, 1290 SqFt, 1451 SqFt, 678 SqFt, 665 sf, 584 sf, 1746.79 SqFt, 1144 SqFt, 2,138 SqFt, 2250 SqFt, 1254 SqFt]",87
7,num_baths,"[1:1, 2:1, 3:0, 2:0, 2F 1H, 3:1, 2:2, 1:0, 2F, 2 Full/1Half, 3F 1H, 3F, 4F, 3:2, 1F, 2, nan, 1F 1H]",18
8,condition,"[Average, Good, Excellent, Fair]",4


In [112]:
comps_uni_df

Unnamed: 0,Feature,Unique_Values,Num_Unique
0,distance_to_subject,"[0.15 KM, 0.02 KM, 0.09 KM, 3.73 KM, 8.98 KM, 17.33 KM, 0.31 KM, 0.50 KM, 23.23 KM, 0.78 KM, 0.53 KM, 0.99 KM, 0.00 KM, 1.43 KM, 4.97 KM, 0.38 KM, 1.03 KM, 0.23 KM, 0.64 KM, 0.54 KM, 1.27 KM, 3.54 KM, 3.43 KM, 6.41 KM, 1.11 KM, nan, 1.22 KM, 1.39 KM, 2.09 KM, 11.52 KM, 1.14 KM, 2.91 KM, 1.89 KM, 0.51 KM, 1.12 KM, 1.30 KM, 5.27 KM, 3.50 KM, 1.74 KM, 1.05 KM, 0.91 KM, 1.51 KM, 7.58 KM, 3.46 KM, 0.63 KM, 0.37 KM, 0.07 KM, 0.12 KM, 1.62 KM, 0.06 KM, 1.33 KM, 1.53 KM, 1.77 KM, 1.54 KM, 2.17 KM, 4.69 KM, 3.27 KM, 4.01 KM, 0.43 KM, 0.84 KM, 2.30 KM, 1.92 KM, 0.21 km, 0.67 KM, 0.81 KM, 1.57 KM, 1.61 KM, 0.77 KM, 0.88 KM, 3.01 KM, 0.61 KM, 0.24 KM, 0.17 KM, 1.18 KM, 9.03 KM, 19.97 KM, 0.34 KM, 0.05 KM, .05 km, 2.2 km, 1.7 km, 2.86 KM, 14.66 KM, 2.70 KM, 2.95 KM, 7.38 KM, 9.73 KM, 9.68 KM, 0.93 KM, 9.66 KM, 9.79 KM, 0.92 KM, 1.90 KM, 0.28 KM, 0.22 KM, 0.79 KM, 0.89 KM, 0.48 KM, 0.16 KM, 0.32 KM, ...]",175
1,prop_type,"[Townhouse, Detached, Condominium, Semi Detached, High Rise Apartment, Low Rise Apartment, Duplex, Triplex, nan, Fourplex]",10
2,stories,"[2 Storey, 1 Storey, 1.5 Storey, Bungalow, 4 Level Split, 3 Level Split, Split Level, 2-Storey, Bungalow Raised, 2 1/2 Storey, Bi-Level, Apartment-Low-Rise (, 3 Storey, 2.5 Storey]",14
3,sale_date,"[Oct/25/2024, Feb/05/2025, Feb/14/2025, Jan/16/2025, Dec/21/2024, Nov/23/2024, Mar/26/2025, Dec/24/2024, Apr/16/2025, Feb/20/2025, Oct/11/2024, Feb/17/2025, Apr/15/2025, Feb/18/2025, Dec/13/2024, Apr/14/2025, Nov/18/2024, Mar/31/2025, Mar/23/2025, Dec/05/2024, Dec/09/2024, Mar/29/2025, Feb/24/2025, Nov/21/2024, Oct/18/2024, Apr/07/2025, Feb/08/2025, Jan/27/2025, Jan/25/2025, Dec/19/2024, Jun/28/2024, Mar/09/2025, Apr/02/2025, Mar/04/2025, Feb/21/2025, Mar/21/2025, Sep/20/2024, Mar/19/2025, Nov/16/2024, Nov/20/2024, Jan/31/2025, Mar/28/2025, Dec/03/2024, Sep/06/2024, Jan/20/2025, Mar/27/2025, Dec/17/2024, Mar/20/2025, Feb/12/2025, Mar/10/2025, Nov/22/2024, Jan/24/2025, Dec/26/2024, Nov/01/2024, Aug/25/2024, Feb/07/2025, Mar/08/2025, Oct/12/2024, Feb/27/2025, Feb/04/2025, Feb/16/2025, Feb/26/2025, Jan/03/2025, Jan/06/2025, Apr/04/2025, Mar/11/2025, Mar/12/2025, Mar/18/2025, Apr/03/2025, Feb/13/2025, Mar/15/2025, Jan/10/2025, Mar/06/2025, Sep/25/2024, Mar/24/2025, Jan/17/2025, Sep/23/2024, Feb/06/2025, Dec/31/2024, Mar/16/2025, Dec/20/2024, Sep/21/2024, Feb/19/2025, May/18/2024, Jan/13/2025, Oct/16/2024, Mar/25/2025, Aug/14/2024, Jul/15/2025, Jul/18/2024, Apr/09/2025, Mar/05/2025, Dec/12/2024, Feb/28/2025, Nov/08/2024, Jan/30/2025, Jan/22/2025, Dec/16/2024, Oct/21/2024, Oct/29/2024, ...]",133
4,lot_size,"[N/A Condominium, 80212+/-SqFt, 61700+/-SqFt, 48200+/-SqFt, 15116 sqft, 11138 sqft, 29200 sqft, 390x195 / 1.85 Acres, 225x217 /1.24 Acres, 190x403 / 2 Acres, CONDO - N/A, 7726+/-SqFt, 7623+/-SqFt, 9110+/-SqFt, 2604 SqFt, 3348 SqFt, 1936 SqFt, 35Fx86F/ 2992 SqFt, 25Fx100F/ 2893 SqFt, 55x126/4230 SqFt, N/A - Condo, n/a x n/a 500 SQ M, 486 SQ M, 459 Sq M, 169884+/-SqFt, 24650+/-SqFt, 59396+/-SqFt, 2433 SqFt, 2647 SqFt, 2680 SqFt, 131115+/-SqFt, 11029+/-SqFt, 18036+/-SqFt, 49' x 119' / 5,831 sf, 50' x 118' / 5,900 sf, 60' x 110' / 6,600 sf, 52500+/-SqFt, 50853+/-SqFt, 82700+/-SqFt, 50.15 x 123 / 6196 SqFt, 58.12 x 98.32 / 5600 SqFt, 49.98 x 116.28 / 5823 SqFt, 10.029 Acres, 1.22 Acres, 1.47 acres, 19' x 106' / 2014 SqFt, 17' x 105' / 1785 SqFt, 17' x 58' / 986 SqFt, 3400 SqFt, 3719 SqFt, 3148 SqFt, 285 SqM, 282 SqM, 314 SqM, 492 SqM, 503 SqM, 501 SqM, 514 SqM, 430 SqM, 675 SqM, 47Fx116F/5541 SqFt, 42Fx94F/ 4085 SqFt, 42Fx95F/4230 SqFt, Common Property, 253 SqM, 348 SqM, 406 SqM, 24.88 x 69 / 1722 SqFt, 20.96 x 92.49 / 1941 SqFt, 22.44 x 125.92 / 2417 SqFt, 557 SqM, 677 SqM, 512 SqM, 546 SqM, 51836+/- SqFt, 67188+/- SqFt, 93131+/- SqFt, Condo Property, 41' x 98' / 4014 SqFt, 28' x 172' / 4485 SqFt, 28' x 113' / 4842 SqFt, 28176+/-SqFt, 21446+/-SqFt, 24343+/-SqFt, 390 SqM, 384 SqM, 333 SqM, 40000+/-SqFt, 32393+/-SqFt, 48085+/-SqFt, 8600+/-SqFt, 21730+/-SqFt, 67082+/-SqFt, 37 x 74 / 2841 SqFt, 35 x 111.54 / 3907 SqFt, 35.17 X 111.97 / 3984 SqFt, 22489+/-SqFt, 7000+/-SqFt, 14738+/-SqFt, 3796+/-SqFt, ...]",216
5,condition,"[Superior, Inferior, Similar, Good, Average, Excellent]",6
6,gla,"[1044 SqFt, 1602+/-SqFt, 2100+/-SqFt, 1662+/-SqFt, 2003 SqFt, 2940 SqFt, 2365 SqFt, 1571 SqFt, 1610 SqFt, 1976 SqFt, 1048 SqFt, 1553 SqFt, 1483 SqFt, 1688+/-SqFt, 2298+/-SqFt, 1306 SqFt, 1334 SqFt, 1120 SqFt, 1802 SqFt, 2131 SqFt, 1758 SqFt, 1816+/-SqFt, 2206+/-SqFt, 1342+/-SqFt, 2367 SqFt, 2614 SqFt, 2608 SqFt, 2555+/-SqFt, 1829+/-SqFt, 3024+/-SqFt, 1695 SqFt, 1572 SqFt, 1489 SqFt, 2354+/-SqFt, 2494+/-SqFt, 3774+/-SqFt, 1,805 SqFt, 1,741 SqFt, 2,094 SqFt, 3167+/-SqFt, 2343+/-SqFt, 1664+/-SqFt, 3574 SqFt, 3756 SqFt, 2442 SqFt, 2017 SqFt, 2133 SqFt, 2676 SqFt, 2250 +/- SqFt, 1750 +/- SqFt, 1200 SqFt, 1540 SqFt, 1320 SqFt, 1808 SqFt, 1801 SqFt, 1640 SqFt, 94 SqM, 78 SqM, 2770 SqFt, 2556 SqFt, 3364 SqFt, 2596 SqFt, 3351 SqFt, 3071 SqFt, 965 SqFt, 980.69 SqFt, 1037.62 SqFt, 1425 SqFt, 1580 SqFt, 1440 SqFt, 1569 SqFt, 1740 SqFt, 1515 SqFt, 99 SqM, 58 SqM, 88 SqM, 1130 SqFt, 950 SqFt, 1060.09 SqFt, 285 SqM, 240 SqM, 230 SqM, 934, 881, 875 SqFt, 1288+/-SqFt, 1211+/-SqFt, 1085+/-SqFt, 1340 SqFt, 853SqFt, 1043 SqFt, 1470 SqFt, 1465 SqFt, 1475 SqFt, 1693+/-SqFt, 1100+/-SqFt, 2402+/-SqFt, 233 SqM, 248 SqM, 224 SqM, ...]",247
7,room_count,"[6, 5, 8, 7, 10, 9, 11, 17, 6+3, 12+4, 4, 14+6, 5+4, 8+2, 12, 15, 8+5, 14, 7+3, 13]",20
8,bed_count,"[3, 2, 4, 5, 1, 2+2, 3+1, 6, 2 + 1, 2+ 1, 2+1, 4+3, 8, 7]",14
9,bath_count,"[2:0, 1:0, 2F1P, 3F1P, 2:1, 4:0, 1:1, 2F1H, 3F, 4F, 2F 1F, 3:0, 2:2, 3:1, 4:1, 2F 1H, 2F, 1F, 2 Full/1Half, 3F 1H, 2, 3F1H, 5:0, 2F 2H, 4F 1H, 1F 1H, 2f 1H, 4F 2H, 1F1P]",29


In [113]:
candidates_uni_df

Unnamed: 0,Feature,Unique_Values,Num_Unique
0,bedrooms,"[3.0, 4.0, 5.0, 2.0, 1.0, 0.0, 6.0, nan, 7.0, 8.0, 9.0]",11
1,gla,"[1500.0, 1750.0, 1300.0, nan, 1250.0, 1880.0, 1817.0, 2000.0, 2250.0, 1550.0, 1100.0, 1564.0, 900.0, 1604.0, 1850.0, 1252.0, 1259.0, 2700.0, 1721.0, 2750.0, 1820.0, 1797.0, 1501.0, 1616.0, 1506.0, 1596.0, 550.0, 2470.0, 2482.0, 1156.0, 1493.0, 1504.0, 1119.0, 1335.0, 250.0, 2814.0, 2724.0, 1273.0, 850.0, 1900.0, 950.0, 1136.0, 1026.0, 2117.0, 816.0, 1350.0, 1320.0, 1092.0, 1078.0, 616.0, 1182.0, 320.0, 1130.0, 1185.0, 440.0, 3675.0, 1472.0, 1152.0, 1444.0, 2377.0, 2093.0, 2801.0, 1440.0, 1807.0, 1479.0, 2103.0, 1352.0, 2170.0, 628.0, 2001.0, 1831.0, 1606.0, 1274.0, 925.0, 1355.0, 1278.0, 2450.0, 3247.0, 1840.0, 1684.0, 1491.0, 2225.0, 1225.0, 2188.0, 2122.0, 1570.0, 2575.0, 2095.0, 1709.0, 1452.0, 1727.0, 977.0, 895.0, 2509.0, 1600.0, 782.0, 1121.0, 1715.0, 2496.0, 2164.0, ...]",2079
2,property_sub_type,"[Detached, Freehold Townhouse, Rural Resid, Condo Apt, Condo Apartment, Semi-Detached, Condo Townhouse, Single Family Residence, nan, Common Element Condo, Duplex, Over-Under, Row/Townhouse, Detached, Single Family Residence, Semi Detached, Single Family Residence, Condo/Apt Unit, Link, Other, Vacant Land, Townhouse, Row Unit, 3 Storey, MobileTrailer, Semi Detached (Half Duplex), Residential Land, Stacked, Row Unit, 2 Storey, Multiplex, Apartment, 4 plex, Detached Single Family, Single Family, Residential, Row Unit, Full Duplex, Mobile, Locker, Rural Residential, Fourplex, Triplex, Duplex Up/Down, Detached, Freehold, Detached, Agriculture, Mobile Home, Farm]",44
3,structure_type,"[Detached, Detached, 2-Storey , Freehold Townhouse, Freehold Townhouse, 2-Storey , Rural Resid, Detached, Sidesplit 3 , Detached, Sidesplit 4 , Detached, Bungalow , Apartment, Condo Apartment, Apartment , Detached, Bungalow-Raised , Semi-Detached, Semi-Detached, 2-Storey , Condo Townhouse, 2-Storey , Single Family Residence, Sidesplit , Detached, Backsplit 4 , Condo Apt, Condo Townhouse, Detached, Backsplit 3 , Semi-Detached, Bungalow , Semi-Detached, Bungalow-Raised , Condo Apartment, Bachelor/Studio , Condo Townhouse, 3-Storey , nan, Semi-Detached, Backsplit 3 , Condo Apartment, 2-Storey , Detached, Sidesplit , Common Element Condo, Apartment , Detached, 1 1/2 Storey , Detached, Other , Duplex, Bungalow , Single Family, Duplex, Row/Townhouse, Two Story , Single Family Residence, Two Story , Single Family Residence, 3 Storey , Row/Townhouse, Stacked Townhouse , Single Family Residence, Backsplit , Row/Townhouse, 3 Storey , Detached, Single Family Residence, Bungalow , Row/Townhouse, Bungalow , Single Family Residence, Bungalow , Freehold Townhouse, 3-Storey , Semi Detached, Single Family Residence, Bungalow , Freehold Townhouse, Bungalow , Single Family Residence, 1.5 Storey , Detached, Bungaloft , Condo/Apt Unit, Two Story , Single Family Residence, 2.5 Storey , Condo Townhouse, Stacked Townhouse , Semi Detached, Single Family Residence, Two Story , Condo/Apt Unit, 1 Storey/Apt , Single Family Residence, Bungalow Raised , Link, 2-Storey , Condo Townhouse, 1 Storey/Apt , Other, 2-Storey , Condo Townhouse, Bungalow , Detached, 2 Storey , Detached, Bungalow(1 Storey) , Bungalow, Detached, 3-Storey , Vacant Land , Detached, 1 Storey/Apt , Condominium, Triplex, Condo Apt, Apartment , Row Unit, 3 Storey, 3 Storey , MobileTrailer, Bungalow , Fourplex, Semi Detached (Half Duplex), Row/Townhouse, Residential Land, Stacked Townhse, Stacked, 2 Storey , Condo Apartment, 3-Storey , Row Unit, 2 Storey, 2 Storey , Condo Townhouse, Other , Condo Townhouse, Apartment , Condo Townhouse, Bungaloft , Semi-Detached, 1 1/2 Storey , Freehold Townhouse, 2 1/2 Storey , Multiplex, Detached, 2 1/2 Storey , Semi-Detached, 2 1/2 Storey , Multiplex, 3-Storey , Link, 2 1/2 Storey , Duplex, 2-Storey , Duplex, 3-Storey , Condo Townhouse, Multi-Level , 4 plex, Detached Single Family, Residential, Link, Row Unit, Condo Apartment, 1 Storey/Apt , Full Duplex, Mobile, Condo Apartment, Loft , Condo Apartment, Multi-Level , Condo Apt, Loft , ...]",128
4,style,"[Brick, Vinyl Siding, 2-Storey , 2-Storey, Two, Sidesplit 3 , Sidesplit 4, Sidesplit 4 , Bungalow, Bungalow , nan, Apartment , Bungalow-Raised, Bungalow-Raised , Vinyl Siding, Brick, Backsplit 4, Sidesplit , Backsplit 4 , Backsplit 3 , Bachelor/Studio , 3-Storey , 1 1/2 Storey , Other , Bungalow, 1 Level, Bungalow, 2 Level, 2 Level, Split Entry, 1.5 Level, Detached, Side Split, 3 Level, Two Story , 3 Storey , Stacked Townhouse , Backsplit , 1.5 Storey , Bungaloft , 2.5 Storey , 1 Storey/Apt , Bungalow Raised , 2 Storey , Bungalow(1 Storey) , One, Stone, Wood, Other, Other, Vinyl Siding, Sidesplit 3, 1 Level, Side Split, 4 Level, 2 Level, 3 Level, Multiplex, Townhouse, 3 Level, Split Entry, Townhouse, 2 Level, 3 Level, 3 Level Side Split, Contemporary, Back Split, 2.5 Level, 3 Level, 3 Level, 2 Storey, Attached-Si , 2 Storey Split , 1 and Half Storey , 3 (or more) Storey , Bi-Level , Side Split, 1.5 Level, 5 Level, Multiplex, 5 Level, Cape Cod, 1 3/4 Storey, 3-Storey, Stacked Townhse , 1 1/2 Storey, Apartment, 2 1/2 Storey, 2 1/2 Storey , Multi-Level , Split Entry, 2 Level, Apartment-Low-Rise ( , Apartment-High-Rise , Townhouse , Apartment-Single Lev , Bungalow, Attached-Si , Bi-Level, Attached-Si , 4 Level Split , 2 Storey, Side by Sid , 4 Level Split, Attach , 5 Level Split , 3 (or more) Storey, A , Apartment-Multi Leve , Attached-Up/Down, Bun , Townhouse-Stacked , Single Level Apartment , Condo Apt, 2 Storey, High-Rise (5+) , Bungalow, Attached-Up , Attached-Side by Sid , Apartment-Penthouse , Apartment-Loft/Bache , Multi Level Unit , ...]",136
5,levels,"[Two, 2-Storey , nan, Sidesplit 3 , Sidesplit 4 , One, Bungalow , Apartment , Bungalow-Raised , 3.0, Sidesplit , Backsplit 4 , Backsplit 3 , Bachelor/Studio , 3-Storey , 1 1/2 Storey , Other , Bungalow, Bungalow, 1 Level, Bungalow, 2 Level, 2 Level, Split Entry, 1.5 Level, Detached, Side Split, 3 Level, Two Story , 3 Storey , Stacked Townhouse , Backsplit , 1.5 Storey , Bungaloft , 2.5 Storey , 1 Storey/Apt , Bungalow Raised , 2 Storey , Bungalow(1 Storey) , 2, 2-Storey, 1 Level, Side Split, 4 Level, 2 Level, 3 Level, Multiplex, Townhouse, 3 Level, Split Entry, Townhouse, 2 Level, 3 Level, Other, 3 Level Side Split, Contemporary, Back Split, 2.5 Level, 3 Level, 3 Level, 2 Storey, Attached-Si , 2 Storey Split , 1 and Half Storey , 3 (or more) Storey , Bi-Level , Side Split, 1.5 Level, 5 Level, Multiplex, 5 Level, Cape Cod, 1 3/4 Storey, 3, Three Or More, Stacked Townhse , 2 1/2 Storey , Multi-Level , Split Entry, 2 Level, Apartment-Low-Rise ( , Apartment-High-Rise , Townhouse , Apartment-Single Lev , Bungalow, Attached-Si , Bi-Level, Attached-Si , 4 Level Split , 2 Storey, Side by Sid , 4 Level Split, Attach , 5 Level Split , 3 (or more) Storey, A , Apartment-Multi Leve , Attached-Up/Down, Bun , Townhouse-Stacked , Single Level Apartment , High-Rise (5+) , Bungalow, Attached-Up , Attached-Side by Sid , Apartment-Penthouse , Apartment-Loft/Bache , Multi Level Unit , Bi-Level, Attached-Up , Mobile Home-Double W , Mobile Home-Single W , Attached-Up/Down, Bi- , Attached-Up/Down, Tow , Loft , Mobile / Mini, Single Wide, 2 Level, 5 Level, Back Split, 4 Level, Townhouse, 2 Level, Multiplex, Other, ...]",122
6,room_count,"[11.0, 13.0, 7.0, 12.0, 6.0, 9.0, 10.0, 8.0, 5.0, 14.0, 3.0, 16.0, 18.0, 2.0, 4.0, 17.0, 21.0, 20.0, 15.0, 19.0, 22.0, nan, 1.0, 24.0, 23.0, 25.0, 26.0, 0.0]",28
7,full_baths,"[3.0, nan, 4.0, 2.0, 1.0, 5.0, 0.0, 10.0, 6.0]",9
8,half_baths,"[nan, 0.0, 1.0, 2.0, 3.0]",5
9,lot_size_sf,"[3555.5, 3535.0, 2622.0, 16672.0, 6600.0, 7150.0, 6936.0, 4038.0, 4028.0, 2099.98, nan, 2080.0, 8250.0, 5650.0, 5635.0, 3845.06, 3815.0, 3775.5, 3741.0, 7800.0, 7735.0, 7050.0, 5929.0, 7344.0, 6000.0, 4867.56, 3650.0, 4847.0, 3640.0, 6299.39, 5400.0, 6279.0, 6900.0, 7500.0, 6500.0, 6987.0, 9000.0, 3480.0, 9200.0, 7590.0, 4888.0, 3132.0, 3286.0, 4978.0, 4240.0, 6786.0, 6120.0, 20000.0, 3850.0, 4290.0, 5900.0, 9075.0, 9360.0, 4305.0, 65340.0, 43560.0, 99929.0, 89298.0, 30719.0, 92347.0, 29832.0, 21446.0, 32400.0, 34000.0, 32393.0, 17238.0, 74381.0, 35810.0, 110868.0, 126952.0, 52272.0, 1848.0, 3348.0, 3900.0, 3600.0, 4000.0, 2380.0, 3000.0, 1908.0, 3248.0, 1998.0, 9432.0, 2912.0, 3120.0, 5117.0, 2964.0, 1995.0, 1886.0, 5550.0, 6486.0, 2496.0, 2052.0, 10780.0, 2750.0, 4350.0, 2900.0, 3500.0, 2600.0, 2450.0, 1960.0, ...]",2114


## Cell 8: Standardize Dates to YYYY-MM-DD Format

This function converts date strings from different formats (like `"Apr/11/2025"` or `"2025-01-13"`) into a standard format: **YYYY-MM-DD**.

**Key points:**
- Handles both `"Apr/11/2025"` and `"2025-01-13"` formats.
- Returns `None` for missing or invalid dates.
- Prints a warning if a date can't be parsed (helps you debug messy data).

---
**Example usage:**
```python
test_dates = ['Apr/11/2025', 'Oct/25/2024', '2025-01-13', None, 'bad_date']
for date in test_dates:
    print(f"{date} → {standardize_date(date)}")


In [114]:
def clean_date(date_string):
    """
    Cleans date strings and converts them to a standard format.
    
    Examples:
    - 'Apr/11/2025' → '2025-04-11'
    - 'Oct/25/2024' → '2024-10-25'
    - nan → None
    
    Parameters:
    - date_string: The date value to clean (could be string or NaN)
    
    Returns:
    - Standardized date string or None
    """
    # Check if the value is missing (NaN or None)
    if pd.isna(date_string) or date_string is None:
        return None
    
    try:
        # Use dateutil parser to handle various formats automatically
        parsed_date = parser.parse(str(date_string))
        # Return in standard YYYY-MM-DD format
        return parsed_date.strftime('%Y-%m-%d')
    except:
        # If parsing fails, return None
        print(f"Could not parse date: {date_string}")
        return None

# Test the function
test_dates = ['Apr/11/2025', 'Oct/25/2024', '2025-01-13', None]
for date in test_dates:
    print(f"{date} → {clean_date(date)}")

Apr/11/2025 → 2025-04-11
Oct/25/2024 → 2024-10-25
2025-01-13 → 2025-01-13
None → None


## Cell 9: Clean Area/Numeric Strings with Units

This function **extracts and standardizes numeric area values** from messy strings with units.  
It's designed to handle values like `"1500 SqFt"`, `"78 SqM"`, `"1.25 Acres"`, dimension strings like `"49' x 119'"`, and even hybrid formats like `"50' x 118' / 5,900 sf"`.

**How it works:**
- **If both a dimension and an explicit area are present**, it uses the area (e.g., `"50' x 118' / 5,900 sf"` returns `5900`).
- Handles common area units: **SqFt, SqM, Acres** (all returned in square feet).
- Removes commas, plus/minus signs, and ignores N/A values.
- For plain numbers (no unit), assumes square feet.
- Returns `None` if the value is missing or can't be parsed.
---
**Example usage:**
```python
test_values = [
    "1500 SqFt", "78 SqM", "1.25 Acres", "n/a", "3,555.5",
    "50' x 118' / 5,900 sf", "49' x 119'", "60' x 110' / 6,600 sf", "82700", "58 SqM"
]
for val in test_values:
    print(f"{val} → {clean_numeric_with_units(val)}")


In [115]:
def clean_numeric_with_units(value):
    """
    Extracts numeric values from strings with units and returns area in sqft.
    - If both a dimension and explicit area are given, prefers the area (e.g., '50\' x 118\' / 5,900 sf' returns 5900).
    - Handles 'SqFt', 'SqM', 'Acre', dimension strings, commas, +/-.
    - Returns float in sqft or None.
    """
    if pd.isna(value) or value is None:
        return None

    value_str = str(value).lower().strip()
    if value_str in ['n/a', 'na', 'nan', '', 'n/a condominium', 'n/a-condo land']:
        return None
    value_str = value_str.replace(',', '').replace('+/-', '')

    # First, check for explicit numeric area in the string (e.g., '/ 5900 sf')
    area_match = re.findall(r'(\d+\.?\d*)\s*(?:sq ?ft|sf|sq\.? ?ft|s\.?f\.?)', value_str)
    if area_match:
        # If multiple, get the biggest (more likely to be the correct one)
        return float(max([float(x) for x in area_match]))

    # If not, check for dimension: e.g., 50' x 118'
    dim_match = re.search(r'(\d+\.?\d*)\s*[\'ft]?\s*[x×]\s*(\d+\.?\d*)\s*[\'ft]?', value_str)
    if dim_match:
        width = float(dim_match.group(1))
        length = float(dim_match.group(2))
        return width * length

    # Check for acres
    if 'acre' in value_str or 'ac' in value_str:
        acre_match = re.search(r'(\d+\.?\d*)', value_str)
        if acre_match:
            return float(acre_match.group(1)) * 43560

    # Check for sqm
    if 'sqm' in value_str or 'sq m' in value_str:
        sqm_match = re.search(r'(\d+\.?\d*)', value_str)
        if sqm_match:
            return float(sqm_match.group(1)) * 10.764

    # Check for plain number (assume sqft)
    number_match = re.search(r'(\d+\.?\d*)', value_str)
    if number_match:
        return float(number_match.group(1))

    return None

# Test
test_values = [
    "1500 SqFt", "78 SqM", "1.25 Acres", "n/a", "3,555.5",
    "50' x 118' / 5,900 sf", "49' x 119'", "60' x 110' / 6,600 sf", "486 SQ M", "0.5ac"
]
for val in test_values:
    print(f"{val} → {clean_numeric_with_units(val)}")


1500 SqFt → 1500.0
78 SqM → 839.592
1.25 Acres → 54450.0
n/a → None
3,555.5 → 3555.5
50' x 118' / 5,900 sf → 5900.0
49' x 119' → 5831.0
60' x 110' / 6,600 sf → 6600.0
486 SQ M → 5231.304
0.5ac → 21780.0


## Cell 10: Clean and Standardize Room Count Values

This function, `clean_room_count`, cleans up room count strings and converts them to a single float value.

**How it works:**
- Handles simple numbers (`'6' → 6.0`)
- Handles composite counts (`'6+3' → 9.0`)
- Ignores missing, empty, or N/A values
- Returns `None` if the value can't be converted
---
**Example usage:**
```python
test_rooms = ['6', '6+3', '12+4', '15', None, 'n/a', '8+2']
for room in test_rooms:
    print(f"{room} → {clean_room_count(room)}")


In [116]:
def clean_room_count(value):
    """
    Cleans room count values.

    Examples:
    - '6' → 6.0
    - '6+3' → 9.0 (e.g., main rooms + den/bonus)
    - '8+2' → 10.0
    - nan, 'n/a', '' → None

    Parameters:
    - value: The room count value

    Returns:
    - Float total room count or None
    """
    if pd.isna(value) or value is None:
        return None

    value_str = str(value).strip()

    # Check for empty or n/a values
    if value_str.lower() in ['n/a', 'na', '', 'nan']:
        return None

    # Handle "X+Y" format (e.g., "6+3")
    if '+' in value_str:
        parts = value_str.split('+')
        try:
            total = sum(float(part.strip()) for part in parts)
            return total
        except:
            return None

    # Try to convert directly to float
    try:
        return float(value_str)
    except:
        return None

# Test the function
test_rooms = ['6', '6+3', '12+4', '15', None, 'n/a', '8+2']
for room in test_rooms:
    print(f"{room} → {clean_room_count(room)}")

6 → 6.0
6+3 → 9.0
12+4 → 16.0
15 → 15.0
None → None
n/a → None
8+2 → 10.0


## Cell 11: Clean and Standardize Bathroom Count Values

The `clean_bathroom_count` function standardizes various messy bathroom count formats and converts them to a single float value (where half baths count as 0.5).

**How it works:**
- Supports formats like:
  - `'2:0'` or `'2:1'` (full:half bath notation)
  - `'2F 1H'` or `'3F'` (X full, Y half notation)
  - `'2 Full/1Half'` or similar (case-insensitive)
  - Simple numbers like `'3'`
- Missing/empty/N/A values return `None`
- Returns a **float** with half baths counted as `0.5` (e.g., `2:1` → `2.5`)
---
**Example usage:**
```python
test_bathrooms = ['2:0', '2:1', '2F 1H', '3F', '1:2', '2 Full/1Half', '3', None]
for bath in test_bathrooms:
    print(f"{bath} → {clean_bathroom_count(bath)}")


In [117]:
def clean_bathroom_count(value):
    """
    Cleans bathroom count values with various formats.
    
    Examples:
    - '2:0' → 2.0 (2 full, 0 half)
    - '2:1' → 2.5 (2 full, 1 half)
    - '2F 1H' → 2.5 (2 full, 1 half)
    - '3F' → 3.0 (3 full)
    - '1:1' → 1.5
    
    Parameters:
    - value: The bathroom count value
    
    Returns:
    - Float number of bathrooms (half baths count as 0.5) or None
    """
    if pd.isna(value) or value is None:
        return None
    
    value_str = str(value).upper().strip()

    value_str = value_str.replace('P', 'H')
    
    # Check for empty or n/a values
    if value_str.lower() in ['n/a', 'na', '', 'nan']:
        return None
    
    # Handle "X:Y" format (full:half)
    if ':' in value_str:
        parts = value_str.split(':')
        try:
            full = float(parts[0].strip())
            half = float(parts[1].strip()) if len(parts) > 1 else 0
            return full + (half * 0.5)
        except:
            return None
    
    # Handle "XF YH" format (X full, Y half)
    if 'F' in value_str or 'H' in value_str:
        full_match = re.search(r'(\d+)\s*F', value_str)
        half_match = re.search(r'(\d+)\s*H', value_str)
        
        full = float(full_match.group(1)) if full_match else 0
        half = float(half_match.group(1)) if half_match else 0
        
        return full + (half * 0.5)
    
    # Handle "X Full/Y Half" format
    if 'FULL' in value_str or 'HALF' in value_str:
        full_match = re.search(r'(\d+)\s*FULL', value_str)
        half_match = re.search(r'(\d+)\s*HALF', value_str)
        
        full = float(full_match.group(1)) if full_match else 0
        half = float(half_match.group(1)) if half_match else 0
        
        return full + (half * 0.5)
    
    # Try direct conversion
    try:
        return float(value_str)
    except:
        return None

# Test the function
test_bathrooms = ['2:0', '2:1', '2F 1H', '3F', '1:2', '2F1P', '3', None]
for bath in test_bathrooms:
    print(f"{bath} → {clean_bathroom_count(bath)}")

2:0 → 2.0
2:1 → 2.5
2F 1H → 2.5
3F → 3.0
1:2 → 2.0
2F1P → 2.5
3 → 3.0
None → None


## Cell 12: Clean Distance Values

This function converts distance strings to a float (in kilometers).

- Strips units and spaces (e.g., "0.15 KM" → 0.15).
- Returns `None` if value is missing or cannot be converted.

**Example usage:**
```python
clean_distance('0.21 km')   # Returns: 0.21
clean_distance('.05 km')    # Returns: 0.05
clean_distance(None)        # Returns: None


In [118]:
def clean_distance(value):
    """
    Cleans distance values and converts to kilometers.
    
    Examples:
    - '0.15 KM' → 0.15
    - '0.21 km' → 0.21
    - '.05 km' → 0.05
    
    Parameters:
    - value: The distance value
    
    Returns:
    - Float distance in kilometers or None
    """
    if pd.isna(value) or value is None:
        return None
    
    value_str = str(value).lower().strip()
    
    # Remove 'km' and spaces
    value_str = value_str.replace('km', '').strip()
    
    try:
        return float(value_str)
    except:
        return None

# Test the function
test_distances = ['0.15 KM', '0.21 km', '.05 km', None]
for dist in test_distances:
    print(f"{dist} → {clean_distance(dist)}")

0.15 KM → 0.15
0.21 km → 0.21
.05 km → 0.05
None → None


## Cell 13: Property Type Mapping

This dictionary maps messy or alternate property type names to our standard set of property types (`CANONICAL_TYPES`).  
Any value not in this list will be set to `None`.

In [119]:
manual_map = {
    # Detached houses
    "detached": "Detached",
    "detached single family": "Detached",
    "single family": "Detached",
    "single family residence": "Detached",
    "rural resid": "Detached",
    "rural residential": "Detached",
    "agriculture": "Detached",
    "farm": "Detached",
    "freehold": "Detached",
    "mobile": "Detached",
    "mobile home": "Detached",
    "mobiletrailer": "Detached",
    "mobile trailer": "Detached",

    # Semi-detached and Link homes
    "semi-detached": "Semi Detached",
    "semi detached": "Semi Detached",
    "semi detached (half duplex)": "Semi Detached",
    "link": "Semi Detached",

    # Townhouses and similar
    "townhouse": "Townhouse",
    "freehold townhouse": "Townhouse",
    "row/townhouse": "Townhouse",
    "row townhouse": "Townhouse",
    "row unit": "Townhouse",
    "row unit 2 storey": "Townhouse",
    "row unit 3 storey": "Townhouse",
    "stacked": "Townhouse",
    "stacked townhouse": "Townhouse",

    # Condos
    "Low Rise Apartment": "Condominium",
    "High Rise Apartment": "Condominium",
    "condo apt": "Condominium",
    "condo townhouse": "Condominium",
    "condo apartment": "Condominium",
    "condo/apt unit": "Condominium",
    "condo unit": "Condominium",
    "common element condo": "Condominium",
    "apartment": "Condominium",

    # Duplexes, Triplexes, Fourplexes
    "duplex": "Duplex",
    "over-under": "Duplex",
    "over under": "Duplex",
    "duplex up/down": "Duplex",
    "full duplex": "Duplex",
    "triplex": "Triplex",
    "fourplex": "Fourplex",
    "4 plex": "Fourplex",
}

CANONICAL_TYPES = [
    "Townhouse", "Detached", "Condominium", "Semi Detached",
    "High Rise Apartment", "Low Rise Apartment", "Duplex", "Triplex", "Fourplex"
]

## Cell 14: Rapid Fuzzy Function

This function maps a property type string to a standardized value using our manual mapping:

- Checks for an **exact match** in the mapping dictionary.
- If no exact match, uses **fuzzy string matching** (`rapidfuzz`) to find the closest match above the `score_cutoff` threshold.
- Returns the standardized type if found, otherwise returns `None`.

---
#### **Example Usage**
Suppose you have some messy property type values in your dataset:

```python
values = ["detachd", "row unit", "condo aprtment", "semi det", "moblie home"]

detachd         -> Detached
row unit        -> Townhouse
condo aprtment  -> Condominium
semi det        -> Semi Detached
moblie home     -> Detached



In [120]:
def get_manual_type_fuzzy(value, mapping, score_cutoff=80):
    """
    Returns mapped type if a close match exists in the mapping dict, else None.
    Uses rapidfuzz for fuzzy string matching.
    """
    if not value:
        return None
    value = str(value).lower().strip()
    # Exact match
    if value in mapping:
        return mapping[value]
    # Fuzzy match
    best, score, _ = process.extractOne(
        value, list(mapping.keys()), scorer=fuzz.ratio
    )
    if score >= score_cutoff:
        return mapping[best]
    return None

## Cell 15: Cononical check function 


In [121]:
def canonical_check(mapped_type):
    """Return mapped_type if in canonical list, else None"""
    if mapped_type in CANONICAL_TYPES:
        return mapped_type
    return None

## Cell 16: Property Type Standardization

The `standardize_property_type` function combines `property_sub_type` and `structure_type` from each property record and returns a single, standardized property type.

#### How it works:
- **Prefers `property_sub_type`** if available, otherwise uses `structure_type`.
- **Manual & fuzzy matching:** First tries to map the value to a standard type using our manual dictionary (with fuzzy matching for typos and close matches).
- **Canonical check:** Only returns the type if it’s in the approved `CANONICAL_TYPES` list.
- **Substring & keyword logic:** For complex or combined values, it looks for canonical type substrings or key terms (e.g., "condo", "duplex") to determine the best match.
- **Returns `None`** if the value cannot be matched to a canonical property type.

This ensures all property types are consistently categorized for analysis and modeling.


In [122]:
def standardize_property_type(row):
    """
    Combines property_sub_type and structure_type intelligently.
    Always returns a value from CANONICAL_TYPES or None.
    """
    sub_type = row.get('property_sub_type', None)
    struct_type = row.get('structure_type', None)
    sub_type = str(sub_type).strip().lower() if pd.notna(sub_type) and sub_type not in [None, ''] else None
    struct_type = str(struct_type).strip().lower() if pd.notna(struct_type) and struct_type not in [None, ''] else None

    # Prefer sub_type if both exist
    value = sub_type if sub_type else struct_type
    if not value:
        return None

    # Try manual_type_map (exact + fuzzy)
    mapped_type = get_manual_type_fuzzy(value, manual_map, score_cutoff=80)
    result = canonical_check(mapped_type)
    if result is not None:
        return result

    # Try canonical type substrings
    for canon in CANONICAL_TYPES:
        if canon.lower() in value:
            return canon

    # Keyword-based logic (catch broad classes)
    if "condo" in value:
        return "Condominium"
    if "townhouse" in value or "row unit" in value:
        return "Townhouse"
    if "duplex" in value:
        return "Duplex"
    if "triplex" in value:
        return "Triplex"
    if "fourplex" in value or "4 plex" in value:
        return "Fourplex"
    if "semi" in value:
        return "Semi Detached"
    if "detached" in value or "single family" in value or "mobile" in value or "farm" in value:
        return "Detached"

    # Fallback: None (doesn't match canonical)
    return None

# ---- TEST CASES ----
test_rows = [
    {'property_sub_type': 'Freehold Townhouse', 'structure_type': None},
    {'property_sub_type': None, 'structure_type': 'Detached, 2-Storey'},
    {'property_sub_type': None, 'structure_type': None},
    {'property_sub_type': 'Duplex', 'structure_type': 'Detached'},
    {'property_sub_type': 'Condo Townhouse', 'structure_type': 'Condo Townhouse'},
    {'property_sub_type': 'Triplex', 'structure_type': 'Semi-Detached'},
    {'property_sub_type': 'Apartment', 'structure_type': ''},
    {'property_sub_type': '', 'structure_type': 'MobileHome'},
    {'property_sub_type': 'Semi Detached', 'structure_type': 'Duplex'},
    {'property_sub_type': 'Bungalow', 'structure_type': 'Detached, Bungalow'},
 
]

for i, row in enumerate(test_rows, 1):
    print(f"Test {i}: {row} → {standardize_property_type(row)}")


Test 1: {'property_sub_type': 'Freehold Townhouse', 'structure_type': None} → Townhouse
Test 2: {'property_sub_type': None, 'structure_type': 'Detached, 2-Storey'} → Detached
Test 3: {'property_sub_type': None, 'structure_type': None} → None
Test 4: {'property_sub_type': 'Duplex', 'structure_type': 'Detached'} → Duplex
Test 5: {'property_sub_type': 'Condo Townhouse', 'structure_type': 'Condo Townhouse'} → Condominium
Test 6: {'property_sub_type': 'Triplex', 'structure_type': 'Semi-Detached'} → Triplex
Test 7: {'property_sub_type': 'Apartment', 'structure_type': ''} → Condominium
Test 8: {'property_sub_type': '', 'structure_type': 'MobileHome'} → Detached
Test 9: {'property_sub_type': 'Semi Detached', 'structure_type': 'Duplex'} → Semi Detached
Test 10: {'property_sub_type': 'Bungalow', 'structure_type': 'Detached, Bungalow'} → None


## Cell 17: Story/Style Mapping

`STORY_GROUPS` lists our standard house style types.

`STOREY_MAP` converts messy or alternate style values into those standard groups.

Values not found in the map become `None`.


In [123]:
STORY_GROUPS = [
    "Bungalow",
    "Bungalow Raised",
    "1 Storey",
    "1.5 Storey",
    "2 Storey",
    "2.5 Storey",
    "3 Storey",
    "3+ Storey",
    "Split Level",
    "Bi-Level"
]

STOREY_MAP = {
    # Bungalow types
    "bungalow": "Bungalow",
    "bungalow raised": "Bungalow Raised",
    "bungalow(1 storey)": "Bungalow",
    "bungaloft": "Bungalow",   # Optionally, you could keep this separate
    "bun": "Bungalow",         # Typo, treat as bungalow

    # Single storey
    "1 storey": "1 Storey",
    "1 level": "1 Storey",
    "1": "1 Storey",
    "one": "1 Storey",
    "single level apartment": "1 Storey",
    "1 Storey/Apt": "1 Storey",
    "one level": "1 Storey",

    # 1.5 Storey
    "1.5 storey": "1.5 Storey",
    "1 1/2 storey": "1.5 Storey",
    "1 and half storey": "1.5 Storey",
    "1.5 level": "1.5 Storey",
    "1 3/4 storey": "1.5 Storey",

    # 2 Storey
    "2 storey": "2 Storey",
    "two story": "2 Storey",
    "2 level": "2 Storey",
    "attached-si": "2 Storey",    # Often attached 2 storey
    "2 storey split": "2 Storey", # Or could be Split Level
    "two": "2 Storey",
    "2": "2 Storey",

    # 2.5 Storey
    "2.5 storey": "2.5 Storey",
    "2 1/2 storey": "2.5 Storey",
    "2.5 level": "2.5 Storey",

    # 3 Storey and up
    "3 storey": "3 Storey",
    "3-storey": "3 Storey",
    "3 level": "3 Storey",
    "3+ storey": "3+ Storey",
    "3 plus stories": "3+ Storey",
    "3 (or more) storey": "3+ Storey",
    "3": "3 Storey",
    "3.0": "3 Storey",
    "three or more": "3+ Storey",
    "3 level side split": "Split Level",   # Likely split

    # 4/5+ Storey (usually not single family, but handle for completeness)
    "4 level split": "Split Level",
    "5 level split": "Split Level",
    "5 level": "Split Level",

    # Split levels
    "split level": "Split Level",
    "split entry": "Split Level",
    "sidesplit": "Split Level",
    "side split": "Split Level",
    "sidesplit 3": "Split Level",
    "sidesplit 4": "Split Level",
    "backsplit": "Split Level",
    "backsplit 3": "Split Level",
    "backsplit 4": "Split Level",
    "multi-level": "Split Level",
    "multi level unit": "Split Level",
    "multi level": "Split Level",

    # Bi-Level
    "bi-level": "Bi-Level",
    "bi level": "Bi-Level",
}




## Cell 18 - 19: `clean_story_group` Function

This function standardizes story/style values for properties:

- Cleans and lowercases the input.
- Maps the value to a standard group using `STOREY_MAP` (with fuzzy matching).
- If not found, checks for a standard group as a substring in the value.
- Returns the standard group name, or `None` if no match is found.


In [124]:
def clean_story_group(value):
    """
    Cleans up story/style values and groups into standard STORY_GROUPS for subject/comps.
    """
    if not value or pd.isna(value):
        return None
    value = str(value).lower().strip()
    mapped = get_manual_type_fuzzy(value, STOREY_MAP)
    if mapped in STORY_GROUPS:
        return mapped
    # Substring fallback (catch e.g. "something 2 Storey", etc.)
    for group in STORY_GROUPS:
        if group.lower() in value:
            return group
    return None

In [125]:
##-----------Test case---------##

test_story_values = [
    "2 Storey", "1.5 Storey", "Bungalow", "1 Storey", "4 Level Split",
    "2-storey", "Bungalow Raised", "2.5 Storey", "one level",
    "3 Plus Stories", "Split Level", "3+ Storey", "1 level", "Bun", '1'
]

print("Original".ljust(20), "→", "Grouped As")
for val in test_story_values:
    print(val.ljust(20), "→", clean_story_group(val))


Original             → Grouped As
2 Storey             → 2 Storey
1.5 Storey           → 1.5 Storey
Bungalow             → Bungalow
1 Storey             → 1 Storey
4 Level Split        → Split Level
2-storey             → 2 Storey
Bungalow Raised      → Bungalow Raised
2.5 Storey           → 2.5 Storey
one level            → 1 Storey
3 Plus Stories       → 3+ Storey
Split Level          → Split Level
3+ Storey            → 3+ Storey
1 level              → 1 Storey
Bun                  → Bungalow
1                    → 1 Storey


## Cell 20 - 21: Standardize Storey/Style for Candidate Properties

This function cleans and standardizes the story/style type for candidate properties by:
- Preferring the 'style' field, then checking 'levels' if needed.
- Using our mapping and fallback logic to ensure values are grouped into standard STORY_GROUPS.
- Returns None if no valid mapping is found.

This helps make all storey/style data consistent for modeling and analysis.


In [126]:
def standardize_candidate_storey(row):
    """
    Cleans and standardizes story/style info for candidate properties.
    Prefers 'style', then 'levels', else returns None.
    Uses clean_story_group for the mapping logic.
    """
    style = row.get('style', None)
    levels = row.get('levels', None)

    # Try style first
    result = clean_story_group(style)
    if result:
        return result

    # If style fails, try levels
    result = clean_story_group(levels)
    if result:
        return result

    # If both fail, return None
    return None


In [127]:
# --- Example Test Cases for standardize_candidate_storey ---

test_rows = [
    {'style': '2 Storey, Attached-Si', 'levels': '2 Storey, Attached-Si'},       # Should match '2 Storey'
    {'style': 'Bungalow-Raised', 'levels': None},                                # Should match 'Bungalow Raised'
    {'style': None, 'levels': '1 1/2 Storey'},                                   # Should match '1.5 Storey'
    {'style': '3-Storey', 'levels': '3 Level'},                                  # Should match '3 Storey'
    {'style': 'Split Entry', 'levels': ''},                                      # Should match 'Split Level'
    {'style': None, 'levels': None},                                             # Should return None
    {'style': 'Backsplit 4', 'levels': 'Split Level'},                           # Should match 'Split Level'
    {'style': 'Apartment-High-Rise', 'levels': None},                            # Should return None (not in story group)
    {'style': '', 'levels': 'One Level'},                                        # Should match '1 Storey'
    {'style': 'Something Unusual', 'levels': 'Something Else'},                  # Should return None
    {'style': 'Bungaloft', 'levels': ''},                                        # Should match 'Bungalow'
    {'style': '2', 'levels': ''},                                                # Should match '2 Storey' (if mapped in STOREY_MAP)
    {'style': 'Bi-Level', 'levels': None},                                       # Should match 'Bi-Level'
]

print("---- STANDARDIZATION TEST CASES ----")
for i, row in enumerate(test_rows, 1):
    result = standardize_candidate_storey(row)
    print(f"Test {i:2}: {row} --> {result}")


---- STANDARDIZATION TEST CASES ----
Test  1: {'style': '2 Storey, Attached-Si', 'levels': '2 Storey, Attached-Si'} --> 2 Storey
Test  2: {'style': 'Bungalow-Raised', 'levels': None} --> Bungalow Raised
Test  3: {'style': None, 'levels': '1 1/2 Storey'} --> 1.5 Storey
Test  4: {'style': '3-Storey', 'levels': '3 Level'} --> 3 Storey
Test  5: {'style': 'Split Entry', 'levels': ''} --> Split Level
Test  6: {'style': None, 'levels': None} --> None
Test  7: {'style': 'Backsplit 4', 'levels': 'Split Level'} --> Split Level
Test  8: {'style': 'Apartment-High-Rise', 'levels': None} --> None
Test  9: {'style': '', 'levels': 'One Level'} --> 1 Storey
Test 10: {'style': 'Something Unusual', 'levels': 'Something Else'} --> None
Test 11: {'style': 'Bungaloft', 'levels': ''} --> Bungalow
Test 12: {'style': '2', 'levels': ''} --> 2 Storey
Test 13: {'style': 'Bi-Level', 'levels': None} --> Bi-Level


## Cell 22: Clean All Property Data

This master function applies all our cleaning logic to the full subjects, comps, and candidates DataFrames.  
It creates new columns for each cleaned feature and returns the cleaned DataFrames.

**How it works:**

- **Makes copies** of all input DataFrames so the originals are unchanged.
- For each dataset, applies all relevant cleaning functions (date, lot size, GLA, room counts, property type, and stories/styles).
- Combines raw columns and parsed/cleaned columns for full traceability.
- Prints a summary showing the number and percentage of non-missing values in each cleaned feature.

**Returns:**
- `subjects_clean`, `comps_clean`, `candidates_clean`: The cleaned DataFrames, ready for analysis or modeling.

**Example usage:**
```python
subjects_clean, comps_clean, candidates_clean = clean_all_property_data(subjects_df, comps_df, candidates_df)


In [128]:
def clean_all_property_data(subjects_df, comps_df, candidates_df):
    """
    Master function to clean all property datasets with consistent clean column names.
    """
    # Make copies
    subjects_clean = subjects_df.copy()
    comps_clean = comps_df.copy()
    candidates_clean = candidates_df.copy()
    
    print("Starting comprehensive data cleaning...")
    print("-" * 50)
    
    # ========== CLEAN SUBJECTS ==========
    print("\n📋 Cleaning SUBJECTS data...")
    subjects_clean['effective_date_clean'] = subjects_clean['effective_date'].apply(clean_date)
    subjects_clean['lot_size_clean'] = subjects_clean['lot_size_sf'].apply(clean_numeric_with_units)
    subjects_clean['gla_clean'] = subjects_clean['gla'].apply(clean_numeric_with_units)
    subjects_clean['room_count_clean'] = subjects_clean['room_total'].apply(clean_room_count)
    subjects_clean['bedrooms_clean'] = subjects_clean['num_beds'].apply(clean_room_count)
    subjects_clean['bathrooms_clean'] = subjects_clean['num_baths'].apply(clean_bathroom_count)
    subjects_clean['property_type_clean'] = subjects_clean.apply(
        lambda row: standardize_property_type({'property_sub_type': None, 'structure_type': row['structure_type']}), axis=1)
    subjects_clean['stories_clean'] = subjects_clean['style'].apply(clean_story_group)


    # ========== CLEAN COMPS ==========
    print("\n📋 Cleaning COMPS data...")
    comps_clean['distance_to_subject_clean'] = comps_clean['distance_to_subject'].apply(clean_distance)
    comps_clean['sale_date_clean'] = comps_clean['sale_date'].apply(clean_date)
    comps_clean['lot_size_clean'] = comps_clean['lot_size'].apply(clean_numeric_with_units)
    comps_clean['gla_clean'] = comps_clean['gla'].apply(clean_numeric_with_units)
    comps_clean['room_count_clean'] = comps_clean['room_count'].apply(clean_room_count)
    comps_clean['bedrooms_clean'] = comps_clean['bed_count'].apply(clean_room_count)
    comps_clean['bathrooms_clean'] = comps_clean['bath_count'].apply(clean_bathroom_count)
    comps_clean['property_type_clean'] = comps_clean.apply(
        lambda row: standardize_property_type({'property_sub_type': row.get('prop_type'), 'structure_type': None}), axis=1)
    comps_clean['stories_clean'] = comps_clean['stories'].apply(clean_story_group)

    # ========== CLEAN CANDIDATES ==========
    print("\n📋 Cleaning CANDIDATES data...")
    candidates_clean['close_date_clean'] = candidates_clean['close_date'].apply(clean_date)
    candidates_clean['lot_size_clean'] = candidates_clean['lot_size_sf'].apply(clean_numeric_with_units)
    candidates_clean['gla_clean'] = candidates_clean['gla'].apply(clean_numeric_with_units)
    candidates_clean['room_count_clean'] = candidates_clean['room_count'].apply(clean_room_count)
    candidates_clean['bedrooms_clean'] = candidates_clean['bedrooms'].apply(clean_room_count)
    # Bathrooms: combine full & half into a single clean value
    candidates_clean['full_baths_clean'] = pd.to_numeric(candidates_clean['full_baths'], errors='coerce')
    candidates_clean['half_baths_clean'] = pd.to_numeric(candidates_clean['half_baths'], errors='coerce')
    candidates_clean['bathrooms_clean'] = (
        candidates_clean['full_baths_clean'].fillna(0) +
        (candidates_clean['half_baths_clean'].fillna(0) * 0.5)
    )
    candidates_clean['property_type_clean'] = candidates_clean.apply(standardize_property_type, axis=1)
    candidates_clean['stories_clean'] = candidates_clean.apply(standardize_candidate_storey, axis=1)

    # ========== SUMMARY STATISTICS ==========
    print("\n📊 Cleaning Summary:")
    print("-" * 50)
    print(f"\nSUBJECTS ({len(subjects_clean)} records):")
    for col in ['effective_date_clean', 'lot_size_clean', 'gla_clean', 'bedrooms_clean', 'bathrooms_clean', 'property_type_clean', 'stories_clean']:
        non_null = subjects_clean[col].notna().sum()
        pct = (non_null / len(subjects_clean)) * 100
        print(f"  - {col}: {non_null}/{len(subjects_clean)} ({pct:.1f}% populated)")
    
    print(f"\nCOMPS ({len(comps_clean)} records):")
    for col in ['distance_to_subject_clean', 'sale_date_clean', 'lot_size_clean', 'gla_clean', 'bedrooms_clean', 'bathrooms_clean', 'property_type_clean', 'stories_clean']:
        non_null = comps_clean[col].notna().sum()
        pct = (non_null / len(comps_clean)) * 100
        print(f"  - {col}: {non_null}/{len(comps_clean)} ({pct:.1f}% populated)")
    
    print(f"\nCANDIDATES ({len(candidates_clean)} records):")
    for col in ['close_date_clean', 'lot_size_clean', 'gla_clean', 'bedrooms_clean', 'bathrooms_clean', 'property_type_clean', 'stories_clean']:
        non_null = candidates_clean[col].notna().sum()
        pct = (non_null / len(candidates_clean)) * 100
        print(f"  - {col}: {non_null}/{len(candidates_clean)} ({pct:.1f}% populated)")
    
    print("\n✅ Data cleaning complete!")
    return subjects_clean, comps_clean, candidates_clean


## Cell 23: Saving to CSV

In [129]:
cleaned_subjects, cleaned_comps, cleaned_candidates = clean_all_property_data(
    subjects_df, comps_df, candidates_df
)
# Save as needed:
cleaned_subjects.to_csv('../data/cleaned/subjects_cleaned.csv', index=False)
cleaned_comps.to_csv('../data/cleaned/comps_cleaned.csv', index=False)  
cleaned_candidates.to_csv('../data/cleaned/candidates_cleaned.csv', index=False)


Starting comprehensive data cleaning...
--------------------------------------------------

📋 Cleaning SUBJECTS data...

📋 Cleaning COMPS data...

📋 Cleaning CANDIDATES data...

📊 Cleaning Summary:
--------------------------------------------------

SUBJECTS (88 records):
  - effective_date_clean: 88/88 (100.0% populated)
  - lot_size_clean: 72/88 (81.8% populated)
  - gla_clean: 88/88 (100.0% populated)
  - bedrooms_clean: 87/88 (98.9% populated)
  - bathrooms_clean: 87/88 (98.9% populated)
  - property_type_clean: 87/88 (98.9% populated)
  - stories_clean: 88/88 (100.0% populated)

COMPS (264 records):
  - distance_to_subject_clean: 259/264 (98.1% populated)
  - sale_date_clean: 264/264 (100.0% populated)
  - lot_size_clean: 216/264 (81.8% populated)
  - gla_clean: 264/264 (100.0% populated)
  - bedrooms_clean: 264/264 (100.0% populated)
  - bathrooms_clean: 264/264 (100.0% populated)
  - property_type_clean: 261/264 (98.9% populated)
  - stories_clean: 262/264 (99.2% populated)

CAN