# Combine Results of BHI and Zonda Models

In [1]:
import pandas as pd

# Load the BHI model results.
bhi_df = pd.read_csv('../outputs/home_prices_bhi.csv')
# Normalise Zip to a string and drop a float-style '.0' suffix if present.
# NOTE: str.rstrip('.0') strips any trailing run of the characters '.' and '0',
# which truncates zips ending in zero ('33830' -> '3383'); an anchored regex
# removes only the literal '.0' suffix.
bhi_df['Zip'] = bhi_df['Zip'].astype('str').str.replace(r'\.0$', '', regex=True)
bhi_home_count = bhi_df.shape[0]
print(bhi_df.shape)
bhi_df.head()

(34693, 13)


Unnamed: 0,Brand,Plan Name,City,State,Zip,Base Sq Ft,Bedrooms,Baths,Garage,Stories,Base Price,Predicted Price,% Difference
0,ABD Development,Courtyard 50,Davenport,FL,33837,2530.0,4.0,3.0,2.0,2.0,594000.0,629078.0,5.736
1,ABD Development,Marbella 3BR,Davenport,FL,33837,1904.0,3.0,2.0,2.0,1.0,475000.0,579921.0,19.891684
2,ABD Development,Marbella 4BR,Davenport,FL,33837,2192.0,4.0,3.0,2.0,1.0,536000.0,617495.0,14.130111
3,ABD Development,Sienna,Davenport,FL,33837,2293.0,4.0,3.0,2.0,1.0,650000.0,629020.0,-3.280597
4,ABD Development,The Bristol,Palm Coast,FL,32137,5021.0,4.0,4.0,3.0,2.0,1110000.0,1121192.0,1.003242


In [2]:
# Load the Zonda model results.
zonda_df = pd.read_csv('../outputs/home_prices_zonda.csv')
# Normalise Zip to a string and drop a float-style '.0' suffix if present.
# NOTE: str.rstrip('.0') strips any trailing run of the characters '.' and '0',
# which truncates zips ending in zero ('98110' -> '9811'); an anchored regex
# removes only the literal '.0' suffix.
zonda_df['Zip'] = zonda_df['Zip'].astype('str').str.replace(r'\.0$', '', regex=True)
zonda_home_count = zonda_df.shape[0]
print(zonda_df.shape)
zonda_df.head()

(243069, 17)


Unnamed: 0,Brand,Plan Name,City,State,Zip,Latitude,Longitude,Base Sq Ft,Bedrooms,Baths,Garage,Stories,Lot Size,Sales Rate,Base Price,Predicted Price,% Difference
0,1034 NE 72nd Street LLC,Plan 1225,Seattle,WA,98115,47.681056,-122.315907,1225.0,2.0,2.0,0.0,3.0,1016.0,1.7,749900.0,782103.0,4.204023
1,1034 NE 72nd Street LLC,Plan 1643,Seattle,WA,98115,47.681056,-122.315907,1643.0,3.0,2.5,0.0,3.0,1016.0,1.7,989900.0,983704.0,-0.627912
2,1034 NE 72nd Street LLC,Plan 1737,Seattle,WA,98115,47.681056,-122.315907,1737.0,3.0,2.5,0.0,3.0,1016.0,1.7,979900.0,1028449.0,4.834754
3,13th Floor Homes,Amelia,Tamarac,FL,33319,26.192302,-80.211595,1580.0,3.0,2.5,1.0,2.0,3000.0,4.3,332000.0,350371.0,5.384523
4,13th Floor Homes,Amelia,West Palm Beach,FL,33404,26.781443,-80.082896,1558.0,3.0,2.5,1.0,2.0,2900.0,3.7,312990.0,307142.0,-1.885908


In [15]:
def merge_columns(row, column_name):
    """Coalesce the `_x` / `_y` variants of a merged column into one value.

    Parameters
    ----------
    row : mapping or pandas.Series
        A row containing the keys ``f'{column_name}_x'`` and
        ``f'{column_name}_y'``.
    column_name : str
        Base column name, without the merge suffix.

    Returns
    -------
    The `_x` value when it is present, otherwise the `_y` value, otherwise
    ``None``. A value counts as missing when it is NaN/None or an empty
    string. A numeric ``0`` is a real value and is preserved (e.g. a home
    with 0 garage spaces), rather than being mistaken for "missing".
    """
    for suffix in ('_x', '_y'):
        value = row[f'{column_name}{suffix}']
        # Explicit missing-value test: plain truthiness would discard 0 / 0.0.
        if not pd.isna(value) and value != '':
            return value
    return None
def get_price(row):
    """Combine the BHI and Zonda prices of one merged row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must contain 'Base Price BHI', 'Base Price Zonda',
        'Predicted Price BHI' and 'Predicted Price Zonda'.

    Returns
    -------
    pandas.Series
        Indexed by 'Predicted Price' and '% Difference'. When both base
        prices are usable, base and predicted prices are averaged across the
        two sources; when only one is usable, that source is used as-is.
        '% Difference' is the difference between predicted and base price
        relative to their mean, in percent. When neither source has a usable
        base price both values are NaN (previously this case raised
        UnboundLocalError and aborted the whole ``DataFrame.apply``).
    """
    def has_price(value):
        # A base price of 0 or NaN/None is treated as "not available".
        return not pd.isna(value) and value != 0

    result_index = ['Predicted Price', '% Difference']
    has_bhi = has_price(row['Base Price BHI'])
    has_zonda = has_price(row['Base Price Zonda'])

    if has_bhi and has_zonda:
        base_price = (row['Base Price BHI'] + row['Base Price Zonda']) / 2
        predicted_price = (row['Predicted Price BHI'] + row['Predicted Price Zonda']) / 2
    elif has_bhi:
        base_price = row['Base Price BHI']
        predicted_price = row['Predicted Price BHI']
    elif has_zonda:
        base_price = row['Base Price Zonda']
        predicted_price = row['Predicted Price Zonda']
    else:
        # No usable base price from either source: nothing to compare.
        return pd.Series([float('nan'), float('nan')], index=result_index)

    # Symmetric percent difference: relative to the mean of the two prices.
    percent_difference = (predicted_price - base_price) / ((predicted_price + base_price) / 2) * 100
    return pd.Series([predicted_price, percent_difference], index=result_index)
# Outer-join the two result sets on builder, plan and zip. Zonda is the left
# frame, so its overlapping columns get the `_x` suffix and BHI's get `_y`.
df = zonda_df.merge(bhi_df, how='outer', on=['Brand', 'Plan Name', 'Zip'])

# Collapse each shared attribute's `_x` / `_y` pair into a single column.
for shared_column in ['City', 'State', 'Base Sq Ft', 'Bedrooms', 'Baths', 'Garage', 'Stories']:
    df[shared_column] = df.apply(merge_columns, axis=1, args=(shared_column,))

# Give the per-source price columns explicit names before combining them.
price_renames = {
    'Base Price_x': 'Base Price Zonda',
    'Base Price_y': 'Base Price BHI',
    'Predicted Price_x': 'Predicted Price Zonda',
    'Predicted Price_y': 'Predicted Price BHI',
}
df = df.rename(columns=price_renames)
df[['Predicted Price', '% Difference']] = df.apply(get_price, axis=1)

# Final column layout of the combined output.
output_columns = [
    'Brand', 'Plan Name', 'City', 'State', 'Zip',
    'Latitude', 'Longitude',
    'Base Sq Ft', 'Bedrooms', 'Baths', 'Garage', 'Stories',
    'Base Price BHI', 'Base Price Zonda',
    'Predicted Price', '% Difference',
]
df = (
    df.filter(regex='^(?!.*_y$)')  # filter out columns ending in '_y'
    .drop(columns=['Predicted Price BHI', 'Predicted Price Zonda'])
    .reindex(columns=output_columns)
    .drop_duplicates()
)
df.to_csv('../outputs/home_prices.csv', index=False)

# Preview a random sample of rows that have a BHI base price.
matched_df = df[df['Base Price BHI'].notnull()]
matched_df.sample(10)

Unnamed: 0,Brand,Plan Name,City,State,Zip,Latitude,Longitude,Base Sq Ft,Bedrooms,Baths,Garage,Stories,Base Price BHI,Base Price Zonda,Predicted Price,% Difference
253006,Pulte Homes,Sonoma Cove,Canton,GA,30115,,,2536.0,2.0,2.0,2.0,1.0,496990.0,,529592.0,6.351563
126988,LGI Homes,Trinity,Cleveland,TX,77328,30.286517,-95.167657,1414.0,3.0,2.0,2.0,1.0,239900.0,239900.0,232034.5,-3.333302
59193,David Weekley Homes,Ellerby,Brookshire,TX,77423,29.755445,-95.899422,2515.0,4.0,3.0,2.0,2.0,381990.0,371990.0,360115.0,-4.578724
177697,Perry Homes,2420W,Katy,TX,77493,29.819221,-95.808294,2420.0,4.0,3.0,3.0,1.0,477900.0,390400.0,490875.0,12.264533
249021,Kay Builders,Sheridan II,Emmaus,PA,18049,,,2234.0,3.0,3.0,2.0,2.0,491600.0,,486291.0,-1.085806
64082,DiVosta Homes,Hallmark,Englewood,FL,34223,27.012657,-82.383558,1655.0,2.0,2.0,2.0,1.0,445990.0,438990.0,506437.0,13.477749
250488,Lennar,Residence Two,Moreno Valley,CA,92551,,,1567.0,3.0,2.0,2.0,2.0,405990.0,,428212.0,5.327726
161720,Meritage Homes,Paradise,Tucson,AZ,85756,32.120727,-110.857615,2575.0,4.0,2.5,2.0,2.0,398990.0,398990.0,417687.5,4.578919
247988,ICI Homes,Avery II,Ormond Beach,FL,32164,,,1993.0,3.0,2.0,2.0,1.0,529900.0,,512519.0,-3.334744
12350,Beazer Homes,Garner,Mount Juliet,TN,37122,36.224819,-86.539227,2567.0,4.0,2.5,2.0,2.0,514990.0,387990.0,455022.0,0.779251
