# Zonda Data Pre-Processing

Zonda data files are very wide, with most columns unused, so this notebook removes unneeded rows and columns and writes the results to the `inputs` folder for other notebooks to process.

In [1]:
import pandas as pd

# Folder that holds the raw Zonda export; the intermediate "with ID" CSV is written here as well.
# If changing this location and using vscode dev containers, also change devcontainer.json mount point
file_path = '/Users/Shared/Downloads'

In [2]:
# Read the raw Zonda export, then save a copy next to it in which the row
# index is written out as a leading 'ID' column.
raw_csv = f'{file_path}/zonda_project_plan.csv'
with_id_csv = f'{file_path}/zonda_project_plan_with_id.csv'

df = pd.read_csv(raw_csv, encoding='iso-8859-1')
df.to_csv(with_id_csv, index=True, index_label='ID')

In [3]:
# The intermediate file was produced by DataFrame.to_csv, which encodes as UTF-8 by default,
# so it has to be decoded as UTF-8. Decoding it as iso-8859-1 garbles non-ASCII names.
input_df = pd.read_csv(f'{file_path}/zonda_project_plan_with_id.csv', encoding='utf-8')

# Prep input data for use for later reporting: Zonda column name -> reporting name.
# 'ID' and 'City' already carry their reporting names.
column_renames = {
    'Builder_name': 'Brand',
    'garage': 'Garage',
    'Lat': 'Latitude',
    'Long': 'Longitude',
    'num_of_beds': 'Bedrooms',
    'num_of_baths': 'Baths',
    'num_of_floors': 'Stories',
    'Plan_Name': 'Plan Name',
    'price': 'Base Price',
    'Sales_Rate': 'Sales Rate',
    'square_footage': 'Base Sq Ft',
    'state': 'State',
    'TypicalLotSize': 'Lot Size',
    'Zip_Code': 'Zip',
    'Project_Name': 'Subdivision',  # may use later; not part of the output yet
}
group_keys = ['Brand', 'Plan Name', 'City', 'State', 'Zip']
mean_columns = ['Latitude', 'Longitude', 'Base Sq Ft', 'Bedrooms', 'Baths', 'Garage',
                'Stories', 'Lot Size', 'Sales Rate', 'Base Price']
output_columns = ['ID'] + group_keys + mean_columns

plans_renamed = input_df.rename(columns=column_renames)

# Fail loudly if the export layout changed. Silently continuing would create all-NaN
# columns, and the dropna() below would then discard every row.
missing_columns = [col for col in output_columns if col not in plans_renamed.columns]
if missing_columns:
    raise KeyError(f'Zonda export is missing expected columns: {missing_columns}')

# Selecting the wanted columns makes an explicit drop of every unused column unnecessary.
plans = plans_renamed[output_columns].copy()

# Zip is parsed as a float when the column has gaps, so its text form ends in '.0'.
# Remove exactly that suffix: str.rstrip('.0') strips any trailing run of '.' and '0'
# characters and would turn '98110.0' into '9811'.
# NOTE(review): leading zeros (e.g. '02134') are already lost when the column is parsed
# as a number - confirm whether downstream notebooks need them restored.
plans['Zip'] = plans['Zip'].astype('str').str.replace(r'\.0$', '', regex=True)
plans = plans.dropna()

# Collapse duplicate rows for the same builder/plan/location: keep the first ID
# and average the numeric attributes.
aggregations = {'ID': 'first'}
aggregations.update({col: 'mean' for col in mean_columns})
plans_grouped = plans.groupby(group_keys, as_index=False).agg(aggregations)

# Remove homes where data is outside the realms of realistic for purposes of this model
print(f'Shape before data cleanup = {plans_grouped.shape}')
realistic = (
    (plans_grouped['Zip'] != 'nan')  # missing zips became the text 'nan' in the astype('str') step
    & (plans_grouped['Base Price'] >= 50000)
    & (plans_grouped['Base Price'] <= 3000000)
    & (plans_grouped['Base Sq Ft'] >= 1000)
    & (plans_grouped['Base Sq Ft'] <= 10000)
    & (plans_grouped['Bedrooms'] > 0)
    & (plans_grouped['Bedrooms'] <= 10)
    & (plans_grouped['Baths'] > 0)
    & (plans_grouped['Baths'] <= 10)
    & (plans_grouped['Garage'] >= 0)
    & (plans_grouped['Garage'] <= 6)
    & (plans_grouped['Stories'] > 0)
    & (plans_grouped['Stories'] <= 3)
    & (plans_grouped['Lot Size'] > 1000)
    & (plans_grouped['Lot Size'] <= 100000)
    & (plans_grouped['Sales Rate'] > 0)
    & (plans_grouped['Sales Rate'] <= 50)
)
plans_realistic = plans_grouped[realistic]

# Zonda data is manually entered and prone to human entry error, so we may need to remove some rows that are way outside the norm.
# IDs are row positions in the raw export (written as the 'ID' column of the intermediate file),
# so these lists are only meaningful for the export they were compiled against.
price_too_low_ids = [404582, 330948, 330886, 34828, 34829, 161423, 58042, 56368, 56369, 56367,
                     375455, 62058, 42804, 375989, 237548, 153298, 58043, 79965, 139296, 58040]  # price way too low for the combined data
price_too_high_ids = [13599, 177191, 35980, 47165, 1317, 215520, 250748, 307978, 142227, 310893,
                      312402, 368747, 318132, 318133, 349206, 380374, 219227, 318134]  # price way too high for the combined data
plans_clean = plans_realistic[~plans_realistic['ID'].isin(price_too_low_ids + price_too_high_ids)]
print(f'Shape after data cleanup = {plans_clean.shape}')

df = plans_clean.reset_index(drop=True)
df.to_csv('../inputs/home_sales_zonda.csv', index=False)
df.head()

Shape before data cleanup = (253758, 16)
Shape after data cleanup = (243069, 16)


Unnamed: 0,Brand,Plan Name,City,State,Zip,ID,Latitude,Longitude,Base Sq Ft,Bedrooms,Baths,Garage,Stories,Lot Size,Sales Rate,Base Price
0,1034 NE 72nd Street LLC,Plan 1225,Seattle,WA,98115,300069,47.681056,-122.315907,1225.0,2.0,2.0,0.0,3.0,1016.0,1.7,749900.0
1,1034 NE 72nd Street LLC,Plan 1643,Seattle,WA,98115,300067,47.681056,-122.315907,1643.0,3.0,2.5,0.0,3.0,1016.0,1.7,989900.0
2,1034 NE 72nd Street LLC,Plan 1737,Seattle,WA,98115,300068,47.681056,-122.315907,1737.0,3.0,2.5,0.0,3.0,1016.0,1.7,979900.0
3,13th Floor Homes,Amelia,Tamarac,FL,33319,131033,26.192302,-80.211595,1580.0,3.0,2.5,1.0,2.0,3000.0,4.3,332000.0
4,13th Floor Homes,Amelia,West Palm Beach,FL,33404,19032,26.781443,-80.082896,1558.0,3.0,2.5,1.0,2.0,2900.0,3.7,312990.0
