# Zonda Data Pre-Processing

Zonda data files are very large, with most columns unused, so this notebook removes unneeded rows and columns and writes the results to the inputs folder for other notebooks to process.

In [4]:
import pandas as pd

# Folder that holds the raw Zonda CSV export read by this notebook.
# If changing this location and using vscode dev containers, also change
# the devcontainer.json mount point.
file_path = '/Users/Shared/Downloads'

In [5]:
# Load the raw Zonda project/plan export, reduce it to the columns used for
# reporting, average duplicate plans, drop unrealistic rows and write the result
# to ../inputs/home_sales_zonda.csv.
df = pd.read_csv(f'{file_path}/zonda_project_plan.csv', encoding='iso-8859-1')

# Prep input data for use for later reporting: map Zonda column names to report names.
df = df.rename(columns={
    'City': 'City',
    'Builder_name': 'Brand',
    'garage': 'Garage',
    'Lat': 'Latitude',
    'Long': 'Longitude',
    'num_of_beds': 'Bedrooms',
    'num_of_baths': 'Baths',
    'num_of_floors': 'Stories',
    'Plan_Name': 'PlanName',
    'price': 'BasePrice',
    'Project_Name': 'SubDivision',
    'Sales_Rate': 'SalesRate',
    'square_footage': 'BaseSqFt',
    'state': 'State',
    'TypicalLotSize': 'LotSize',
    'Zip_Code': 'Zip'
})

# Keep only the reporting columns, in a fixed order. reindex() discards every
# column that is not listed, so no separate drop() of the unused Zonda columns
# is needed (the previous drop() result was assigned to an unused name).
df = df.reindex(columns=['Brand', 'PlanName', 'City', 'State', 'Zip', 'Latitude', 'Longitude', 'BaseSqFt',
                         'Bedrooms', 'Baths', 'Garage', 'Stories', 'LotSize', 'SalesRate', 'BasePrice'])

# Convert Zip to text and remove only a literal trailing '.0' (presumably left
# when the column is parsed as float - confirm against the raw file).
# str.rstrip('.0') must not be used here: it strips any trailing run of '.' and
# '0' characters, so '90210.0' would become '9021'.
# Because the conversion happens before dropna(), a missing Zip becomes the
# string 'nan' and is removed by the explicit filter further down.
# NOTE(review): zips with leading zeros may already have lost them if the column
# was parsed as numeric; confirm whether downstream notebooks need zero-padding.
df['Zip'] = df['Zip'].astype('str').str.replace(r'\.0$', '', regex=True)

# Drop rows with any missing value in the remaining columns.
df = df.dropna()

# Collapse duplicate plan rows (same brand/plan/city/state/zip) by averaging
# their numeric columns.
df = df.groupby(['Brand', 'PlanName', 'City', 'State', 'Zip'], as_index=False).mean(numeric_only=True)

# Remove homes where data is outside the realms of realistic for purposes of this model
print(f'Shape before data cleanup = {df.shape}')
df = df[
    (df['Zip'] != 'nan')
    & (df['BasePrice'] >= 50000)
    & (df['BasePrice'] <= 3000000)
    & (df['BaseSqFt'] >= 1000)
    & (df['BaseSqFt'] <= 10000)
    & (df['Bedrooms'] > 0)
    & (df['Bedrooms'] <= 10)
    & (df['Baths'] > 0)
    & (df['Baths'] <= 10)
    & (df['Garage'] >= 0)
    & (df['Garage'] <= 6)
    & (df['Stories'] > 0)
    & (df['Stories'] <= 3)
    & (df['LotSize'] > 1000)
    & (df['LotSize'] <= 100000)
    & (df['SalesRate'] > 0)
    & (df['SalesRate'] <= 50)]
print(f'Shape after data cleanup = {df.shape}')

# input_df is the cleaned frame with a fresh 0..n-1 index; the CSV is written
# without the index, so it has the same rows and columns.
input_df = df.reset_index(drop=True)
df.to_csv('../inputs/home_sales_zonda.csv', index=False)
input_df.head()

(253758, 15)
(243107, 15)


Unnamed: 0,Brand,PlanName,City,State,Zip,Latitude,Longitude,BaseSqFt,Bedrooms,Baths,Garage,Stories,LotSize,SalesRate,BasePrice
0,1034 NE 72nd Street LLC,Plan 1225,Seattle,WA,98115,47.681056,-122.315907,1225.0,2.0,2.0,0.0,3.0,1016.0,1.7,749900.0
1,1034 NE 72nd Street LLC,Plan 1643,Seattle,WA,98115,47.681056,-122.315907,1643.0,3.0,2.5,0.0,3.0,1016.0,1.7,989900.0
2,1034 NE 72nd Street LLC,Plan 1737,Seattle,WA,98115,47.681056,-122.315907,1737.0,3.0,2.5,0.0,3.0,1016.0,1.7,979900.0
3,13th Floor Homes,Amelia,Tamarac,FL,33319,26.192302,-80.211595,1580.0,3.0,2.5,1.0,2.0,3000.0,4.3,332000.0
4,13th Floor Homes,Amelia,West Palm Beach,FL,33404,26.781443,-80.082896,1558.0,3.0,2.5,1.0,2.0,2900.0,3.7,312990.0
