# DC Real Estate Predictor

## Predict the recommended list price for a building given the number of rooms, bedrooms, bathrooms, and building type.

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Import DC residential data ONLY for now.
# `pandas.io.json.json_normalize` is deprecated and fails to import on recent
# pandas; the top-level export is the same function and keeps the name in scope.
from pandas import json_normalize

dc_res_url = 'https://opendata.arcgis.com/datasets/c5fb3fbe4c694a59a6eef7bf5f8bc49a_25.geojson'

# Fail loudly on an HTTP error instead of trying to parse an error page as JSON,
# and bound how long a stalled connection may block the notebook.
response = requests.get(dc_res_url, timeout=120)
response.raise_for_status()

# NOTE(review): the name `json` is kept in case later cells rely on it; be aware
# that it would shadow the standard-library `json` module if that were imported.
json = response.json()

# The 'features' entry holds the individual records used in the rest of the notebook.
results = json['features']


In [3]:
# Karl Lorey - create temporary dataframe and keep only required columns/ rename them to keep clean.
class DataFrameFromDict:
    """
    Context manager that flattens nested records into a temporary data frame.

    On entry it yields a frame built with ``pd.json_normalize``. On exit, every
    column that came from that initial flattening is dropped in place, so only
    the columns added inside the ``with`` body remain.

    Parameters
    ----------
    data : dict or list of dicts
        Records passed straight to ``pd.json_normalize``.

    Attributes
    ----------
    df : the flattened frame handed to the ``with`` body.
    columns : list of the column labels present right after flattening.
    """

    def __init__(self, data):
        self.df = pd.json_normalize(data)
        # Snapshot of the imported column labels; these are removed on exit.
        self.columns = list(self.df.columns)

    def __enter__(self):
        """Return the flattened frame for use inside the ``with`` body."""
        return self.df

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Drop the originally imported columns from the frame, in place.

        Returns ``None``, so an exception raised in the ``with`` body propagates.
        """
        # In place, so the name bound by ``as`` keeps pointing at the trimmed frame.
        # errors="ignore": if the body already removed one of the imported columns,
        # a KeyError here would mask whatever went wrong (or nothing at all).
        self.df.drop(columns=self.columns, inplace=True, errors="ignore")
        
        
# Raw GeoJSON property -> tidy column name, listed in the order the tidy
# columns are created.
property_to_column = {
    'properties.BATHRM': 'bathrooms',
    'properties.HEAT_D': 'heat_d',
    'properties.HEAT': 'heat_code',
    'properties.AC': 'ac',
    'properties.ROOMS': 'total_rooms',
    'properties.BEDRM': 'bedrooms',
    'properties.EYB': 'most_recent_build',
    'properties.YR_RMDL': 'remodel_yr',
    'properties.STORIES': 'stories',
    'properties.SALEDATE': 'sale_date',
    'properties.PRICE': 'price',
    'properties.STRUCT_D': 'structure_d',
    'properties.STRUCT': 'structure',
    'properties.KITCHENS': 'kitchens',
    'properties.FIREPLACES': 'fireplaces',
    'properties.LANDAREA': 'land_area',
}

with DataFrameFromDict(results) as residential:
    # Copy each needed property under its tidy name; the raw columns are
    # dropped by the context manager when the block exits.
    for raw_name, tidy_name in property_to_column.items():
        residential[tidy_name] = residential[raw_name]

residential.head()

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
0,2.0,Warm Cool,7.0,Y,13.0,6.0,1962,1990.0,3.0,1900/01/01 00:00:00+00,,Row Inside,7.0,2.0,0.0,1800
1,4.0,Warm Cool,7.0,Y,13.0,6.0,1962,1975.0,3.0,2016/01/19 00:00:00+00,1000000.0,Row Inside,7.0,2.0,2.0,1800
2,3.0,Warm Cool,7.0,Y,10.0,4.0,1971,2010.0,3.0,2001/10/11 00:00:00+00,701800.0,Row Inside,7.0,2.0,3.0,1800
3,2.0,Warm Cool,7.0,Y,8.0,3.0,1988,2015.0,3.0,2018/04/26 00:00:00+00,0.0,Row Inside,7.0,2.0,3.0,1800
4,2.0,Hot Water Rad,13.0,N,8.0,4.0,1965,1985.0,3.0,2000/06/06 00:00:00+00,442500.0,Row Inside,7.0,1.0,2.0,1800


In [4]:
# import plotly.express as px

# fig = px.histogram(residential.price)

# fig.show()

In [5]:
# Count the missing values in each column.
# Left as the cell's last expression (no print) so the notebook displays it as
# the cell output, the same way the later null check does.
residential.isnull().sum()

bathrooms              924
heat_d                 924
heat_code              924
ac                     924
total_rooms            981
bedrooms               935
most_recent_build        0
remodel_yr           56119
stories                973
sale_date                0
price                16778
structure_d            924
structure              924
kitchens               925
fireplaces             927
land_area                0
dtype: int64


In [6]:
# Look at the rows with no AC value to see how the missing data lines up across columns
ac_missing = residential['ac'].isna()
residential[ac_missing]

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
96,,,,,,,0,,,2018/11/29 00:00:00+00,605900.0,,,,,13
636,,,,,,,0,,,2019/10/08 00:00:00+00,1756000.0,,,,,41
637,,,,,,,0,,,2019/07/05 00:00:00+00,1230000.0,,,,,41
638,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,590
639,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,1077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102852,,,,,,,0,,,2020/03/11 00:00:00+00,293000.0,,,,,206
102853,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,205
102854,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,205
102855,,,,,,,0,,,2020/03/11 00:00:00+00,315000.0,,,,,205


### Since nulls are persistent across rows, dropping rows with multiple nulls based on the column with the lowest number of missing values

## Data Clean Up

### All Data

In [7]:
# Drop the rows with nulls across the dataset first, since only about 1000 records are all missing the same fields.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not write to a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
residential = residential[residential['ac'].notnull()].copy()
residential

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
0,2.0,Warm Cool,7.0,Y,13.0,6.0,1962,1990.0,3.0,1900/01/01 00:00:00+00,,Row Inside,7.0,2.0,0.0,1800
1,4.0,Warm Cool,7.0,Y,13.0,6.0,1962,1975.0,3.0,2016/01/19 00:00:00+00,1000000.0,Row Inside,7.0,2.0,2.0,1800
2,3.0,Warm Cool,7.0,Y,10.0,4.0,1971,2010.0,3.0,2001/10/11 00:00:00+00,701800.0,Row Inside,7.0,2.0,3.0,1800
3,2.0,Warm Cool,7.0,Y,8.0,3.0,1988,2015.0,3.0,2018/04/26 00:00:00+00,0.0,Row Inside,7.0,2.0,3.0,1800
4,2.0,Hot Water Rad,13.0,N,8.0,4.0,1965,1985.0,3.0,2000/06/06 00:00:00+00,442500.0,Row Inside,7.0,1.0,2.0,1800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108032,2.0,Water Base Brd,6.0,N,6.0,3.0,1948,,2.0,1900/01/01 00:00:00+00,,Semi-Detached,8.0,1.0,0.0,2500
108033,2.0,Hot Water Rad,13.0,N,6.0,3.0,1958,2004.0,2.0,2019/10/08 00:00:00+00,238000.0,Semi-Detached,8.0,1.0,0.0,2500
108034,2.0,Hot Water Rad,13.0,N,6.0,3.0,1948,,2.0,2006/10/02 00:00:00+00,0.0,Multi,2.0,2.0,0.0,2500
108035,1.0,Forced Air,1.0,N,6.0,3.0,1955,,2.0,1900/01/01 00:00:00+00,,Semi-Detached,8.0,1.0,0.0,2500


In [8]:
# Missing values per column
residential.isna().sum()

bathrooms                0
heat_d                   0
heat_code                0
ac                       0
total_rooms             57
bedrooms                11
most_recent_build        0
remodel_yr           55196
stories                 49
sale_date                0
price                16775
structure_d              0
structure                0
kitchens                 1
fireplaces               3
land_area                0
dtype: int64

### Remodel Year

In [9]:
# Replace remodel nulls with zeros for now and convert to int.
# Done as a single assignment on the frame: calling fillna(inplace=True) on
# `residential.remodel_yr` is a chained in-place update that may only modify a
# temporary Series, leaving NaNs behind that make astype(int) fail.
residential = residential.assign(
    remodel_yr=residential['remodel_yr'].fillna(0).round(0).astype(int)
)

residential.remodel_yr

0         1990
1         1975
2         2010
3         2015
4         1985
          ... 
108032       0
108033    2004
108034       0
108035       0
108036       0
Name: remodel_yr, Length: 107113, dtype: int64

## TODO: Need to confirm that replacing a missing remodel year with zero is okay for model training. Should the model be given an ignore parameter for zeros in this column?

### Total Rooms, Bedrooms, Stories, Price, Kitchens, Fireplaces

In [10]:
# Data correlations to see if MICE will perform well given features.
# Restrict to the numeric columns explicitly: recent pandas no longer drops the
# text columns silently and raises when asked to correlate them.
residential.select_dtypes(include='number').corr()

# Graph of correlations

# import seaborn as sns

# sns.pairplot(residential)

Unnamed: 0,bathrooms,heat_code,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,price,structure,kitchens,fireplaces,land_area
bathrooms,1.0,-0.129434,0.636576,0.627672,0.391717,0.252936,0.10147,0.371961,-0.277473,0.416459,0.386219,0.218819
heat_code,-0.129434,1.0,0.049436,0.017733,-0.305942,-0.15741,-0.003362,-0.074649,-0.02886,0.06249,0.014774,-0.007968
total_rooms,0.636576,0.049436,1.0,0.684704,0.157805,0.100655,0.090859,0.209955,-0.273066,0.510503,0.287956,0.226494
bedrooms,0.627672,0.017733,0.684704,1.0,0.190396,0.139418,0.093433,0.2378,-0.236911,0.350918,0.294132,0.206795
most_recent_build,0.391717,-0.305942,0.157805,0.190396,1.0,0.093938,0.087387,0.253866,-0.055723,-0.042643,0.198931,0.078442
remodel_yr,0.252936,-0.15741,0.100655,0.139418,0.093938,1.0,0.028041,0.204601,-0.011321,0.07564,0.170305,0.007554
stories,0.10147,-0.003362,0.090859,0.093433,0.087387,0.028041,1.0,0.072935,0.068854,0.050669,0.079227,-0.0179
price,0.371961,-0.074649,0.209955,0.2378,0.253866,0.204601,0.072935,1.0,-0.085043,0.04501,0.311801,0.121326
structure,-0.277473,-0.02886,-0.273066,-0.236911,-0.055723,-0.011321,0.068854,-0.085043,1.0,-0.074058,-0.292898,-0.314822
kitchens,0.416459,0.06249,0.510503,0.350918,-0.042643,0.07564,0.050669,0.04501,-0.074058,1.0,-0.00295,-0.021149


### When comparing price to the other features, we can see fairly weak correlations between all pairs. Hopefully clustering or regression will lead to combinations that are decent predictors of residential prices in DC. It may be worth replicating a similar methodology to the commercial properties in DC to see if they are easier or harder to predict.

### Creating table with mapping for categorical codes

In [11]:
# Summarise each column's dtype and non-null count before building the code mappings
residential.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107113 entries, 0 to 108036
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   bathrooms          107113 non-null  float64
 1   heat_d             107113 non-null  object 
 2   heat_code          107113 non-null  float64
 3   ac                 107113 non-null  object 
 4   total_rooms        107056 non-null  float64
 5   bedrooms           107102 non-null  float64
 6   most_recent_build  107113 non-null  int64  
 7   remodel_yr         107113 non-null  int64  
 8   stories            107064 non-null  float64
 9   sale_date          107113 non-null  object 
 10  price              90338 non-null   float64
 11  structure_d        107113 non-null  object 
 12  structure          107113 non-null  float64
 13  kitchens           107112 non-null  float64
 14  fireplaces         107110 non-null  float64
 15  land_area          107113 non-null  int64  
dtypes:

In [17]:
# Groupby dataframe of categorical and categorical codes for mapping

# Heat codes: mean heat_code for each heat_d description (this equals the code
# itself if every description maps to a single code -- TODO confirm).
# The numeric column is selected before aggregating; averaging the whole frame
# also tries to average the text columns, which recent pandas rejects.
heat_codes = residential.groupby('heat_d')['heat_code'].mean()

# Add the derived columns through assign so a new frame is produced instead of
# writing into what may be a slice of an earlier frame.
residential = residential.assign(
    # AC codes, easy enough: 1 where ac is 'Y', 0 otherwise
    ac_code=np.where(residential['ac'] == 'Y', 1, 0),
    # Structure codes, using the pandas categorical dtype
    structure_d=residential['structure_d'].astype('category'),
    # Evaluated after structure_d above has been converted to categorical
    structure_code=lambda frame: frame['structure_d'].cat.codes,
)

residential

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area,ac_code,structure_code
0,2.0,Warm Cool,7.0,Y,13.0,6.0,1962,1990,3.0,1900/01/01 00:00:00+00,,Row Inside,7.0,2.0,0.0,1800,1,4
1,4.0,Warm Cool,7.0,Y,13.0,6.0,1962,1975,3.0,2016/01/19 00:00:00+00,1000000.0,Row Inside,7.0,2.0,2.0,1800,1,4
2,3.0,Warm Cool,7.0,Y,10.0,4.0,1971,2010,3.0,2001/10/11 00:00:00+00,701800.0,Row Inside,7.0,2.0,3.0,1800,1,4
3,2.0,Warm Cool,7.0,Y,8.0,3.0,1988,2015,3.0,2018/04/26 00:00:00+00,0.0,Row Inside,7.0,2.0,3.0,1800,1,4
4,2.0,Hot Water Rad,13.0,N,8.0,4.0,1965,1985,3.0,2000/06/06 00:00:00+00,442500.0,Row Inside,7.0,1.0,2.0,1800,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108032,2.0,Water Base Brd,6.0,N,6.0,3.0,1948,0,2.0,1900/01/01 00:00:00+00,,Semi-Detached,8.0,1.0,0.0,2500,0,5
108033,2.0,Hot Water Rad,13.0,N,6.0,3.0,1958,2004,2.0,2019/10/08 00:00:00+00,238000.0,Semi-Detached,8.0,1.0,0.0,2500,0,5
108034,2.0,Hot Water Rad,13.0,N,6.0,3.0,1948,0,2.0,2006/10/02 00:00:00+00,0.0,Multi,2.0,2.0,0.0,2500,0,1
108035,1.0,Forced Air,1.0,N,6.0,3.0,1955,0,2.0,1900/01/01 00:00:00+00,,Semi-Detached,8.0,1.0,0.0,2500,0,5


### Multiple Imputation

In [None]:
# Going to use MICE to get better estimates on price given other features

# Drop categorical

