# DC Real Estate Predictor

## Predict the recommended list price for a building given the number of rooms, bedrooms, bathrooms, and building type.

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Import DC Residential ONLY for now
# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0; the
# top-level export is the supported path and keeps the same name in scope.
from pandas import json_normalize

dc_res_url = 'https://opendata.arcgis.com/datasets/c5fb3fbe4c694a59a6eef7bf5f8bc49a_25.geojson'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decoding or KeyError failure further down.
response = requests.get(dc_res_url, timeout=120)
response.raise_for_status()

json = response.json()

# The GeoJSON 'features' list holds the individual records.
results = json['features']


In [3]:
# Karl Lorey - create temporary dataframe and keep only required columns/ rename them to keep clean.
class DataFrameFromDict:
    """
    Context manager that exposes a flattened frame, then discards its source columns.

    The nested records are flattened with ``pd.json_normalize``. Inside the
    ``with`` body, copy whichever source columns are needed to new names; on
    exit every column that existed at construction time is dropped in place,
    so only the columns added inside the body remain.
    """

    def __init__(self, data):
        self.df = pd.json_normalize(data)
        # Snapshot of the flattened column names, taken before the caller adds any.
        self.columns = self.df.columns.tolist()

    def __enter__(self):
        return self.df

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised in the body still propagates.
        self.df.drop(columns=self.columns, inplace=True)
        
        
# New, readable column name -> flattened GeoJSON property it is copied from.
# Insertion order here is the column order of the resulting frame.
column_sources = {
    'bathrooms': 'properties.BATHRM',
    'heat_d': 'properties.HEAT_D',
    'heat_code': 'properties.HEAT',
    'ac': 'properties.AC',
    'total_rooms': 'properties.ROOMS',
    'bedrooms': 'properties.BEDRM',
    'most_recent_build': 'properties.EYB',
    'remodel_yr': 'properties.YR_RMDL',
    'stories': 'properties.STORIES',
    'sale_date': 'properties.SALEDATE',
    'price': 'properties.PRICE',
    'structure_d': 'properties.STRUCT_D',
    'structure': 'properties.STRUCT',
    'kitchens': 'properties.KITCHENS',
    'fireplaces': 'properties.FIREPLACES',
    'land_area': 'properties.LANDAREA',
}

# Copy the wanted properties under their new names; the context manager drops
# every original flattened column when the block exits.
with DataFrameFromDict(results) as residential:
    for new_name, source_name in column_sources.items():
        residential[new_name] = residential[source_name]

residential.head()

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
0,1.0,Forced Air,1.0,Y,6.0,3.0,1956,,2.0,1996/10/10 00:00:00+00,147500.0,Single,1.0,1.0,0.0,2531
1,1.0,Hot Water Rad,13.0,N,6.0,3.0,1956,,2.0,2016/01/04 00:00:00+00,156000.0,Single,1.0,1.0,0.0,2376
2,3.0,Hot Water Rad,13.0,Y,7.0,4.0,1980,,2.0,2019/08/09 00:00:00+00,302000.0,Single,1.0,2.0,0.0,2376
3,4.0,Hot Water Rad,13.0,N,12.0,4.0,1956,,2.0,1997/01/08 00:00:00+00,105377.0,Multi,2.0,4.0,0.0,2376
4,4.0,Hot Water Rad,13.0,N,14.0,4.0,1966,2012.0,2.0,2011/10/26 00:00:00+00,170000.0,Multi,2.0,4.0,0.0,2376


In [4]:
# import plotly.express as px

# fig = px.histogram(residential.price)

# fig.show()

In [5]:
# Count the missing values in each column of the raw frame
null_counts = residential.isna().sum()

print(null_counts)

bathrooms             1038
heat_d                1038
heat_code             1038
ac                    1038
total_rooms           1095
bedrooms              1049
most_recent_build        0
remodel_yr           56212
stories               1087
sale_date                0
price                16759
structure_d           1038
structure             1038
kitchens              1039
fireplaces            1041
land_area                0
dtype: int64


In [6]:
# Look at the rows with no `ac` value to see which other fields are missing alongside it
missing_ac = residential['ac'].isna()

residential.loc[missing_ac]

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
3702,,,,,,,0,,,2020/05/27 00:00:00+00,7000.0,,,,,206
3703,,,,,,,0,,,2020/01/29 00:00:00+00,310000.0,,,,,206
3704,,,,,,,0,,,2020/03/11 00:00:00+00,293000.0,,,,,206
3705,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,205
3706,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104804,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,927
104805,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,928
104806,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,928
104807,,,,,,,0,,,1900/01/01 00:00:00+00,0.0,,,,,927


### Since the nulls are persistent across rows, dropping rows with multiple nulls based on the column with the fewest missing values

## Data Clean Up

### All Data

In [7]:
# Drop rows with nulls across the dataset first, since only about 1000 records are
# all missing the same set of fields; a missing `ac` value is used to identify them.
# .copy() makes the result an independent frame, so later in-place edits and column
# assignments on `residential` do not act on a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
residential = residential[residential['ac'].notnull()].copy()
residential

Unnamed: 0,bathrooms,heat_d,heat_code,ac,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,sale_date,price,structure_d,structure,kitchens,fireplaces,land_area
0,1.0,Forced Air,1.0,Y,6.0,3.0,1956,,2.00,1996/10/10 00:00:00+00,147500.0,Single,1.0,1.0,0.0,2531
1,1.0,Hot Water Rad,13.0,N,6.0,3.0,1956,,2.00,2016/01/04 00:00:00+00,156000.0,Single,1.0,1.0,0.0,2376
2,3.0,Hot Water Rad,13.0,Y,7.0,4.0,1980,,2.00,2019/08/09 00:00:00+00,302000.0,Single,1.0,2.0,0.0,2376
3,4.0,Hot Water Rad,13.0,N,12.0,4.0,1956,,2.00,1997/01/08 00:00:00+00,105377.0,Multi,2.0,4.0,0.0,2376
4,4.0,Hot Water Rad,13.0,N,14.0,4.0,1966,2012.0,2.00,2011/10/26 00:00:00+00,170000.0,Multi,2.0,4.0,0.0,2376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108129,1.0,Warm Cool,7.0,Y,7.0,3.0,1959,,2.00,2019/10/25 00:00:00+00,275000.0,Semi-Detached,8.0,1.0,0.0,2500
108130,1.0,Hot Water Rad,13.0,N,7.0,3.0,1959,,1.75,1900/01/01 00:00:00+00,,Single,1.0,1.0,0.0,2500
108131,2.0,Forced Air,1.0,Y,6.0,3.0,1969,2014.0,2.00,2014/12/19 00:00:00+00,350000.0,Single,1.0,1.0,0.0,2340
108132,2.0,Forced Air,1.0,Y,8.0,3.0,1969,2020.0,1.25,2019/05/21 00:00:00+00,230000.0,Single,1.0,1.0,0.0,2214


In [8]:
# Re-count nulls per column after removing the rows with no `ac` value
residential.isnull().sum()

bathrooms                0
heat_d                   0
heat_code                0
ac                       0
total_rooms             57
bedrooms                11
most_recent_build        0
remodel_yr           55175
stories                 49
sale_date                0
price                16756
structure_d              0
structure                0
kitchens                 1
fireplaces               3
land_area                0
dtype: int64

### Price

### Given the large number of $0 values for sale price, and that it is our target variable, I am going to drop rows with a 0 or null housing price. This should still leave plenty of data (roughly 60K rows) to train and test the model on, while also not using imputed prices as the target feature (an estimate of an estimate).

In [9]:
# Keep only rows with a usable sale price: non-null and non-zero.
# Filtering into a fresh copy replaces the earlier in-place drops: `inplace=True`
# is not a reliable way to save memory in pandas, and mutating a filtered frame
# in place is what raises SettingWithCopyWarning.
has_price = residential['price'].notna() & (residential['price'] != 0)

residential = residential.loc[has_price].copy()

### Remodel Year

In [10]:
# Replace remodel nulls with zeros for now and convert to int.
# The result is assigned back to the column explicitly: calling
# fillna(inplace=True) on `residential.remodel_yr` is a chained in-place
# update, which pandas warns about and which does not modify the frame
# when Copy-on-Write is enabled (the int cast would then fail on NaN).
residential['remodel_yr'] = residential['remodel_yr'].fillna(0).round(0).astype(int)

residential['remodel_yr']

0            0
1            0
2            0
3            0
4         2012
          ... 
108128    2005
108129       0
108131    2014
108132    2020
108133    1992
Name: remodel_yr, Length: 60037, dtype: int64

## TODO: Confirm that replacing a missing remodel year with zero is okay for model training. Should zeros in this column be flagged or ignored by the model?

### Total Rooms, Bedrooms, Stories, Price, Kitchens, Fireplaces

In [11]:
# Data correlations to see if MICE will perform well given features.
# Restrict to numeric columns explicitly: the frame still holds text columns
# (heat_d, ac, sale_date, structure_d), and DataFrame.corr() on pandas >= 2.0
# raises on non-numeric data instead of silently skipping it.
residential.select_dtypes(include='number').corr()

# Graph of correlations

# import seaborn as sns

# sns.pairplot(residential)

Unnamed: 0,bathrooms,heat_code,total_rooms,bedrooms,most_recent_build,remodel_yr,stories,price,structure,kitchens,fireplaces,land_area
bathrooms,1.0,-0.103445,0.624652,0.643309,0.367332,0.187293,0.076657,0.431192,-0.284983,0.390525,0.354741,0.320018
heat_code,-0.103445,1.0,0.064606,0.030813,-0.299173,-0.135016,-0.001189,-0.011876,-0.051254,0.087692,0.055198,0.014521
total_rooms,0.624652,0.064606,1.0,0.661641,0.135171,0.060513,0.065114,0.279876,-0.28844,0.509215,0.26073,0.292154
bedrooms,0.643309,0.030813,0.661641,1.0,0.167952,0.10971,0.063794,0.302982,-0.270313,0.356267,0.27611,0.308141
most_recent_build,0.367332,-0.299173,0.135171,0.167952,1.0,-0.038541,0.082306,0.266611,-0.034777,-0.097989,0.175455,0.099479
remodel_yr,0.187293,-0.135016,0.060513,0.10971,-0.038541,1.0,0.008515,0.161178,0.017047,0.05627,0.133059,-0.006015
stories,0.076657,-0.001189,0.065114,0.063794,0.082306,0.008515,1.0,0.078165,0.049978,0.032002,0.069381,-0.026493
price,0.431192,-0.011876,0.279876,0.302982,0.266611,0.161178,0.078165,1.0,-0.129706,0.044207,0.41641,0.318086
structure,-0.284983,-0.051254,-0.28844,-0.270313,-0.034777,0.017047,0.049978,-0.129706,1.0,-0.090508,-0.264731,-0.564157
kitchens,0.390525,0.087692,0.509215,0.356267,-0.097989,0.05627,0.032002,0.044207,-0.090508,1.0,-0.010594,-0.037811


### When comparing price to the other features, we can see fairly weak correlations between all pairs. Hopefully clustering or regression will lead to combinations that are decent predictors of residential prices in DC. It may be worth replicating a similar methodology to the commercial properties in DC to see if they are easier or harder to predict.

### Creating table with mapping for categorical codes

In [12]:
# Column dtypes and non-null counts for the cleaned frame
residential.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60037 entries, 0 to 108133
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathrooms          60037 non-null  float64
 1   heat_d             60037 non-null  object 
 2   heat_code          60037 non-null  float64
 3   ac                 60037 non-null  object 
 4   total_rooms        60002 non-null  float64
 5   bedrooms           60032 non-null  float64
 6   most_recent_build  60037 non-null  int64  
 7   remodel_yr         60037 non-null  int64  
 8   stories            60000 non-null  float64
 9   sale_date          60037 non-null  object 
 10  price              60037 non-null  float64
 11  structure_d        60037 non-null  object 
 12  structure          60037 non-null  float64
 13  kitchens           60036 non-null  float64
 14  fireplaces         60035 non-null  float64
 15  land_area          60037 non-null  int64  
dtypes: float64(9), int64(

In [13]:
# Lookup tables mapping each categorical description to its numeric code

# Heat codes
# Select the code column before aggregating: a frame-wide .mean() also tries to
# average the text columns, which raises on pandas >= 2.0, and computes means
# for every column only to discard all but one.
heat_codes = residential.groupby('heat_d')['heat_code'].mean()


# AC codes, easy enough
residential['ac_code'] = np.where(residential.ac == 'Y', 1, 0)

# Structure codes
# Using pandas categorical dtype for structure codes

residential['structure_d'] = residential.structure_d.astype('category')

residential['structure_code'] = residential["structure_d"].cat.codes

# observed=True groups only on categories present in the data; the categories were
# just derived from this same column, so no group is lost.
structure_codes = residential.groupby('structure_d', observed=True)['structure_code'].mean()

structure_codes

structure_d
Default          0
Multi            1
No Data          2
Row End          3
Row Inside       4
Semi-Detached    5
Single           6
Town End         7
Town Inside      8
Name: structure_code, dtype: int8

### Feature Dataset

In [14]:
# Build the numeric feature frame: remove the text columns and the original
# `structure` code column.
non_feature_columns = ['heat_d', 'ac', 'structure', 'structure_d', 'sale_date']
residential_num = residential.drop(columns=non_feature_columns)

# Need to verify whether the price listed in the data is in nominal dollars for the year of sale or is inflation-adjusted. If it needs to be adjusted, the sale years will have to be run through an inflation calculator. Email sent 07/20/20.

In [15]:
# Remaining nulls per column; these are the gaps left to fill by imputation
residential.isnull().sum()

bathrooms             0
heat_d                0
heat_code             0
ac                    0
total_rooms          35
bedrooms              5
most_recent_build     0
remodel_yr            0
stories              37
sale_date             0
price                 0
structure_d           0
structure             0
kitchens              1
fireplaces            2
land_area             0
ac_code               0
structure_code        0
dtype: int64

### Instead of just dropping missing values in features, going to use multiple imputation. It is slight overkill given the large amount of data that would still remain, but it's fun to keep learning packages, am I right?

In [18]:
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fixed seed so the imputed values are reproducible between runs.
imputer = IterativeImputer(random_state=42)

# NOTE(review): `residential_num` still contains the target column `price`, so it
# takes part in the imputation of the features — confirm this is intended.
imputed_values = imputer.fit_transform(residential_num)

# Only the column labels are carried over; no index is passed, so the new frame
# gets a default index rather than the row labels of `residential_num`.
residential_num_imputed = pd.DataFrame(imputed_values, columns = residential_num.columns)


In [20]:
# Confirm the imputed frame has no nulls left
residential_num_imputed.isnull().sum()

bathrooms            0
heat_code            0
total_rooms          0
bedrooms             0
most_recent_build    0
remodel_yr           0
stories              0
price                0
kitchens             0
fireplaces           0
land_area            0
ac_code              0
structure_code       0
dtype: int64

In [21]:
# From here on `residential` refers to the imputed, numeric-only frame.
# NOTE(review): this rebinding discards the text columns and the original row index
# (the imputed frame was built without an index argument) — confirm nothing later needs them.
residential = residential_num_imputed