In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the housing data from an Excel workbook; the path is relative to the
# current working directory.
# NOTE(review): legacy .xls workbooks presumably need the optional `xlrd`
# engine installed - confirm against the environment.
df = pd.read_excel("real_estate_withheaders.xls")

In [3]:
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
0,7129300520,2014-10-13,3,1.0,1180,5650,1.0,0,0,3,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900
1,6414100192,2014-12-09,3,2.25,2570,7242,2.0,0,0,3,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000
2,5631500400,2015-02-25,2,1.0,770,10000,1.0,0,0,3,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000
3,2487200875,2014-12-09,4,3.0,1960,5000,1.0,0,0,5,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000
4,1954400510,2015-02-18,3,2.0,1680,8080,1.0,0,0,3,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000


In [4]:
# Absolute pairwise correlations between the numeric columns (sign is
# discarded so only the strength of each relationship is compared).
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# numeric_only=False, so non-numeric columns such as the datetime `date`
# column are no longer dropped silently.
# NOTE(review): the `numeric_only` keyword requires pandas >= 1.5.
corr_matrix = df.corr(numeric_only=True).abs()

In [5]:
corr_matrix

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
id,1.0,0.00115,0.005162,0.012241,0.131911,0.018608,0.002727,0.011536,0.023803,0.008188,0.010799,0.005193,0.021617,0.016925,0.008211,0.001798,0.020672,0.002701,0.138557,0.016772
bedrooms,0.00115,1.0,0.514508,0.578212,0.032471,0.177944,0.006834,0.080008,0.026496,0.356563,0.479386,0.302808,0.15567,0.018389,0.154092,0.009951,0.132054,0.393406,0.03069,0.308787
bathrooms,0.005162,0.514508,1.0,0.755758,0.088373,0.502582,0.063744,0.188386,0.126479,0.665838,0.686668,0.28344,0.507173,0.050544,0.204786,0.02428,0.224903,0.569884,0.088303,0.525906
sqft_living,0.012241,0.578212,0.755758,1.0,0.173453,0.353953,0.103854,0.284709,0.059445,0.762779,0.876448,0.43513,0.318152,0.055308,0.199802,0.052155,0.241214,0.756402,0.184342,0.701917
sqft_lot,0.131911,0.032471,0.088373,0.173453,1.0,0.004814,0.021632,0.0749,0.00883,0.114731,0.184139,0.015418,0.052946,0.007686,0.129586,0.085514,0.230227,0.144763,0.718204,0.089876
floors,0.018608,0.177944,0.502582,0.353953,0.004814,1.0,0.023755,0.028814,0.264075,0.458794,0.523989,0.245715,0.489193,0.006427,0.059541,0.049239,0.125943,0.280102,0.010722,0.256804
waterfront,0.002727,0.006834,0.063744,0.103854,0.021632,0.023755,1.0,0.401971,0.016611,0.082888,0.072109,0.080559,0.026153,0.092873,0.030272,0.014306,0.041904,0.086507,0.030781,0.266398
view,0.011536,0.080008,0.188386,0.284709,0.0749,0.028814,0.401971,1.0,0.045999,0.251728,0.167609,0.277078,0.053636,0.103951,0.084622,0.005871,0.078107,0.280681,0.072904,0.39737
condition,0.023803,0.026496,0.126479,0.059445,0.00883,0.264075,0.016611,0.045999,1.0,0.146896,0.158904,0.173849,0.361592,0.060788,0.002888,0.015102,0.105877,0.093072,0.003126,0.036056
grade,0.008188,0.356563,0.665838,0.762779,0.114731,0.458794,0.082888,0.251728,0.146896,1.0,0.756073,0.16822,0.447865,0.014261,0.185771,0.113575,0.200341,0.713867,0.120981,0.667951


In [6]:
# Absolute correlation of every column in the matrix with `price`
# (the `price` entry itself is 1.0 by construction).
price_impact = corr_matrix['price']

In [7]:
price_impact

id               0.016772
bedrooms         0.308787
bathrooms        0.525906
sqft_living      0.701917
sqft_lot         0.089876
floors           0.256804
waterfront       0.266398
view             0.397370
condition        0.036056
grade            0.667951
sqft_above       0.605368
sqft_basement    0.323799
yr_built         0.053953
yr_renovated     0.126424
zipcode          0.053402
lat              0.306692
long             0.022036
sqft_living15    0.585241
sqft_lot15       0.082845
price            1.000000
Name: price, dtype: float64

In [8]:
# Remove columns before feature engineering. In the `price_impact` output,
# `id` and `long` show near-zero absolute correlation with price
# (about 0.017 and 0.022).
# NOTE(review): `sqft_above` correlates with price at about 0.61, so it was
# presumably dropped for collinearity with `sqft_living` (about 0.88 between
# the two in the correlation matrix) - confirm.
df = df.drop(columns = ['id', 'long', 'sqft_above'])

In [9]:
df.shape

(21597, 18)

## 1 Feature engineering

In [10]:
# Look at one value (row label 1) to confirm how `date` was parsed.
df.loc[1, 'date']

Timestamp('2014-12-09 00:00:00')

In [11]:
# Month number (1-12) of each sale, taken from the `date` column.
df['month'] = pd.to_datetime(df['date']).dt.month

In [12]:
# define a function to check if there is a season spike in prices

def season(x):
    """Map a month number to a coarse season label.

    Months 4 through 9 (April-September) are labelled 'summer'; every
    other value is labelled 'winter'.
    """
    in_summer_months = 4 <= x < 10
    return 'summer' if in_summer_months else 'winter'

In [13]:
# Label every sale with its season, derived element-wise from the month.
df['season'] = df['month'].map(season)

In [14]:
# `date` and the intermediate `month` column were used to derive `season`;
# drop them now that the season label exists.
df = df.drop(columns = ['date','month'])

In [15]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,yr_renovated,zipcode,lat,sqft_living15,sqft_lot15,price,season
0,3,1.0,1180,5650,1.0,0,0,3,7,0,1955,0,98178,47.5112,1340,5650,221900,winter
1,3,2.25,2570,7242,2.0,0,0,3,7,400,1951,1991,98125,47.721,1690,7639,538000,winter
2,2,1.0,770,10000,1.0,0,0,3,6,0,1933,0,98028,47.7379,2720,8062,180000,winter
3,4,3.0,1960,5000,1.0,0,0,5,7,910,1965,0,98136,47.5208,1360,5000,604000,winter
4,3,2.0,1680,8080,1.0,0,0,3,8,0,1987,0,98074,47.6168,1800,7503,510000,winter


In [16]:
# The raw build year is less useful than the property's age, so it is
# bucketed into era groups below; look at one raw value (row label 1) first.
df.loc[1, 'yr_built']

1951

In [24]:
def era(x):
    """Bucket a construction year into one of three era labels.

    Years below 1930 give 'pre 1930', years from 1930 up to (not including)
    1980 give 'pre 1980', and anything else gives 'post 1980'.
    """
    # Cut-offs are checked in ascending order; the first exclusive upper
    # bound that the year falls under decides the label.
    for upper_bound, label in ((1930, 'pre 1930'), (1980, 'pre 1980')):
        if x < upper_bound:
            return label
    return 'post 1980'

In [25]:
# Era label for every property, derived element-wise from the build year.
df['built_era'] = df['yr_built'].map(era)

In [27]:
# Property age in years, measured against the fixed reference year 2015.
# NOTE(review): the previewed rows show sales dated in both 2014 and 2015, so
# this is not the age at the time of sale - confirm 2015 is the intended
# reference year.
df['age'] = 2015 - df['yr_built']

In [28]:
# `yr_built` is now represented by the derived `built_era` and `age` columns,
# so the raw year column is removed.
df = df.drop(columns = ['yr_built'])

In [29]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_renovated,zipcode,lat,sqft_living15,sqft_lot15,price,season,built_era,age
0,3,1.0,1180,5650,1.0,0,0,3,7,0,0,98178,47.5112,1340,5650,221900,winter,pre 1980,60
1,3,2.25,2570,7242,2.0,0,0,3,7,400,1991,98125,47.721,1690,7639,538000,winter,pre 1980,64
2,2,1.0,770,10000,1.0,0,0,3,6,0,0,98028,47.7379,2720,8062,180000,winter,pre 1980,82
3,4,3.0,1960,5000,1.0,0,0,5,7,910,0,98136,47.5208,1360,5000,604000,winter,pre 1980,50
4,3,2.0,1680,8080,1.0,0,0,3,8,0,0,98074,47.6168,1800,7503,510000,winter,post 1980,28
