In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import linear_rainbow, het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import LabelEncoder

In [2]:
ls

CONTRIBUTING.md    Round 1.ipynb      Round 4.ipynb      Untitled.ipynb
LICENSE.md         Round 2.....ipynb  Round 5.ipynb      data/
README.md          Round 3.....ipynb  Round 6.ipynb      halfway-there.gif


In [4]:
# Read the "Semi-Editted" CSV from the local data/ folder into the working frame
data = pd.read_csv(filepath_or_buffer="data/Semi-Editted")

In [5]:
# Preview the first five rows to check that the columns loaded as expected
data.head(n=5)

Unnamed: 0,price,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15
0,221900,10/13/2014,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,1340,5650
1,538000,12/9/2014,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,1690,7639
2,180000,2/25/2015,2,1,770,10000,1,0,0,3,6,770,0,1933,83,98028,2720,8062
3,604000,12/9/2014,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,1360,5000
4,510000,2/18/2015,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,1800,7503


In [11]:
# Summary statistics for every numeric column (default quartiles spelled out).
# NOTE(review): the output saved with this cell already lists age, month_sold,
# year_sold, renovated and yard_space, so it was last executed (In [11]) after
# the feature-engineering cells that follow it. On a fresh top-to-bottom run it
# would summarise the columns as loaded instead - consider moving this cell
# below those cells.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,zipcode,sqft_living15,sqft_lot15,age,month_sold,year_sold,renovated,yard_space
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,540296.6,3.3732,1.75066,2080.32185,15099.41,1.445988,0.00676,0.233181,3.409825,7.657915,1788.596842,291.83382,98077.951845,1986.620318,12758.283512,50.000324,6.573969,2014.322962,0.013104,13019.09
std,367368.1,0.926299,0.734042,918.106125,41412.64,0.551658,0.081944,0.764673,0.650546,1.1732,827.759761,437.822457,53.513072,685.230472,27274.44195,29.375234,3.115061,0.467619,0.113721,41263.3
min,78000.0,1.0,0.0,370.0,520.0,1.0,0.0,0.0,1.0,3.0,370.0,0.0,98001.0,399.0,651.0,6.0,1.0,2014.0,0.0,-1900.0
25%,322000.0,3.0,1.0,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,98033.0,1490.0,5100.0,24.0,4.0,2014.0,0.0,3095.0
50%,450000.0,3.0,2.0,1910.0,7618.0,1.0,0.0,0.0,3.0,7.0,1560.0,0.0,98065.0,1840.0,7620.0,46.0,6.0,2014.0,0.0,5710.0
75%,645000.0,4.0,2.0,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,2210.0,550.0,98118.0,2360.0,10083.0,70.0,9.0,2015.0,0.0,8662.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.0,1.0,4.0,5.0,13.0,9410.0,4820.0,98199.0,6210.0,871200.0,121.0,12.0,2015.0,1.0,1650059.0


In [6]:
# Express construction year as the home's age measured from 2021, then discard
# yr_built because `age` carries the same information.
data = data.assign(age=2021 - data['yr_built']).drop(columns='yr_built')

In [7]:
# Split the "month/day/year" sale date on "/" so the parts can be used separately
date = data['date'].str.split('/', expand=True)
# Keep the month (part 0) and the year (part 2) as integer columns and discard
# the original date string in the same step
data = data.assign(
    month_sold=date[0].astype('int64'),
    year_sold=date[2].astype('int64'),
).drop(columns='date')

In [8]:
# Turn yr_renovated into a binary "recently renovated" flag, since renovating a
# home is not mandatory and the raw year is 0 for homes that never were.

# Treat a missing renovation year as "never renovated" (0). The result is
# assigned back to the column: calling fillna(inplace=True) on data.yr_renovated
# is chained assignment, which newer pandas warns about and which leaves `data`
# unchanged under copy-on-write.
data['yr_renovated'] = data['yr_renovated'].fillna(0.0)
# Years between the sale and the renovation (equals year_sold when yr_renovated is 0)
data['renovated'] = data.year_sold - data.yr_renovated
# Flag 1 when the renovation was at most 10 years before the sale, otherwise 0
age = data.age.values
values = np.where(data.renovated <= 10, 1, 0)
# Homes aged 5 or less are also flagged as renovated.
# NOTE(review): age is measured from 2021 earlier in this notebook and the
# recorded describe() output shows a minimum age of 6, so this condition does
# not match any row of this dataset - confirm whether age at sale was intended.
data['renovated'] = np.where(age <= 5, 1, values)
# Drop the yr_renovated column now that the flag replaces it
data = data.drop(columns=['yr_renovated'])

In [9]:
# Column reference:
#   sqft_living   - square footage of the home
#   sqft_lot      - square footage of the lot
#   sqft_above    - square footage of the house apart from the basement
#   sqft_basement - square footage of the basement
#   sqft_above + sqft_basement = sqft_living
#
# Approximate the outdoor area as the lot size minus the living area.
# NOTE(review): the recorded describe() output shows a minimum yard_space of
# -1900, i.e. some homes have more living area than lot area - confirm whether
# negative values are acceptable for this feature.
data['yard_space'] = data['sqft_lot'] - data['sqft_living']

In [12]:
# Preview the first five rows again to check the newly derived columns
data.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,zipcode,sqft_living15,sqft_lot15,age,month_sold,year_sold,renovated,yard_space
0,221900,3,1,1180,5650,1,0,0,3,7,1180,0,98178,1340,5650,66,10,2014,0,4470
1,538000,3,2,2570,7242,2,0,0,3,7,2170,400,98125,1690,7639,70,12,2014,0,4672
2,180000,2,1,770,10000,1,0,0,3,6,770,0,98028,2720,8062,88,2,2015,0,9230
3,604000,4,3,1960,5000,1,0,0,5,7,1050,910,98136,1360,5000,56,12,2014,0,3040
4,510000,3,2,1680,8080,1,0,0,3,8,1680,0,98074,1800,7503,34,2,2015,0,6400
