In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import linear_rainbow, het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import LabelEncoder

In [2]:
# List the working directory to confirm the notebook sits next to the data/ folder.
ls

CONTRIBUTING.md    Round 1.ipynb      Round 4.ipynb      halfway-there.gif
LICENSE.md         Round 2.....ipynb  Untitled.ipynb
README.md          Round 3.....ipynb  [34mdata[m[m/


In [3]:
# Load the previously cleaned housing data set (path is relative to the notebook's directory).
data = pd.read_csv("data/cleaned_housing_data.csv")

In [4]:
# Show the (rows, columns) shape first, then preview the first five rows.
display(data.shape)
data.head()

(15762, 21)

Unnamed: 0.1,Unnamed: 0,price,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1,538000.0,2014-12-09,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
1,3,604000.0,2014-12-09,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
2,4,510000.0,2015-02-18,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
3,5,1230000.0,2014-05-12,4,4.5,5420,101930,1.0,0.0,0.0,...,11,3890,1530.0,2001,0.0,98053,47.6561,-122.005,4760,101930
4,6,257500.0,2014-06-27,3,2.25,1715,6819,2.0,0.0,0.0,...,7,1715,?,1995,0.0,98003,47.3097,-122.327,2238,6819


In [None]:
def bootstrap(df, column, target, sample_size=300, random_state=None):
    """Plot bootstrap distributions of the mean of ``target`` for each group in ``column``.

    For every distinct, non-missing value of ``df[column]`` the matching rows'
    ``target`` values are resampled with replacement ``sample_size`` times
    (each resample has as many observations as the group itself). The means of
    those resamples are drawn as one histogram per group on the current
    matplotlib axes, followed by a legend.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``column`` and ``target``.
    column : str
        Name of the grouping column.
    target : str
        Name of the numeric column whose mean is bootstrapped.
    sample_size : int, default 300
        Number of bootstrap resamples drawn per group (not the size of each
        resample).
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.RandomState`` so the plot is
        reproducible. ``None`` keeps drawing from NumPy's global random state.

    Returns
    -------
    None
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # A missing label never compares equal to itself, so it would select an
    # empty group that cannot be resampled or plotted; skip such labels.
    for val in df[column].dropna().unique():
        values = df.loc[df[column] == val, target].to_numpy()
        samples = [
            rng.choice(values, size=values.size, replace=True).mean()
            for _ in range(sample_size)
        ]
        plt.hist(samples, label=val, alpha=.7)
    plt.legend()

In [None]:
# Compare the bootstrapped mean price across homes with more than four bedrooms.
frame = data.loc[data["bedrooms"] > 4]
plt.figure(figsize=(15, 6))
bootstrap(frame, "bedrooms", "price")

In [5]:
# Drop the index column left over from an earlier CSV export.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Drop the string-typed sale date column.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
data = data.drop(columns=['date'], errors='ignore')

In [None]:
# Drop the geographic coordinate columns.
# errors="ignore" keeps the cell safe to re-run once the columns are already gone.
data = data.drop(columns=['lat', 'long'], errors='ignore')

In [6]:
# Inspect column dtypes; object-typed columns still hold strings and need cleaning.
data.dtypes

price            float64
date              object
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront       float64
view             float64
condition          int64
grade              int64
sqft_above         int64
sqft_basement     object
yr_built           int64
yr_renovated     float64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [7]:
# Collect the index labels of yr_renovated entries that cannot be parsed as a float.
# (float(NaN) succeeds, so missing values are not reported here.)
errors = []
for idx, value in data["yr_renovated"].items():
    try:
        float(value)
    except (TypeError, ValueError):
        # Only conversion failures count as bad entries; anything else should surface.
        errors.append(idx)

# `errors` holds index labels, so select with .loc (label-based) rather than .iloc (positional).
data.loc[errors, "yr_renovated"].value_counts()

Series([], Name: yr_renovated, dtype: int64)

In [8]:
# Remove rows whose basement area is the '?' placeholder.
# .copy() makes `data` an independent frame so later column assignments do not
# raise SettingWithCopyWarning.
data = data.loc[data["sqft_basement"] != "?"].copy()

In [9]:
# Cast the float-typed columns to integers, one column at a time.
# NOTE(review): astype(int) truncates fractional values (e.g. 2.25 bathrooms -> 2) - confirm this is intended.
for int_col in ("price", "bathrooms", "floors", "waterfront", "view"):
    data[int_col] = data[int_col].astype(int)

In [10]:
# Parse both columns as numbers; values that cannot be parsed become NaN (errors="coerce").
for numeric_col in ("sqft_basement", "yr_renovated"):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors="coerce")

In [11]:
# Convert the freshly parsed columns to integers.
# NOTE(review): this raises if the coercion step produced any NaN - presumably none remain; verify.
for whole_col in ("sqft_basement", "yr_renovated"):
    data[whole_col] = data[whole_col].astype(int)

In [12]:
# Persist the cleaned frame to the working directory; index=False avoids writing
# the row index as an extra unnamed column.
data.to_csv('final_clean_housing_data.csv', index = False)

### Outliers

For each column, it first computes the Z-score of each value in the column, relative to the column mean and standard deviation.

It then takes the absolute Z-score, because the direction of the deviation does not matter, only whether it is below the threshold.
`all(axis=1)` ensures that, for each row, all columns satisfy the constraint.

Finally, the result of this condition is used to index the dataframe.

In [None]:
# Keep only the rows in which every numeric value lies within 3 standard
# deviations of its column mean.
# Non-numeric columns and zero-variance columns are excluded from the check:
# the former cannot be z-scored, and the latter would yield NaN z-scores that
# reject every row.
numeric = data.select_dtypes(include="number")
varying = numeric.loc[:, numeric.std() > 0]
z_scores = (varying - varying.mean()) / varying.std()
# .copy() gives an independent frame instead of a view-like slice of `data`.
df = data.loc[(z_scores.abs() < 3).all(axis=1)].copy()

In [None]:
# Summary statistics of the outlier-filtered frame.
df.describe()

In [None]:
# Pairwise correlation matrix of the filtered frame, used to pick features.
df_corr = df.corr()
df_corr

In [None]:
# Hide the upper triangle (including the diagonal) so each correlation is shown once.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(df_corr, dtype=bool))

fig1, ax1 = plt.subplots(figsize=(11, 9))
sns.heatmap(df_corr, mask=mask, ax=ax1, cmap="viridis");

In [None]:
# Histograms of the target and the candidate features for the first model.
df[['price','sqft_above', 'sqft_living','grade','sqft_living15', 'bathrooms']].hist(figsize  = [6, 6]); 

### Let's use the following columns: sqft_above, sqft_living, grade, sqft_living15 and bathrooms

In [None]:
# Target first, then the five predictors selected for the first model.
model1_cols = [
    'price',
    'sqft_above',
    'sqft_living',
    'grade',
    'sqft_living15',
    'bathrooms',
]
model1 = df.loc[:, model1_cols]
# Pairwise scatter plots to eyeball how each predictor relates to price.
sns.pairplot(model1);

# Model 1 Evaluation:

In [None]:
# Modelling frame: a separate copy of the selected columns with incomplete rows removed.
fsm_df1 = model1.dropna()

In [None]:
# Ordinary least squares: price explained by the five selected features.
fsm1 = ols(
    formula="price ~ sqft_above + sqft_living + grade + sqft_living15 + bathrooms",
    data=fsm_df1,
)
fsm_results1 = fsm1.fit()

In [None]:
# Full regression report for the fitted model.
fsm_results1.summary()

#  Model 1, Evaluation:

We are explaining about 47% of the variance in home price, with 5 features so far and it's statistically significant.

According to our model:

 - When every feature is zero, the expected home price is the model's intercept: ???? (TODO: fill in from the regression summary)
 
 - For each additional unit of square footage of house apart from basement, we expect the home price to decrease by 58.7990
 
 - For each additional square foot of living space in the home, we expect the home price to increase by 139.3788
 
 - For each additional unit improvement of the overall grade given to the housing unit, we expect the home price to increase by 90,520 
 
 - For each additional unit of square footage, of interior housing living space for the nearest 15 neighbors, we expect the home price to increase by 36.1389
 
 - For each additional bathroom, we expect the home price to decrease by 13,630 (coefficient: -1.363e+04)

## Model 1 - Linearity Check

In [None]:
# Rainbow test for linearity on the fitted model; returns the test statistic and its p-value.
rainbow_statistic, rainbow_p_value = linear_rainbow(fsm_results1)
print("Rainbow statistic:", rainbow_statistic)
print("Rainbow p-value:", rainbow_p_value)

In [None]:
# Partial regression plot for each regressor in the fitted model.
fig = sm.graphics.plot_partregress_grid(fsm_results1)
# Add padding so the subplot labels do not overlap.
fig.tight_layout(pad=1.0)

##### The null hypothesis is that the model is linearly predicted by the features, alternative hypothesis is that it is not. 

##### Thus returning a p-value above .05 means that the current model meets the linearity assumption.

## Model 1 - Normality Check

In [None]:
# Jarque-Bera (JB): 6482.361
# Prob(JB): 0.00

#### Linear regression assumes that the residuals are normally distributed. The null hypothesis is that the residuals are normally distributed, alternative hypothesis is that they are not. 

#### Thus returning a low p-value means that the current model violates the normality assumption.

## Homoscedasticity for Model 1, Pair 1:

Linear regression assumes that the variance of the dependent variable is homogeneous across different values of the independent variable(s). 

We can visualize this by looking at the predicted home price vs. the residuals.

The null hypothesis is homoscedasticity, alternative hypothesis is heteroscedasticity. 

Thus returning a low p-value means that the current model violates the homoscedasticity assumption

In [None]:
# Observed prices and the model's in-sample predictions for the same rows.
y = fsm_df1["price"]
y_hat = fsm_results1.predict()

In [None]:
# Residuals against fitted values: an even band around zero suggests constant variance.
fig2, ax2 = plt.subplots()
ax2.scatter(x=y_hat, y=y_hat - y, color="blue", alpha=0.2)
ax2.set_xlabel("Predicted Home Price")
ax2.set_ylabel("Residuals (Predicted - Actual Home Price)");

In [None]:
# Breusch-Pagan test for heteroscedasticity.
# The auxiliary regression needs a constant term, so pass the fitted model's own
# design matrix (which includes the intercept) rather than the raw feature
# columns, together with the model residuals (actual - predicted).
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(fsm_results1.resid, fsm_results1.model.exog)
print("Lagrange Multiplier p-value:", lm_p_value)
print("F-statistic p-value:", f_p_value)

The null hypothesis is homoscedasticity, alternative hypothesis is heteroscedasticity. 

Thus returning a low p-value means that the current model violates the homoscedasticity assumption

## Independence for Model 1:

You might have noticed in the regression output that there was a warning about the condition number being high. 

The condition number is a measure of stability of the matrix used for computing the regression and a number above 30 can indicate strong multicollinearity. 

Our output was 2.94e+04

In [None]:
vif_features = ['sqft_above', 'sqft_living','grade','sqft_living15', 'bathrooms']
rows = fsm_df1[vif_features].values

# VIF regresses each column on all the others, so the design matrix must carry
# an intercept column; without it the factors are computed around zero instead
# of the mean and come out inflated.
design = sm.add_constant(fsm_df1[vif_features], has_constant="add")

vif_df = pd.DataFrame()
# Column 0 of `design` is the constant, so the features start at position 1.
vif_df["VIF"] = [variance_inflation_factor(design.values, i) for i in range(1, design.shape[1])]
vif_df["feature"] = vif_features

vif_df

##### A "rule of thumb" for VIF is that 5 is too high, so I think it's reasonable to say that we are violating the independence assumption, which is consistent with the high condition number.

# Summary for Model 1:

START ...