In [1]:
import numpy as np
import pandas as pd
import pickle
import numbers
import statsmodels.api as sm
import random
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from tkinter import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
pd.options.display.max_seq_items = 2000

  from pandas.core import datetools


For earlier analysis, we took the log of the endprice, this time however, we are interested in the effect of certain variables on  the real endprice.

In [2]:
df = pickle.load(open('houses.p','rb'))
df['endprice'] = np.exp(df['endprice'])

We start with a few comparable features, we should add some more in the future

In [4]:
compare_features = ['bathroom.badkamer','feature.zwembad','bathroom.aparte toilet','balcony.balkon','feature.sauna']

In [5]:
df[compare_features].isnull().sum()

bathroom.badkamer         0
feature.zwembad           0
bathroom.aparte toilet    1
balcony.balkon            0
feature.sauna             0
dtype: int64

In [6]:
df = df.dropna(subset=compare_features)

In [7]:
weights = {"volume" : 0.275818,
            "livingspace" : 0.251524,
            "Gemiddelde woningwaarde:x 1 000 euro" : 0.113488,
#            "housetype" : 0.054045,
           "V1.x":0.036266,
           "lotsurface":0.029131,
           "yearofconstruction":0.020824,
           "longitude":0.017308,
           'latitude':0.012521}
#            'housesubtype':0.011934}
rcf = list(weights.keys())

In [8]:
len(df)

78294

In [9]:
df[rcf].isnull().sum()

volume                                     49
livingspace                              4575
Gemiddelde woningwaarde:x 1 000 euro     1866
V1.x                                        0
lotsurface                              24943
yearofconstruction                       4288
longitude                                   0
latitude                                    0
dtype: int64

In [10]:
weights = {"volume" : 0.275818,
            "livingspace" : 0.251524,
            "Gemiddelde woningwaarde:x 1 000 euro" : 0.113488,
#            "housetype" : 0.054045,
           "V1.x":0.036266,
           "lotsurface":0.029131,
           "yearofconstruction":0.020824,
           "longitude":0.017308,
           'latitude':0.012521}
#            'housesubtype':0.011934}

#Normalize weights 
sum_vals = sum(weights.values())
for key in weights.keys():
    weights[key] = weights[key] / sum_vals
    
rcf = list(weights.keys())
#ccf = ["housesubtype","housetype"]
#ccf = ['feature.zwembad','feature.sauna', 'feature.glasvezelkabel']
#       'feature.stoomcabine', 'feature.stoomcabine','feature.jacuzzi']

In [11]:
df[df['lotsurface'].isnull()].housetype.value_counts()

appartement        19794
eengezinswoning     4409
herenhuis            310
villa                168
bungalow             146
woonboerderij         65
landhuis              28
woonboot              14
grachtenpand           8
stacaravan             1
landgoed               0
Name: housetype, dtype: int64

The column with the most missing values is "lotsurface". Looking at how these missing values are centered, we see that most of these missing values happen for appartments (which is to be expected). As it turns out, all appartments have lotsurface Nan. For the purpose of the regression we put this value to 0 for all appartments.

For all the other values, in order to empower regression, we could either drop the row or set the value to the average of the housetype. Since there appears to be plenty of data without Nans and since we want to use mostly accurate data, we decide to drop them.

In [12]:
df['lotsurface'] = np.where(df['housetype'] == 'appartement', 0, df['lotsurface'])
df = df.dropna(subset=rcf)

In [13]:
# weights --> dictionary of feature name to importance
# data --> pandas datafram of houses with all relevant features
# rcf --> list of all relevant features 
# puf --> list of all potentially upgradable features -should be a subset of relevant features 
# thres --> maximum distance a house can have from the target house if it is to be put in a reference set 
# target_house

def normalize_column(x):
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return x_scaled, min_max_scaler

def weightedL2(a,b,w):
    q = a-b
    return np.sqrt((w*q*q).sum())

# rcf --> relevent comparable features rcfv--> relevant comparable feature values puf --> potential upgradable features
def get_reference_indices(weights, data, thres, target_values):
    reference_indices = []
    for i, house in enumerate(data):
        if weightedL2(target_values, house,weights) <  thres:
            reference_indices.append(i)
    return reference_indices

def get_reference_sets(weights, rcf, ccf, data, target_house, thres=1):
    df_ans = data.copy()
    df = data[rcf+ccf] # We only look only at the columns that were determined to be relevant for comparison
    t_house = target_house[rcf+ccf]
#     for col in ccf:
#         val = t_house[col].iloc[0]
#         df = df[df[col]==val]
    for col in df[rcf].columns:
        if (df[col].dtype in [bool,float,int,np.int64]):
            a, b = normalize_column(df[[col]].values.astype(float))
            df[col] = a
            t_house[col] = b.transform(t_house[[col]].values.astype(float))
    target_values = t_house[rcf].values
    data_array = df[rcf].values
#     return target_values
    weights_arr = np.array(list(weights.copy().values()))
    reference_index = get_reference_indices(weights_arr, data_array, thres, target_values)
    return df_ans.iloc[reference_index]

In [14]:
# thresh is a measure of how close a house has to be to the target house in order for it to be similar
thresh = 1
t_house = pd.DataFrame(df.iloc[2]).T

In [15]:
ref_sets = get_reference_sets(weights=weights, rcf = rcf, ccf = compare_features, data= df, target_house= t_house,thres=thresh)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
def feature_importance (reference_sets, rcf, compare_features):
    #This function will fit a simple regression in the data and return the feature importances for each upgradable feature
    # It will also return the test R2 score
    #drop NaN values #note we took the easy approach and dropped everything 
    df_result = reference_sets[rcf + compare_features + ['endprice']].dropna()
    #scaling the data
#     scaler = StandardScaler()
#     scaler.fit (df_result[rcf + compare_features])
    #creating X,y and spliting the dataset into train and test set
#     X = scaler.transform(df_result[rcf + compare_features])
    X = df_result[rcf + compare_features]
    y = df_result.endprice
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    #fitting the regression
    reg = LinearRegression().fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    feature_imp = pd.Series(data=reg.coef_, index = rcf+compare_features)
    return (score,feature_imp)

We can use the standard regression we always use from scikit learn but it's hard to gain insights from that.

In [17]:
score, importance_df = feature_importance (ref_sets, rcf, compare_features)
print ("Regression R2 score : ", score,'\n\nfeature importances : \n',importance_df)

Regression R2 score :  0.6761654464711715 

feature importances : 
 volume                                      69.318784
livingspace                               1832.048245
Gemiddelde woningwaarde:x 1 000 euro       655.444015
V1.x                                        -0.943188
lotsurface                                   3.714039
yearofconstruction                         -87.390065
longitude                                -9038.785285
latitude                                  9634.327674
bathroom.badkamer                        18727.837295
feature.zwembad                         182834.873036
bathroom.aparte toilet                    5215.037688
balcony.balkon                           16722.373716
feature.sauna                            21219.125828
dtype: float64


We can also use another package, which has slightly lower R-squared but does give insight into the impact of all the variables. This is similar to what we see for STATA during the lecture.

In [18]:
data = ref_sets[rcf + compare_features + ['endprice']].dropna()
data = data.astype(float)
X = np.asarray(data[rcf + compare_features])
y = np.asarray(data['endprice'])
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.660
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     9874.
Date:                Sat, 01 Dec 2018   Prob (F-statistic):               0.00
Time:                        22:53:25   Log-Likelihood:            -8.5760e+05
No. Observations:               66239   AIC:                         1.715e+06
Df Residuals:                   66225   BIC:                         1.715e+06
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3.684e+05    4.9e+04     -7.520      0.0

In [19]:
# This function returns the r-squared, as well as the lower bound of the 95% confidence interval 
# and the upper bound of the 95% confidence interval.
def evaluate_ols(reference_set, rel_features, comp_features):
        data = reference_set[rel_features + comp_features + ['endprice']]
        data = data.astype(float)
        X = np.asarray(data[rel_features + comp_features])
        y = np.asarray(data['endprice'])
        X2 = sm.add_constant(X)
        est = sm.OLS(y, X2)
        est2 = est.fit()
        r = est2.rsquared
        lb = np.array(est2.conf_int())[:,0]
        ub = np.array(est2.conf_int())[:,1]
        return r, lb, ub
    
def evaluate_model(weights, rel_feats, comp_feats, data, target_houses, thres=1):
    df = data.copy()
    r_squareds = 0
    lbs = np.zeros(len(rel_feats + comp_feats + ['endprice']))
    ubs = np.zeros(len(rel_feats + comp_feats + ['endprice']))
    for i, row in target_houses.iterrows():
        t_house = target_houses[target_houses.index.isin([i])]
        ref_set = get_reference_sets(weights, rel_feats, comp_feats, df, t_house, thres=1)
        r, lb, ub = evaluate_ols(ref_set, rel_feats, comp_feats)
        r_squareds += r
        lbs += lb
        ubs += ub
    r_avg = r_squareds/len(target_houses)
    lb_avg = lbs/len(target_houses)
    ub_avg = ubs/len(target_houses)
    return r_avg, lb_avg, ub_avg

def run_evaluation(t_houses_size, weights, rel_feats, comp_feats, data, thres=1):
    t_houses = df.loc[random.sample(list(df.index), t_houses_size)]
    ans = evaluate_model(weights, rcf, compare_features, df, t_houses)
    print("Average r-squared for {} inputs is: {}".format(t_houses_size, ans[0]))
    for i, col in enumerate(rel_feats + comp_feats + ['endprice']):
        print("The column {} has an average 95% confidence interval between {} and {}".format(col, ans[1][i], ans[2][i]))
        print("\n")

By running the model a (relatively large) number of times we can see how well our predictions behave.

In [21]:
run_evaluation(100, weights, rcf, compare_features, df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Average r-squared for 100 inputs is: 0.6596574939811649
The column volume has an average 95% confidence interval between -464405.25558135915 and -272371.287924488


The column livingspace has an average 95% confidence interval between 70.1585227986871 and 84.49727582820016


The column Gemiddelde woningwaarde:x 1 000 euro has an average 95% confidence interval between 1764.9615442611396 and 1829.410448235439


The column V1.x has an average 95% confidence interval between 642.722850780081 and 662.7780143291016


The column lotsurface has an average 95% confidence interval between -1.0205354211034416 and -0.9057047854534556


The column yearofconstruction has an average 95% confidence interval between 3.3541411815716464 and 4.274251091949552


The column longitude has an average 95% confidence interval between -93.36301956000638 and -49.875979199976335


The column latitude has an average 95% confidence interval between -10353.060425581614 and -6700.2437350273485


The column bathroom.b