In [86]:
# import needed libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from scipy import stats

# 1. Data pre-processing and Preview

In [87]:
original_data = pd.read_csv('data.csv')

In [88]:
original_data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


## 1.2 dropping some columns
drop these column: date, country, street
- date and country are the same for all rows
- street has high cardinality, so we just skip for now

In [89]:
data = original_data.drop(['date', 'street', 'country'], axis=1)

In [90]:
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,Shoreline,WA 98133
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,Seattle,WA 98119
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,Kent,WA 98042
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,Bellevue,WA 98008
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,Redmond,WA 98052


In [91]:
from sklearn.feature_selection import mutual_info_regression
x = list(data.columns[1:])
y = data['price'].values.reshape(-1, 1)
result = {}
for col in x:    
    if data[col].dtype != np.float64 and data[col].dtype != np.int64:
        x_factorized, _ = data[col].factorize() 
        m = mutual_info_regression(y, x_factorized , random_state=0)
    else:
        m = mutual_info_regression(y, data[col], random_state=0)
    result[col] = m
    print(col)
    print(m)
    print()

bedrooms
[0.06749758]

bathrooms
[0.19192697]

sqft_living
[0.33635382]

sqft_lot
[0.08510813]

floors
[0.06542467]

waterfront
[0.00367717]

view
[0.05286753]

condition
[0]

sqft_above
[0.24520199]

sqft_basement
[0.06945182]

yr_built
[0.06702013]

yr_renovated
[0]

city
[0.29930583]

statezip
[0.40274408]



In [92]:
threshold = 0.05
new_columns = []
for col,mir in result.items():
    if mir > threshold:
        new_columns.append(col)
new_columns.append('price')
data = data[new_columns]
new_columns

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'view',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'city',
 'statezip',
 'price']

In [93]:
data.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,sqft_above,sqft_basement,yr_built,city,statezip,price
0,3.0,1.5,1340,7912,1.5,0,1340,0,1955,Shoreline,WA 98133,313000.0
1,5.0,2.5,3650,9050,2.0,4,3370,280,1921,Seattle,WA 98119,2384000.0
2,3.0,2.0,1930,11947,1.0,0,1930,0,1966,Kent,WA 98042,342000.0
3,3.0,2.25,2000,8030,1.0,0,1000,1000,1963,Bellevue,WA 98008,420000.0
4,4.0,2.5,1940,10500,1.0,0,1140,800,1976,Redmond,WA 98052,550000.0


## 1.3 Oulier detection with Z-score

In [94]:
data.shape

(4600, 12)

In [95]:
columns = list(data.columns)
rows_to_drop = []
for col in columns:
    if data[col].dtype != np.float64 and data[col].dtype != np.int64:
        continue
    z_scores = stats.zscore(data[col])
    for i in range(len(z_scores)):
        if abs(z_scores[i]) > 3:
            rows_to_drop.append(i)

rows_to_drop = sorted(list(set(rows_to_drop)))
data = data.drop(rows_to_drop)

In [96]:
data.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,sqft_above,sqft_basement,yr_built,city,statezip,price
0,3.0,1.5,1340,7912,1.5,0,1340,0,1955,Shoreline,WA 98133,313000.0
2,3.0,2.0,1930,11947,1.0,0,1930,0,1966,Kent,WA 98042,342000.0
3,3.0,2.25,2000,8030,1.0,0,1000,1000,1963,Bellevue,WA 98008,420000.0
4,4.0,2.5,1940,10500,1.0,0,1140,800,1976,Redmond,WA 98052,550000.0
5,2.0,1.0,880,6380,1.0,0,880,0,1938,Seattle,WA 98115,490000.0


In [97]:
data.shape

(4247, 12)

## 1.4 Get Features and Dependent Variable

In [98]:
# seperate the Dependent (y) and Independent (X) variables

y = data.iloc[:, -1].values # price column
X = data.iloc[:, :-1].values

## 1.5 Encoding

In [99]:
# City and Statezip are categorical. 

state_zip_index = list(data.columns).index("statezip")
city_index = list(data.columns).index("city")
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [city_index, state_zip_index])], remainder='passthrough')
X = ct.fit_transform(X)

## 1.6 Split data

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=1)

# 2. MultiLinear Regression

In [101]:
mlr = LinearRegression()
mlr.fit(X_train, y_train)
y_pred_mlr = mlr.predict(X_test)
r2_mlr = r2_score(y_test, y_pred_mlr)
mae_mlr = mean_absolute_error(y_test, y_pred_mlr)
print("R2 score for multi linear Regression:  {}".format(r2_mlr))
print("Mean Absolute Error for multi linear Regression:  {}".format(mae_mlr))

R2 score for multi linear Regression:  0.7572330052673701
Mean Absolute Error for multi linear Regression:  82862.86077849456


# 3. Decision Tree Regression

In [102]:
dt_reg = DecisionTreeRegressor(random_state=1)
dt_reg.fit(X_train, y_train)
y_pred_dt = dt_reg.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
print("R2 score for Decision Tree Regression:  {}".format(r2_dt))
print("Mean Absolute Error for Decision Tree Regression:  {}".format(mae_dt))

R2 score for Decision Tree Regression:  0.3318256601048306
Mean Absolute Error for Decision Tree Regression:  127437.63392190394
