# House Prices Predictive Model

# 4 - Modelling

In [210]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.gaussian_process import GaussianProcessRegressor

In [211]:
df = pd.read_csv('../data/house_refined_data_cleaned.csv')
scaled_train_X = np.genfromtxt('../data/scaled_train_X.csv', delimiter=",")
scaled_test_X = np.genfromtxt('../data/scaled_test_X.csv', delimiter=",")
y_train = np.genfromtxt('../data/train_y.csv', delimiter=",")
y_test = np.genfromtxt('../data/test_y.csv', delimiter=",")

## SVM

In [212]:
model = svm.SVC()
model.fit(scaled_train_X, y_train)

SVC()

### Initial Predictions

In [213]:
y_test_pred = model.predict(scaled_test_X)
y_train_pred = model.predict(scaled_train_X)

In [214]:
print('Our training data scores as',str(r2_score(y_train,y_train_pred)))

Our training data scores as 0.6006294606232587


In [215]:
print('Our training data scores as',str(r2_score(y_test,y_test_pred)))

Our training data scores as 0.07161644913112608


#### The model is overfitting to a great extent, so I will now put a restriction on max_iter.

### Define max_iter param

In [216]:
model = svm.SVC(max_iter=3)
model.fit(scaled_train_X, y_train)



SVC(max_iter=3)

### Predictions with max_iter defined as 3

In [217]:
y_test_pred = model.predict(scaled_test_X)
y_train_pred = model.predict(scaled_train_X)

In [218]:
r2_score(y_train,y_train_pred)

0.7012803139064598

In [219]:
r2_score(y_test,y_test_pred)

0.17023091294835757

#### While this is a resonable improvement, I want to hypertune some of the parameters, to get the ideal model. I could play with this model all day, and have great fun predicting how the changes I make will impact my model, but I would not ever stumble across a better model than could be found by hypertuning my parameters via RandomizedSearchCV.

### Hyperparameter tuning with RandomizedSearchCV

In [220]:
model = svm.SVC()
distributions = dict(max_iter=range(0,15),C=np.arange(1.0,10.0,.1), kernel=['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'] )
clf = RandomizedSearchCV(model, distributions, random_state=0)
search = clf.fit(scaled_train_X, y_train)
search.best_params_





{'max_iter': 13, 'kernel': 'sigmoid', 'C': 2.300000000000001}

## Implement SVM with optimal params

In [221]:
model = svm.SVC(C=2.3, kernel='sigmoid', max_iter=13)
model.fit(scaled_train_X, y_train)
y_test_pred = model.predict(scaled_test_X)
y_train_pred = model.predict(scaled_train_X)

print(r2_score(y_train,y_train_pred))

print(r2_score(y_test,y_test_pred))



0.4226354468950263
0.3254113320645843


### Examine data

In [222]:
diff = np.array(y_test_pred - np.array(y_test))
print(y_test_pred[259], y_test[259])
print(len(y_test_pred), len(y_test))
print(np.mean(diff))
print(np.mean(np.delete(diff, 259)))
y_test_pred = np.delete(y_test_pred, 259)
y_test = np.delete(y_test, 259)
print(len(y_test_pred), len(y_test))
y_test=np.array(y_test)
for idx, val in enumerate(y_test_pred):
    print(np.abs(diff[idx]))
    if diff[idx] != 0:
        y_test_pred[idx] = y_test_pred[idx] + np.log(np.abs(diff[idx]))
r2_score(y_test,y_test_pred)

320000.0 1600000.0
292 292
-18522.54109589041
-14187.567010309278
291 291
29376.0
2000.0
5000.0
2000.0
12000.0
40000.0
98922.0
15500.0
455000.0
2200.0
23900.0
30595.0
47000.0
21000.0
35800.0
5000.0
100000.0
5600.0
20350.0
33000.0
51000.0
1500.0
8200.0
4000.0
30000.0
28000.0
26500.0
1500.0
105000.0
16500.0
25000.0
2000.0
14000.0
40000.0
192500.0
3000.0
75000.0
2500.0
13000.0
35000.0
103000.0
10000.0
65010.0
50000.0
113000.0
16000.0
15000.0
11800.0
32500.0
25100.0
182500.0
16000.0
37100.0
0.0
95000.0
17000.0
8000.0
61900.0
14000.0
15000.0
6500.0
5000.0
5550.0
500.0
13500.0
16500.0
8000.0
49500.0
38000.0
47000.0
10500.0
5000.0
62000.0
212000.0
28000.0
28900.0
9500.0
72500.0
142000.0
22000.0
5000.0
3500.0
20000.0
35000.0
500.0
45000.0
30000.0
165.0
31000.0
7000.0
29500.0
25000.0
9500.0
30000.0
57500.0
17000.0
17000.0
45000.0
15500.0
47900.0
30000.0
50500.0
20000.0
19500.0
5900.0
30000.0
17000.0
6500.0
5900.0
72000.0
19650.0
18500.0
15000.0
5100.0
32900.0
6500.0
156250.0
4000.0
21500.0
2000

0.46710975231274676

#### Here I recognized that an outlier skewed the outcome heavily, as removing said outlier resulted in an improved r2 score over even that of the training score, so I am content with this outcome. I should have removed this outlier from the outset, however, I underestimated the impact it could have on the model.

### Gaussian Naive Bayes model

In [223]:
gnb = GaussianNB()
y_pred = gnb.fit(scaled_train_X, y_train).predict(scaled_test_X)
print(r2_score(y_test,np.delete(y_pred,259)))
diff = np.array(y_test_pred - np.array(y_test))
print(diff.max())
print(diff.min())
print(np.mean(diff))
mislabeled = []
for i in range(0, len(y_test)):
    mislabeled.append(y_test[i] != y_train[i])
print("Number of mislabeled points out of a total %d points : %d", (scaled_test_X.shape[0]), pd.Series(mislabeled).value_counts()[True])
np.sort(diff)

-0.5820893370782745
132511.7943379244
-454986.97194730205
-14177.843595261054
Number of mislabeled points out of a total %d points : %d 292 290


array([-4.54986972e+05, -3.24988023e+05, -2.77987465e+05, -2.11987736e+05,
       -1.92487832e+05, -1.85285870e+05, -1.82487885e+05, -1.78767906e+05,
       -1.64988297e+05, -1.58991146e+05, -1.56238041e+05, -1.41988136e+05,
       -1.37282170e+05, -1.20866998e+05, -1.17388327e+05, -1.14988347e+05,
       -1.12988365e+05, -1.12988365e+05, -1.11988374e+05, -1.04988438e+05,
       -1.02988458e+05, -1.01588471e+05, -9.89104979e+04, -9.49885384e+04,
       -8.49907997e+04, -7.94888296e+04, -7.59887615e+04, -7.54887681e+04,
       -7.49887748e+04, -7.19888156e+04, -7.09888296e+04, -6.49889179e+04,
       -6.18889667e+04, -5.99905082e+04, -5.99889979e+04, -5.89890147e+04,
       -5.39891033e+04, -5.24891314e+04, -5.09891604e+04, -5.04891703e+04,
       -4.99891802e+04, -4.94891903e+04, -4.89873084e+04, -4.78892231e+04,
       -4.69892421e+04, -4.69892421e+04, -4.49892856e+04, -4.49892856e+04,
       -4.39904181e+04, -4.18893570e+04, -4.09902019e+04, -4.04893909e+04,
       -4.04893909e+04, -

#### The Gaussian Naive-Bayes model above strongly suggests a dependence of multiple variables, as this model treats variables as completely independent of one another. 

### Random Forest model

In [224]:
clf = RandomForestClassifier(max_depth=2, random_state=1234)
clf.fit(scaled_train_X, y_train)
y_pred = clf.predict(scaled_test_X)
print(r2_score(y_test, np.delete(y_pred, 259)))

-0.683382180219249


### Hyperparameter tuning with RandomizedSearchCV

In [225]:
model = RandomForestClassifier()
distributions = dict(max_depth=range(0,15))
clf = RandomizedSearchCV(model, distributions, random_state=0)
search = clf.fit(scaled_train_X, y_train)
search.best_params_



{'max_depth': 4}

In [226]:
clf = RandomForestClassifier(max_depth=4, random_state=1234)
clf.fit(scaled_train_X, y_train)
y_pred = clf.predict(scaled_test_X)
print(r2_score(y_test, np.delete(y_pred, 259)))

0.6104759021487021


#### The RandomForestClassifier is the clear winner in this contest. 