In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
path = "data/kc_house_data.csv"

In [2]:
df = pd.read_csv(path, parse_dates=["date"])

In [3]:
df.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [4]:
def get_features(data):
    corr = data.corr()['price']
    for lst in corr:
        if lst > 0.4:
            print(lst)
        else:
            pass

In [5]:
dd = get_features(df)

1.0
0.525137505414
0.702035054612
0.66743425602
0.605567298356
0.58537890358


In [6]:
dd

In [7]:
import numpy as np 
corr = df.corr()["price"]
corr[np.argsort(corr, axis=0)[::-1]] 

price            1.000000
sqft_living      0.702035
grade            0.667434
sqft_above       0.605567
sqft_living15    0.585379
bathrooms        0.525138
view             0.397293
sqft_basement    0.323816
bedrooms         0.308350
lat              0.307003
waterfront       0.266369
floors           0.256794
yr_renovated     0.126434
sqft_lot         0.089661
sqft_lot15       0.082447
yr_built         0.054012
condition        0.036362
long             0.021626
id              -0.016762
zipcode         -0.053203
Name: price, dtype: float64

In [8]:
not_include = ['id', 'zipcode', 'long', 'condition']
features = list(df.columns)

In [9]:
rms = [1,2,4,5,6,7,8]

In [10]:
def remove(lst):
    x = lst
    for i in x:    
        if i > 6:
            print(i)
        else:
            pass

In [11]:
remove(rms)

7
8


In [12]:
# we can use the important features to train our model
X = df[['sqft_above', 'sqft_living', 'grade', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'floors']]

In [13]:
# assign price column in our target output
y = df['price'] 

In [14]:
# our X only holds variables that are related to 'price'
X.head()

Unnamed: 0,sqft_above,sqft_living,grade,sqft_living15,bathrooms,view,sqft_basement,bedrooms,lat,waterfront,floors
0,1180,1180,7,1340,1.0,0,0,3,47.5112,0,1.0
1,2170,2570,7,1690,2.25,0,400,3,47.721,0,2.0
2,770,770,6,2720,1.0,0,0,2,47.7379,0,1.0
3,1050,1960,7,1360,3.0,0,910,4,47.5208,0,1.0
4,1680,1680,8,1800,2.0,0,0,3,47.6168,0,1.0


In [15]:
# Splitting our data in subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


In [16]:
# using 'RobustScaler' makes our data scaled in a robust way... depends on your target values
#from sklearn.preprocessing import RobustScaler
#scaler = RobustScaler()
#robust_scaled_df = scaler.fit_transform(X_train)
#robust_scaled_df = pd.DataFrame(robust_scaled_df)

In [17]:
#robust_scaled_df.shape

In [18]:
#robust_scaled_Test = scaler.fit_transform(X_test)
#robust_scaled_Test = pd.DataFrame(robust_scaled_Test)

In [19]:
#robust_scaled_Test.shape

In [20]:
X_train.head()

Unnamed: 0,sqft_above,sqft_living,grade,sqft_living15,bathrooms,view,sqft_basement,bedrooms,lat,waterfront,floors
1468,1140,1390,7,1630,1.5,0,250,4,47.7224,0,1.0
15590,1450,1450,7,1440,1.5,0,0,3,47.7725,0,1.0
18552,2860,2860,9,2980,2.75,0,0,5,47.7082,0,2.0
10535,1050,1050,7,1650,1.0,0,0,2,47.5736,0,1.0
1069,1240,1240,7,3050,1.0,0,0,2,47.597,0,1.0


In [21]:
# Random forest are an ensemble learning method for classification, regression and other tasks.
# Random decision forests correct for decision trees' habit of overfitting to their training set.
# if
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=20, oob_score=True, random_state= 0)

In [22]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [23]:
from sklearn.metrics import accuracy_score
predicted = rf.predict(X_test)
print('score estimate: ', round(rf.oob_score_, 3))
print('Mean accuracy score: ', round(rf.score(X_test, y_test), 3))
# we got score of 81% on our testing data, it is really good, but we can do better. If you notice that, we are only using 
#... one parameter tuning in our model which is "n_estimators", next we'll use gridsearch to find best parameter for this model


score estimate:  0.769
Mean accuracy score:  0.808


In [26]:
# Setting parameters. Remember each model in sklearn has different parameter tuning, for example, logistic regression has two
#... well used parameter, 'penalty' for regularization which it has two options 'l1' or 'l2' and the second parameter
#... in logistic regression is 'C' parameter.
# But here, we are using 'randomForestRegression' which has few parameters we need to choose.
n_estimators = [20, 30, 60]
criterion = ['mse', 'mae']
max_features = ['auto','sqrt']
max_depth = [4,6]
n_jobs = [-1]

# Create hyperparameter options
hyperparameters = dict(n_estimators=n_estimators, 
                       criterion=criterion,
                       max_features=max_features,
                       max_depth=max_depth,
                       n_jobs=n_jobs)

In [27]:
# Create grid search using 10-fold cross validation
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=5)

In [28]:
# Fit grid search - NOTE: it will take time to train the data, start cv=5
best_model = clf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1 
[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1, score=0.6777491369826436, total=   0.4s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1, score=0.683759241594382, total=   0.2s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1, score=0.6944302197729606, total=   0.1s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s


[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1, score=0.7152248834612933, total=   0.1s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s


[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=20, n_jobs=-1, score=0.7046532736179048, total=   0.2s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1 
[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1, score=0.6778931002658314, total=   0.3s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1 
[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1, score=0.6822471596404837, total=   0.3s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1 
[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1, score=0.6942168342189414, total=   0.3s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1 
[CV]  criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs=-1, score=0.7181088886355433, total=   0.3s
[CV] criterion=mse, max_depth=4, max_features=auto, n_estimators=30, n_jobs

[CV]  criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1, score=0.7243264489523735, total=   0.2s
[CV] criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1 
[CV]  criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1, score=0.7219476686697204, total=   0.2s
[CV] criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1 
[CV]  criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1, score=0.7277547179092365, total=   0.2s
[CV] criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1 
[CV]  criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1, score=0.7592985857974074, total=   0.2s
[CV] criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1 
[CV]  criterion=mse, max_depth=6, max_features=sqrt, n_estimators=20, n_jobs=-1, score=0.745947209712365, total=   0.2s
[CV] criterion=mse, max_depth=6, max_features=sqrt, n_estimators=30, n_jobs=

[CV]  criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1, score=0.6362595668235353, total=  15.8s
[CV] criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1 
[CV]  criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1, score=0.6209940110496017, total=  17.3s
[CV] criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1 
[CV]  criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1, score=0.6630740104894803, total=  16.7s
[CV] criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1 
[CV]  criterion=mae, max_depth=4, max_features=sqrt, n_estimators=60, n_jobs=-1, score=0.6455358598426784, total=  15.3s
[CV] criterion=mae, max_depth=6, max_features=auto, n_estimators=20, n_jobs=-1 
[CV]  criterion=mae, max_depth=6, max_features=auto, n_estimators=20, n_jobs=-1, score=0.7060471858326147, total=  20.7s
[CV] criterion=mae, max_depth=6, max_features=auto, n_estimators=20, n_jobs

KeyboardInterrupt: 

In [93]:
# View best hyperparameters - gridsearch
print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
print('Best criterion:', best_model.best_estimator_.get_params()['criterion'])
print('Best max_features:', best_model.best_estimator_.get_params()['max_features'])
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])

Best n_estimators: 10
Best criterion: mse
Best max_features: sqrt
Best max_depth: 4


In [86]:
# Predict target values
best_model.predict(X_test)

array([  434782.27662918,  1545393.07354615,   460916.06375604, ...,
         363795.8324056 ,   303319.00893556,   274419.95467592])

In [108]:
# View the accuracy score
print('Best score for data1:', best_model.best_score_) 

Best score for data1: 0.692467297353


In [103]:
from sklearn.model_selection import RandomizedSearchCV
random = RandomizedSearchCV(rf, hyperparameters, random_state=1, n_iter=60, cv=5, verbose=0, n_jobs=-1)

In [104]:
compare_random = random.fit(X_train, y_train)

In [105]:
# View best hyperparameters - randomizedsearch
print('Best n_estimators:', compare_random.best_estimator_.get_params()['n_estimators'])
print('Best criterion:', compare_random.best_estimator_.get_params()['criterion'])
print('Best max_features:', compare_random.best_estimator_.get_params()['max_features'])
print('Best max_depth:', compare_random.best_estimator_.get_params()['max_depth'])

Best n_estimators: 100
Best criterion: mse
Best max_features: auto
Best max_depth: 4


In [107]:
print('Best score for randomized search:', compare_random.best_score_)  

Best score for randomized search: 0.689046136744


In [24]:
# Note - if you not have xgboost installed in your anaconda, it won't work. Plus its expensive to run
#from xgboost.sklearn import XGBClassifier  
#from xgboost.sklearn import XGBRegressor
#xclas = XGBClassifier()  # and for classifier  
#xclas.fit(X_train, y_train)  
#xclas.predict(X_test)  

In [25]:
# normalizing using standardscaler: NOTE to scale our data, because our output in not binary (0, 1), if we're to normalize our 
#... data, it would heavly effect our score... some data are better to normalize and some are not...
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)

In [26]:
from sklearn.decomposition import PCA

In [57]:
# Note - this cell takes more time to execute, just uncomment 'sns' to see the output
pca = PCA()
pca.fit(X_train)
cpts = pd.DataFrame(pca.transform(X_train))
x_axis = np.arange(1, pca.n_components_+1)
pca_scaled = PCA()
pca_scaled.fit(X_train_scaled)
cpts_scaled = pd.DataFrame(pca.transform(X_train_scaled))
# matplotlib boilerplate goes here
#sns.pairplot(cpts_scaled)

In [54]:
rf2 = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)
rf2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [55]:
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
predicted_train = rf2.predict(X_train)
predicted_test = rf2.predict(X_test)
test_score = r2_score(y_test, predicted_test)
# finds corr between two vairble that are related
# FIND: http://statisticslectures.com/topics/spearman/
spearman = spearmanr(y_test, predicted_test)
#Pearsonr measures the strength of the linear relationship between two variables. Pearson�s r is always between -1 and 1.
pearson = pearsonr(y_test, predicted_test)

In [56]:
print('Out-of-bag R-2 score estimate: ', round(rf2.oob_score_, 3))
print('Test data R-2 score: ', round(test_score, 3))
print('Test data Spearman correlation: ', round(spearman[0], 3))
print('Test data Pearson correlation: ',round(pearson[0], 3))

Out-of-bag R-2 score estimate:  0.801
Test data R-2 score:  0.815
Test data Spearman correlation:  0.923
Test data Pearson correlation:  0.903
