In [8]:
# Imports for the location-feature selection experiments (RFE, baselines, metrics)
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error


from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns

import warnings 

# NOTE(review): this installs a blanket "ignore" filter, so any warning raised
# after this point in the session is hidden from the notebook output.
warnings.filterwarnings("ignore")

## Updates

I now have a data frame in which the geospatial data has been geohashed and encoded as dummy (0/1) columns. I need to find which regions from each (top 50) are the most important in determining the price.

To do this, I will use recursive feature elimination (`RFE` from sklearn) and set the number of features to keep (`n_features_to_select`) to 100. Let's see what pops out!


In [9]:
# Read the three input tables in one pass; the first CSV column of each file
# becomes that frame's index.
df_hbd, df_geo, df_KC = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../cleanedData/hdb_clusters.csv",
        "../cleanedData/geohashed_full.csv",
        "../raw/kc_house_data_train.csv",
    )
)

In [10]:
# Target vector: the `price` column of the raw training table
y = df_KC["price"]

In [11]:
# Feature matrix: geohash columns and cluster columns side by side, aligned on index
X = pd.concat(objs=[df_geo, df_hbd], axis="columns")

In [12]:
# Peek at the first five rows of the combined feature matrix
X.head(n=5)

Unnamed: 0,c22uz,c22vk,c22vm,c22vn,c22vp,c22vq,c22vr,c22vs,c22vt,c22vu,...,107,108,109,110,111,112,113,114,115,116
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Hold out 20% of the rows for testing; the fixed seed (arbitrary value) keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Ordinary least-squares regressor with scikit-learn's default settings
lr = LinearRegression()

In [15]:
# Fit the regressor on the training split (the fitted estimator is echoed as cell output)
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Predicted prices for the held-out rows
y_hat = lr.predict(X=X_test)

In [19]:
# mean_squared_error returns the *squared* error, so take the root before storing
# it: the variable named RMSE now really holds the RMSE (same units as price).
# NOTE(review): a held-out RMSE around 1e17 is many orders of magnitude above the
# in-sample errors reported further down; this presumably comes from unregularised
# least squares on collinear dummy columns -- confirm before trusting any
# test-set number from this model.
RMSE = mean_squared_error(y_test, y_hat) ** 0.5
RMSE

2.1360202183274458e+17

In [50]:
# Recursive feature elimination on the training split: drop one feature per round
# until 100 remain, then tabulate every column with its rank (rank 1 = kept).
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=100, step=1).fit(X_train, y_train)
selected_rfe_features = pd.DataFrame(
    {
        "Feature": [*X_train.columns],
        "Ranking": rfe.ranking_,
    }
)
selected_rfe_features.sort_values(by="Ranking").head(100)

Unnamed: 0,Feature,Ranking
0,c22uz,1
83,c23m3,1
181,c23r3,1
80,c23kr,1
79,c23kc,1
...,...,...
35,c23hr,1
34,c23hg,1
51,c23j9,1
33,c23hf,1


In [51]:
# Keep the ranking table ordered best-first (lowest rank number on top)
selected_rfe_features = selected_rfe_features.sort_values("Ranking", ascending=True)

In [59]:
# Columns that RFE kept (rank 1). Selecting by rank instead of by position means
# the result no longer depends on the ranking table having been sorted first, nor
# on a second hard-coded copy of the feature count.
# The name `top_50` is kept because the model cells below read it; it holds
# however many features RFE was asked to keep (100 in this notebook), not 50.
kept_features = selected_rfe_features.loc[selected_rfe_features["Ranking"] == 1, "Feature"]
top_50 = X_train[kept_features]
top_50

Unnamed: 0,c22uz,c23m3,c23r3,c23kr,c23kc,c23k9,c23k8,c23k2,c23jy,c23jx,...,c23hy,c23hx,c23hw,c23hv,c23j4,c23hr,c23hg,c23j9,c23hf,c23hu
2498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10932,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15638,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15099,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15560,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Linear model on every location column.
# NOTE: the error below is computed on the same rows the model was fitted on, so
# it is an in-sample figure, not an estimate of out-of-sample performance.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_hat = lr.predict(X_train)
# Store the root of the MSE so the variable's content matches its name.
RMSE_all_locations = mean_squared_error(y_train, y_hat) ** 0.5
RMSE_all_locations

282520.3005394987

In [61]:
# Linear model restricted to the RFE-selected columns held in `top_50`.
# NOTE: fitted and scored on the same training rows (in-sample error).
lr = LinearRegression()
lr.fit(top_50, y_train)
y_hat = lr.predict(top_50)
# Store the root of the MSE so the variable's content matches its name.
RMSE_top_50 = mean_squared_error(y_train, y_hat) ** 0.5
RMSE_top_50

330155.5128896585

In [62]:
# Naive baseline that always predicts the mean of the training target
# ("mean" is DummyRegressor's default strategy, spelled out here for the reader)
dummy = DummyRegressor(strategy="mean")

In [65]:
# Baseline error on the training rows, for comparison with the linear models.
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_train)
# Store the root of the MSE so `dummy_rmse` really is an RMSE.
dummy_rmse = mean_squared_error(y_train, y_pred) ** 0.5
dummy_rmse

375388.594116481

In [66]:
# Same baseline refitted on the reduced feature set. DummyRegressor ignores the
# feature values, so this is expected to match the figure from the full feature set.
dummy.fit(top_50, y_train)
y_pred = dummy.predict(top_50)
# Store the root of the MSE so `dummy_rmse` really is an RMSE.
dummy_rmse = mean_squared_error(y_train, y_pred) ** 0.5
dummy_rmse

375388.594116481

In [69]:
# Sanity check: the cluster table and the target must have the same number of rows
df_hbd.shape[0] == y.shape[0]

True

In [71]:
# Same recursive elimination, restricted to the density-cluster columns and
# fitted on the full data set (no train/test split in this cell).
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=100, step=1).fit(df_hbd, y)
selected_rfe_features = pd.DataFrame(
    {
        "Feature": [*df_hbd.columns],
        "Ranking": rfe.ranking_,
    }
)
selected_rfe_features.sort_values(by="Ranking")

Unnamed: 0,Feature,Ranking
0,-1,1
80,79,1
79,78,1
78,77,1
77,76,1
...,...,...
117,116,1
34,33,1
30,29,1
33,32,1


In [73]:
# Linear model on the cluster columns only, fitted and scored on the full data
# set (in-sample error).
lr = LinearRegression()
lr.fit(df_hbd, y)
y_hat = lr.predict(df_hbd)
# Use a dedicated name and store the root of the MSE: the original reused
# `RMSE_top_50`, which overwrote the score of the RFE-selected geohash model
# with the (squared) error of this unrelated cluster-only model.
RMSE_hbd_clusters = mean_squared_error(y, y_hat) ** 0.5
RMSE_hbd_clusters

318320.62473491067

### Decision

I do not think that it is wise to use a k clustering algorithm. So far, I am happy with using the binning method, so I will continue to use the current bins. Let's test the full model with the best categories tomorrow. For the rest of the night, I will apply feature normalization, one variable at a time, until I have the tightest model possible! I will normalize all of my variables of interest.

I will move ahead with the geospatial binning!