# Importing Initial Libraries

In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

## Read Data

In [2]:
df = pd.read_csv('Data/Input/train.csv')
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Break Into Training and Target Set

In [3]:
# Break into training/target set
target_column_name = 'median_house_value'
traincols = []
for column in df.columns:
    if (column != target_column_name):
        traincols.append(column)


x_train = df[traincols]
y_train = df[target_column_name]
x_train.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001


## One Hot Encode Categorical Columns

In [4]:
# Union Function
def union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2))
    return final_list

# Check DType and Get Categorical Columns
categorical_columns = []
for column in df.columns:
    if (df[column].dtype != np.float64 and df[column].dtype != np.int64 and df[column].dtype != np.int32 and df[column].dtype != np.float32):
        categorical_columns.append(column)

# One Hot Encode (Remove Last)
encoded_columns = []
for column in categorical_columns:
    prefix = 'one_hot_encoded_' + column + '_'
    x_train = pd.get_dummies(x_train, columns=[column] , prefix=prefix)
    encoded_columns = union(encoded_columns, [i for i in list(x_train.columns) if prefix in i][:-1])
    
not_encoded_columns = [i for i in list(x_train.columns) if "one_hot_encoded_" not in i]
x_train = x_train[union(encoded_columns, not_encoded_columns)]

# Fix for XGBoost
for column in list(x_train.columns):
    if ("<" in column):
        x_train.rename(index=str, columns={column: column.replace("<", "")}, inplace=True)
x_train.describe()

Unnamed: 0,total_rooms,households,housing_median_age,population,one_hot_encoded_ocean_proximity__NEAR BAY,total_bedrooms,median_income,one_hot_encoded_ocean_proximity__1H OCEAN,one_hot_encoded_ocean_proximity__ISLAND,longitude,one_hot_encoded_ocean_proximity__INLAND,latitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,2635.763081,499.53968,28.639486,1425.476744,0.11095,537.870553,3.870671,0.442636,0.000242,-119.569704,0.317393,35.631861
std,2181.615252,382.329753,12.585558,1132.462122,0.314077,421.38507,1.899822,0.49671,0.015563,2.003532,0.465473,2.135952
min,2.0,1.0,1.0,3.0,0.0,1.0,0.4999,0.0,0.0,-124.35,0.0,32.54
25%,1447.75,280.0,18.0,787.0,0.0,296.0,2.5634,0.0,0.0,-121.8,0.0,33.93
50%,2127.0,409.0,29.0,1166.0,0.0,435.0,3.5348,0.0,0.0,-118.49,0.0,34.26
75%,3148.0,605.0,37.0,1725.0,0.0,647.0,4.74325,1.0,0.0,-118.01,1.0,37.71
max,39320.0,6082.0,52.0,35682.0,1.0,6445.0,15.0001,1.0,1.0,-114.31,1.0,41.95


## Impute Null Values

In [5]:
from sklearn.impute import SimpleImputer as Imputer

imputer = Imputer(strategy='mean', copy=False)
x_train = pd.DataFrame(data=imputer.fit_transform(x_train), columns=list(x_train.columns))
x_train.describe()

Unnamed: 0,total_rooms,households,housing_median_age,population,one_hot_encoded_ocean_proximity__NEAR BAY,total_bedrooms,median_income,one_hot_encoded_ocean_proximity__1H OCEAN,one_hot_encoded_ocean_proximity__ISLAND,longitude,one_hot_encoded_ocean_proximity__INLAND,latitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,2635.763081,499.53968,28.639486,1425.476744,0.11095,537.870553,3.870671,0.442636,0.000242,-119.569704,0.317393,35.631861
std,2181.615252,382.329753,12.585558,1132.462122,0.314077,419.266592,1.899822,0.49671,0.015563,2.003532,0.465473,2.135952
min,2.0,1.0,1.0,3.0,0.0,1.0,0.4999,0.0,0.0,-124.35,0.0,32.54
25%,1447.75,280.0,18.0,787.0,0.0,297.0,2.5634,0.0,0.0,-121.8,0.0,33.93
50%,2127.0,409.0,29.0,1166.0,0.0,438.0,3.5348,0.0,0.0,-118.49,0.0,34.26
75%,3148.0,605.0,37.0,1725.0,0.0,643.25,4.74325,1.0,0.0,-118.01,1.0,37.71
max,39320.0,6082.0,52.0,35682.0,1.0,6445.0,15.0001,1.0,1.0,-114.31,1.0,41.95


### Feature Scaling

In [6]:
from sklearn.preprocessing import RobustScaler as scaler

sc_X = scaler(copy=False)
x_train[not_encoded_columns] = sc_X.fit_transform(x_train[not_encoded_columns])
x_train.describe()

Unnamed: 0,total_rooms,households,housing_median_age,population,one_hot_encoded_ocean_proximity__NEAR BAY,total_bedrooms,median_income,one_hot_encoded_ocean_proximity__1H OCEAN,one_hot_encoded_ocean_proximity__ISLAND,longitude,one_hot_encoded_ocean_proximity__INLAND,latitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,0.299228,0.278584,-0.018974,0.276628,0.11095,0.288435,0.1540799,0.442636,0.000242,-0.284882,0.317393,0.362926
std,1.283114,1.176399,0.662398,1.207316,0.314077,1.210878,0.8715378,0.49671,0.015563,0.528636,0.465473,0.565067
min,-1.249816,-1.255385,-1.473684,-1.239872,0.0,-1.262094,-1.392252,0.0,0.0,-1.546174,0.0,-0.455026
25%,-0.3995,-0.396923,-0.578947,-0.404051,0.0,-0.40722,-0.445627,0.0,0.0,-0.873351,0.0,-0.087302
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.018608e-16,0.0,0.0,0.0,0.0,0.0
75%,0.6005,0.603077,0.421053,0.595949,0.0,0.59278,0.554373,1.0,0.0,0.126649,1.0,0.912698
max,21.875018,17.455385,1.210526,36.797441,1.0,17.348736,5.259674,1.0,1.0,1.102902,1.0,2.034392


## Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=1)
x_train.describe()

Unnamed: 0,total_rooms,households,housing_median_age,population,one_hot_encoded_ocean_proximity__NEAR BAY,total_bedrooms,median_income,one_hot_encoded_ocean_proximity__1H OCEAN,one_hot_encoded_ocean_proximity__ISLAND,longitude,one_hot_encoded_ocean_proximity__INLAND,latitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.30148,0.281826,-0.020817,0.276394,0.111252,0.291806,0.156593,0.444586,0.000303,-0.286092,0.314499,0.363014
std,1.28006,1.176737,0.662423,1.198035,0.314454,1.209562,0.867759,0.496935,0.017399,0.529704,0.46433,0.565367
min,-1.249816,-1.255385,-1.473684,-1.239872,0.0,-1.262094,-1.392252,0.0,0.0,-1.532982,0.0,-0.455026
25%,-0.397589,-0.4,-0.578947,-0.405117,0.0,-0.40722,-0.441659,0.0,0.0,-0.875989,0.0,-0.087302
50%,0.0,0.003077,0.0,-0.002132,0.0,0.0,0.004473,0.0,0.0,0.0,0.0,0.0
75%,0.601088,0.603846,0.421053,0.593817,0.0,0.595668,0.55747,1.0,0.0,0.126649,1.0,0.912698
max,21.875018,15.227692,1.210526,36.797441,1.0,16.670036,5.259674,1.0,1.0,1.102902,1.0,2.034392


# Exploratory Data Analysis

## Import Libraries

In [20]:
# Import Plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Import Evaluation Metrics
from sklearn.model_selection import cross_val_score

## Show Variable Corelation

In [8]:
# Visualise Data
from Visualisation.Visualisation import Visualisation 

# Show Variable Corelation
vs = Visualisation(x_train, y_train, target_column_name, [])
vs.show_variable_corelation()

<Figure size 1200x1000 with 2 Axes>

## Show Feature Importance

In [23]:
# Random Forest Feature Importance
clf = RandomForestRegressor(n_estimators=100)
clf.fit(x_train, y_train)
fi = clf.feature_importances_
data = [go.Bar(
        x=list(x_train.columns),
        y=list(fi)
    )]

layout = go.Layout(
    title='Random Forest Feature Importance',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Random Forest Feature Importance.html')

# Gradient Boosting Feature Importance
clf = GradientBoostingRegressor(n_estimators=100)
clf.fit(x_train, y_train)
fi = clf.feature_importances_
data = [go.Bar(
        x=list(x_train.columns),
        y=list(fi)
    )]

layout = go.Layout(
    title='Gradient Boosting Feature Importance',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Gradient Boosting Feature Importance.html')




# Model Evaluation

## Import Libraries

In [18]:
# Import Plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Import Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Import Evaluation Metrics
from sklearn.model_selection import cross_val_score


## Show Cross Validated Scores

In [28]:
cv = 10

clf = LinearRegression()
cv_train_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=cv, n_jobs=-1)
cv_test_score = cross_val_score(estimator=clf, X=x_test, y=y_test, cv=cv, n_jobs=-1)
print('Linear Regression Cross Validated Score => Train: ' + str(np.mean(cv_train_score)) + " Test: " + str(np.mean(cv_test_score)))

clf = RandomForestRegressor()
cv_train_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=cv, n_jobs=-1)
cv_test_score = cross_val_score(estimator=clf, X=x_test, y=y_test, cv=cv, n_jobs=-1)
print('Random Forest Cross Validated Score => Train: ' + str(np.mean(cv_train_score)) + " Test: " + str(np.mean(cv_test_score)))

clf = KNeighborsRegressor()
cv_train_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=cv, n_jobs=-1)
cv_test_score = cross_val_score(estimator=clf, X=x_test, y=y_test, cv=cv, n_jobs=-1)
print('KNN Cross Validated Score => Train: ' + str(np.mean(cv_train_score)) + " Test: " + str(np.mean(cv_test_score)))

clf = GradientBoostingRegressor()
cv_train_score = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=cv, n_jobs=-1)
cv_test_score = cross_val_score(estimator=clf, X=x_test, y=y_test, cv=cv, n_jobs=-1)
print('Gradient Boosting Cross Validated Score => Train: ' + str(np.mean(cv_train_score)) + " Test: " + str(np.mean(cv_test_score)))



Linear Regression Cross Validated Score => Train: 0.6452277373197931 Test: 0.6327545765234855
Random Forest Cross Validated Score => Train: 0.8024809604531633 Test: 0.7447408586760067
KNN Cross Validated Score => Train: 0.7088353302363746 Test: 0.6524297179765911
Gradient Boosting Cross Validated Score => Train: 0.772441578245516 Test: 0.7539394107584945
