### Model Creation:

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings 
import xgboost 

from math import sqrt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso  
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_val_score 

%matplotlib inline 
pd.set_option('display.max_columns',None)
warnings.simplefilter(action='ignore')



### Load Data

In [2]:
df = pd.read_csv('final_zillow_dataset.csv')
df.shape

(70260, 21)

In [3]:
df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,logerror,yeardifference,propertyzoningdesc_labels
0,10726315,3.0,3.0,4.0,2445.0,6037.0,2.0,34.1843,-118.657,63878.0,266.0,60.371352,12447.0,3101.0,96389.0,0.0,1.0,7170.22,0.0383,33.0,627
1,10727091,2.0,3.0,7.0,1160.0,6037.0,2.0,34.188121,-118.646361,7688.0,261.0,60.371352,12447.0,3101.0,96342.0,0.0,1.0,5421.96,0.003,55.0,638
2,10730788,2.0,4.0,7.0,1570.0,6037.0,2.0,34.188446,-118.616724,11308.0,261.0,60.371351,12447.0,3101.0,96342.0,0.0,1.0,5097.78,-0.002,56.0,638
3,10735394,3.0,4.0,4.0,2863.0,6037.0,2.0,34.171439,-118.646883,16376.0,261.0,60.371373,12447.0,3101.0,96389.0,0.0,1.0,7475.21,0.0129,52.0,632
4,10743512,3.0,2.0,4.0,1394.0,6037.0,2.0,34.15289,-118.791494,77543.0,269.0,60.378003,34278.0,3101.0,96385.0,0.0,1.0,5550.36,0.063,17.0,769


### Train Test Split

In [4]:
X = df.drop('logerror',axis =1)
y = df['logerror']
new_df = df.copy()
X.shape ,y.shape

((70260, 20), (70260,))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state= 100)
X_train.shape ,y_train.shape ,X_test.shape, y_test.shape

((52695, 20), (52695,), (17565, 20), (17565,))

### Feature Scalling

In [6]:
train_vars = [ var for var in X_train.columns if var not in ['parcelid','logerror']]
len(train_vars)

19

In [7]:
scaler = StandardScaler()
scaler.fit(X_train[train_vars])
X_train[train_vars] =scaler.transform(X_train[train_vars])
X_test[train_vars] =scaler.transform(X_test[train_vars])

In [8]:
X_train.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,yeardifference,propertyzoningdesc_labels
15602,10934030,1.039614,1.077319,-1.450606,1.368148,-0.672408,-0.526789,0.533872,-0.840819,-0.282457,-0.50568,-0.703341,-0.813301,0.672408,-1.208086,-0.439343,-0.081247,-0.954568,1.254891,-0.470974
27292,12116223,-1.441419,0.059583,0.629159,0.535918,-0.672408,-0.526789,0.674431,-0.036819,-0.267228,-0.50568,-0.671488,0.974128,0.672408,-0.60415,-0.439343,-0.081247,-0.737606,2.552634,1.68683
37134,11533593,-0.200903,0.059583,0.629159,-0.728171,-0.672408,1.921281,0.016094,-1.104821,-0.334003,-0.50568,-0.690187,-0.813301,0.672408,-1.213056,-0.439343,-0.081247,1.636555,0.779052,-0.470974
19660,13078527,-0.200903,1.077319,0.629159,-0.591716,-0.672408,-0.526789,0.122551,0.902323,-0.274704,-0.50568,-0.677385,0.333907,0.672408,-0.032524,-0.439343,-0.081247,-0.229807,0.173439,2.541898
19926,12163237,-0.200903,-0.958153,0.629159,-0.860129,-0.672408,-0.526789,0.764055,-0.126408,-0.311506,-0.50568,-0.671698,0.103886,0.672408,-0.671254,-0.439343,-0.081247,-0.688036,1.903762,0.161445


### Dropping Parcel ID

In [9]:
X_train_new = X_train.copy()
X_test_new = X_test.copy()

X_train.drop(columns="parcelid", axis=1, inplace=True)
X_test.drop(columns="parcelid", axis=1, inplace=True)

### Linear Regression Model :

In [10]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [11]:
linear_reg_pred = linear_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, linear_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, linear_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, linear_reg_pred))))
print()

Mean Absolute Error : 0.05271187478401562

Mean Squared Error : 0.007219607452279518

Root Mean Squared Error : 0.084968273209943



### Elastic Net Model

In [12]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)

In [13]:
elastic_net_pred = elastic_net.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, elastic_net_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, elastic_net_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, elastic_net_pred))))
print()

Mean Absolute Error : 0.052772037640842616

Mean Squared Error : 0.007263564946868809

Root Mean Squared Error : 0.08522655071554175



### Ridge Regression Model

In [14]:
ridge_reg = Ridge(alpha=1, solver='cholesky')
ridge_reg.fit(X_train, y_train)

In [15]:
ridge_reg_pred = ridge_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, ridge_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, ridge_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, ridge_reg_pred))))
print()


Mean Absolute Error : 0.05271298978108195

Mean Squared Error : 0.0072190878626148345

Root Mean Squared Error : 0.08496521560388601



### Lasso Regression Model

In [16]:
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train, y_train)


In [17]:
lasso_reg_pred = lasso_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, lasso_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, lasso_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, lasso_reg_pred))))
print()

Mean Absolute Error : 0.052772037640842616

Mean Squared Error : 0.007263564946868809

Root Mean Squared Error : 0.08522655071554175



### XG Boost Regression Model:

In [18]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)

In [19]:
xgb_reg_pred = xgb_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, xgb_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, xgb_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, xgb_reg_pred))))
print()

Mean Absolute Error : 0.05406276035819799

Mean Squared Error : 0.0073732010599922

Root Mean Squared Error : 0.08586734571414328



### Ada Boost Regression Model

In [20]:
adaboost_reg  = AdaBoostRegressor()

adaboost_reg.fit(X_train, y_train)

In [21]:
adaboost_reg_pred = adaboost_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, adaboost_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, adaboost_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, adaboost_reg_pred))))
print()

Mean Absolute Error : 0.056141330612798725

Mean Squared Error : 0.007477801056104142

Root Mean Squared Error : 0.08647427973741176



### Gradient Boosting Regression Model

In [22]:
gb_reg = GradientBoostingRegressor()
gb_reg.fit(X_train, y_train)

In [23]:
gb_reg_pred = gb_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, gb_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, gb_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, gb_reg_pred))))
print()

Mean Absolute Error : 0.052556977260597104

Mean Squared Error : 0.007177849393503764

Root Mean Squared Error : 0.08472218949899586



### Decision Tree Regressor

In [24]:
tree_reg = DecisionTreeRegressor(max_depth=5)

tree_reg.fit(X_train, y_train)

In [25]:
tree_reg_pred = tree_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, tree_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, tree_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, tree_reg_pred))))
print()

Mean Absolute Error : 0.05288515507033528

Mean Squared Error : 0.007249919195504381

Root Mean Squared Error : 0.08514645732797332



### Random Forest Regression Model

In [26]:
forest_reg = RandomForestRegressor(n_estimators= 500, max_depth=6)
forest_reg.fit(X_train, y_train)

In [27]:
forest_reg_pred = forest_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, forest_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, forest_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, forest_reg_pred))))
print()

Mean Absolute Error : 0.05248971738752841

Mean Squared Error : 0.007176432030522689

Root Mean Squared Error : 0.08471382431765602



### Cross Valiadation

In [28]:
scores = cross_val_score(forest_reg,X_train,y_train, scoring="neg_mean_squared_error",cv=5)

In [29]:
forest_reg_rmse_scores = np.sqrt(-scores)
forest_reg_rmse_scores

array([0.08604817, 0.08501808, 0.08288711, 0.08323234, 0.08451529])

In [31]:
param_grid = [
    {'n_estimators': [300, 400, 500], 'max_features': [2,4,6]},
    {'bootstrap':[False], 'n_estimators': [300, 400, 500], 'max_features': [2, 4, 6]}
]

forest_regressor = RandomForestRegressor()

grid_search = GridSearchCV(forest_regressor, param_grid, scoring='neg_mean_squared_error', return_train_score=True, cv=3 )

In [32]:
grid_search.fit(X_train, y_train)

In [33]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 400}

In [34]:
grid_search.best_estimator_

In [35]:
final_predictor = grid_search.best_estimator_
final_predictor.fit(X_train, y_train)
final_pred = final_predictor.predict(X_test)

In [36]:
print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, final_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, final_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, final_pred))))
print()

Mean Absolute Error : 0.05477768568338733

Mean Squared Error : 0.0073870795448363735

Root Mean Squared Error : 0.08594812124087631



### Checking Feature Importance:

In [37]:
feature_importances = grid_search.best_estimator_.feature_importances_
attrs = list(df.select_dtypes(include=['float64','int64']))

sorted(zip(attrs, feature_importances),reverse=True)

[('unitcnt', 0.14054060976634628),
 ('taxamount', 0.09144399031827996),
 ('roomcnt', 0.0015588581086157913),
 ('regionidzip', 0.008323685702601476),
 ('regionidcounty', 0.059182397329641534),
 ('regionidcity', 0.0009690613546968794),
 ('rawcensustractandblock', 0.030877165389188655),
 ('propertylandusetypeid', 0.0792292548953435),
 ('parcelid', 0.02329583295446343),
 ('lotsizesquarefeet', 0.008578222075125288),
 ('longitude', 0.11135792184575267),
 ('logerror', 0.049258516127689944),
 ('latitude', 0.11428296542502726),
 ('heatingorsystemtypeid', 0.11521375801030016),
 ('fips', 0.009744371642109128),
 ('finishedsquarefeet12', 0.0009477864363654509),
 ('buildingqualitytypeid', 0.11640374315493218),
 ('bedroomcnt', 0.008582299977064067),
 ('bathroomcnt', 0.03020955948645643)]

### Savig Predictions:

In [39]:
model_pred = pd.DataFrame({'parcelid':X_test_new.parcelid, 'logerror':final_pred})
model_pred.to_csv('model-predictions.csv', index=False)
model_pred.head()

Unnamed: 0,parcelid,logerror
11056,11923769,0.010713
20399,14196198,0.003561
51034,12408039,0.013264
41560,10931254,0.013991
69896,14665147,-0.036234
