### Modelagem

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline
pd.set_option('display.max_columns',None)
warnings.simplefilter(action='ignore')

### Carregando os dados

In [3]:
# Read the dataset from the notebook's working directory and show (rows, columns).
# NOTE(review): the file carries an 'Unnamed: 0' column (see the df.head() output),
# presumably a row index written when the CSV was exported.
df = pd.read_csv(filepath_or_buffer='zillow_dataset_final.csv')
df.shape

(30257, 21)

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,logerror,yeardifference,propertyzoningdesc_labels
0,17073783,2.5,3.0,7.0,1264.0,6111,2.0,34.303597,-119.287236,1735.0,265,61.110022,34543.0,2061,97081.0,5.0,1.0,2015.06,0.0953,29.0,428
1,17088994,1.0,2.0,7.0,777.0,6111,2.0,34.272866,-119.198911,6000.0,266,61.110015,34543.0,2061,97083.0,4.0,1.0,2581.3,0.0198,25.0,428
2,17100444,2.0,3.0,7.0,1101.0,6111,2.0,34.340801,-119.07961,6569.0,261,61.110007,26965.0,2061,97113.0,5.0,1.0,591.64,0.006,59.0,428
3,17102429,1.5,2.0,7.0,1554.0,6111,2.0,34.354313,-119.076405,7400.0,261,61.110008,26965.0,2061,97113.0,5.0,1.0,682.78,-0.0566,50.0,428
4,17109604,2.5,4.0,7.0,2415.0,6111,2.0,34.266578,-119.165392,6326.0,261,61.110014,34543.0,2061,97084.0,8.0,1.0,5886.92,0.0573,31.0,428


### Separando os conjuntos de treino e teste.

In [5]:
# Build the feature matrix X and the target vector y.
# - 'logerror' is the regression target, so it must not appear among the predictors.
# - 'parcelid' is excluded as well (presumably a parcel identifier, not a property attribute).
# - 'Unnamed: 0' is the index column carried by the CSV (0, 1, 2, ... in df.head()); it is
#   dropped so that the row number is not fed to the models as a feature.
#   errors='ignore' keeps the cell working if the CSV is later re-exported without it.
X = (
    df.drop(columns=['logerror', 'parcelid'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['logerror']

# Shapes of the feature matrix and target (re-run the notebook to refresh the outputs below).
X.shape, y.shape

((30257, 19), (30257,))

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=100,
)
# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((21179, 19), (21179,), (9078, 19), (9078,))

### Feature Scaling

In [7]:
# Columns to be scaled: every column of X_train except the id and the target, should they be present.
train_vars = [col for col in X_train.columns if col not in ('parcelid', 'logerror')]

In [8]:
# Standardise the features. The scaler learns its mean/std from the training rows only,
# and those same statistics are then applied to the test rows.
scaler = StandardScaler()
X_train[train_vars] = scaler.fit_transform(X_train[train_vars])
X_test[train_vars] = scaler.transform(X_test[train_vars])

In [9]:
# Inspect the first five rows of the scaled training features.
X_train.head(n=5)

Unnamed: 0,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,yeardifference,propertyzoningdesc_labels
22778,-0.277862,-0.028849,0.549229,-0.503454,0.488405,-0.391736,-0.56672,0.720386,-0.213732,-0.137719,0.477622,-0.185969,-1.533559,0.227684,1.926003,-0.071666,-0.322545,0.603208,-0.434284
24559,-0.277862,0.841501,-1.249831,-0.111301,-0.575955,-0.391736,0.449702,1.135856,-0.198896,-0.137719,-0.575999,0.133081,0.716534,-0.029467,-0.530556,-0.071666,-0.02835,-0.364768,1.139187
7036,-0.277862,-0.028849,0.549229,-0.560924,0.488405,-0.391736,-1.013949,0.497769,-0.209156,-0.137719,0.481937,-0.177033,-1.533559,0.192361,1.575066,-0.071666,-0.420241,0.182349,-0.434284
27619,1.729074,0.841501,0.549229,1.679303,-0.575955,-0.391736,0.253694,0.200673,-0.191636,-0.137719,-0.57203,0.383308,0.716534,-0.011099,-0.530556,-0.071666,0.057289,2.454988,-1.783942
26407,-0.277862,0.841501,0.549229,-0.341184,0.488405,-0.391736,-0.899019,0.983082,-0.209495,-0.137719,0.480757,0.31488,-1.533559,0.2112,1.926003,-0.071666,-0.477188,-0.154338,-0.434284


#### Log de métricas.

In [21]:
# Accumulator for the test-set MAE of each model, appended in the order the models are evaluated.
metricas = list()

### Linear Regression Model:

In [22]:
# Fit an ordinary least-squares baseline on the scaled training data.
# fit() returns the estimator itself, so the assignment keeps the fitted model.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg

LinearRegression()

In [23]:
# Score the linear model on the held-out rows and record its MAE.
linear_reg_pred = linear_reg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=linear_reg_pred)
metricas.append(mae)
print('Mean Absolute Error : {}'.format(mae))

Mean Absolute Error : 0.0692746833232062


### Ada Boost Regression Model:

In [24]:
# AdaBoost is a randomised learner; fix the seed (same value as the train/test split)
# so that the reported MAE can be reproduced on a fresh kernel.
adaboost_reg = AdaBoostRegressor(random_state=100)
adaboost_reg.fit(X_train, y_train)

AdaBoostRegressor()

In [25]:
# Score AdaBoost on the held-out rows and record its MAE.
adaboost_reg_pred = adaboost_reg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=adaboost_reg_pred)
metricas.append(mae)
print('Mean Absolute Error : {}'.format(mae))

Mean Absolute Error : 0.25583558957899893


### Gradient Boosting Regression Model:

In [26]:
# Gradient boosting with default hyper-parameters. The seed (same value as the
# train/test split) makes the fitted model and its MAE reproducible across runs.
gb_reg = GradientBoostingRegressor(random_state=100)
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor()

In [27]:
# Score gradient boosting on the held-out rows and record its MAE.
gb_reg_pred = gb_reg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=gb_reg_pred)
metricas.append(mae)
print('Mean Absolute Error : {}'.format(mae))

Mean Absolute Error : 0.06987745229611028


### Decision Tree Regressor:

In [28]:
# A single, fully grown regression tree. scikit-learn permutes candidate features at each
# split, so ties can be broken differently between runs; the seed (same value as the
# train/test split) pins the result.
tree_reg = DecisionTreeRegressor(random_state=100)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [29]:
# Score the decision tree on the held-out rows and record its MAE.
tree_reg_pred = tree_reg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=tree_reg_pred)
metricas.append(mae)
print('Mean Absolute Error : {}'.format(mae))

Mean Absolute Error : 0.11150062238378498


### Random Forest Regression Model:

In [30]:
# Random forest of 400 trees.
# - random_state (same value as the train/test split) makes the bootstrap samples and
#   feature draws, and therefore the reported MAE, reproducible.
# - n_jobs=-1 builds the trees on all available cores; it only affects training time.
forest_reg = RandomForestRegressor(n_estimators=400, random_state=100, n_jobs=-1)
forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=400)

In [31]:
# Score the random forest on the held-out rows and record its MAE.
forest_reg_pred = forest_reg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=forest_reg_pred)
metricas.append(mae)
print('Mean Absolute Error : {}'.format(mae))

Mean Absolute Error : 0.07547945843973046


#### Verificando a melhor métrica.

In [32]:
# MAE values in the order they were appended by the evaluation cells:
# Linear Regression, AdaBoost, Gradient Boosting, Decision Tree, Random Forest.
# The list only grows via append(), so re-running an evaluation cell without first
# re-running `metricas = []` leaves extra entries here.
metricas

[0.0692746833232062,
 0.25583558957899893,
 0.06987745229611028,
 0.11150062238378498,
 0.07547945843973046]

In [34]:
# Lowest test MAE among the evaluated models. This returns only the value, not the model;
# match it against the positions in `metricas` to see which estimator produced it.
min(metricas)

0.0692746833232062

### Dump model to disk

In [22]:
# linear_reg was trained on StandardScaler-transformed features, so the fitted scaler is
# persisted next to it: whoever loads model.pkl must apply scaler.pkl to raw inputs
# before calling predict().
joblib.dump(scaler, "scaler.pkl")
# The linear regression is the model being exported (its MAE equals min(metricas) above).
joblib.dump(linear_reg, "model.pkl")

['model.pkl']