In [1]:
#### Projeto: Desafio ZAP
#### Programa para Treinar o Modelo do ZAP com hiperparâmetros iniciais
#### Autor: Rodolfo Bugarin

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
#
# Carregar os Dataframes
#

In [4]:
# Load the complete feature frame and label frame from local pickle files.
# NOTE: unpickling executes arbitrary code; only load files you trust.
X, y = (
    pd.read_pickle(path)
    for path in ('df_X_features.pickle', 'df_y_label.pickle')
)

In [5]:
# Load the pre-split train/test feature frames (the split itself is not
# performed in this notebook).
X_train, X_test = map(
    pd.read_pickle,
    ('df_X_train_features.pickle', 'df_X_test_features.pickle'),
)

In [6]:
# Load the train/test labels that pair with X_train / X_test.
y_train, y_test = [
    pd.read_pickle(name)
    for name in ('df_y_train_label.pickle', 'df_y_test_label.pickle')
]

In [7]:
# Scaling

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling statistics from the training features only, so the test
# set does not influence the scaling. The fitted scaler's repr is this cell's
# displayed output.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
# Standardize both splits with the scaler fitted on the training data.
# NOTE: this overwrites X_train / X_test with their scaled versions, so
# re-running this cell without reloading the data scales them a second time.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [10]:
# Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline with scikit-learn's default settings.
lr = LinearRegression()

In [12]:
# Fit the linear baseline on the scaled training features.
modelo_lr = lr.fit(X=X_train, y=y_train)

In [13]:
# Predict on the held-out (scaled) test features.
predictions = modelo_lr.predict(X=X_test)

In [14]:
# Score the linear model on the test split: compute both metrics, then report.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("MSE: %.4f" % mse)
print("R2: %.4f" % r2)

MSE: 291835697493.7581
R2: 0.7119


In [15]:
import statsmodels.api as sm

In [16]:
# Rebuild a labelled design matrix for statsmodels from the scaled training
# features, using the column names of X.
# NOTE(review): assumes X_train has exactly the columns of X, in the same order.
cols = list(X.columns)
# Fresh 0..n-1 index on the labels so they align row-by-row with the new frame.
Y_train_statsmodels = pd.DataFrame(y_train).reset_index(drop=True)
# add_constant supplies the intercept column.
X_train_statsmodels = sm.add_constant(pd.DataFrame(X_train, columns=cols))

In [17]:
# Fit OLS with statsmodels to obtain per-coefficient statistics (p-values etc.).
modelo_lr_statsmodels = sm.OLS(endog=Y_train_statsmodels, exog=X_train_statsmodels).fit()

In [18]:
# Display the regression summary of the OLS model currently held in
# modelo_lr_statsmodels.
modelo_lr_statsmodels.summary()

0,1,2,3
Dep. Variable:,Preco_Venda,R-squared:,0.621
Model:,OLS,Adj. R-squared:,0.621
Method:,Least Squares,F-statistic:,5606.0
Date:,"Thu, 05 Sep 2019",Prob (F-statistic):,0.0
Time:,18:05:29,Log-Likelihood:,-759750.0
No. Observations:,51316,AIC:,1520000.0
Df Residuals:,51300,BIC:,1520000.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.636e+05,2874.429,265.667,0.000,7.58e+05,7.69e+05
bathrooms,-3.007e+04,4700.838,-6.397,0.000,-3.93e+04,-2.09e+04
bedrooms,-1.376e+05,4131.513,-33.300,0.000,-1.46e+05,-1.29e+05
parkingspaces,1.656e+05,5117.097,32.355,0.000,1.56e+05,1.76e+05
pricinginfos_monthlycondofee,6.834e+04,3269.293,20.904,0.000,6.19e+04,7.47e+04
pricinginfos_rentaltotalprice,-7.217e-12,1.59e-12,-4.544,0.000,-1.03e-11,-4.1e-12
pricinginfos_yearlyiptu,1.26e+04,2960.012,4.257,0.000,6799.029,1.84e+04
suites,2.291e+04,5082.838,4.507,0.000,1.29e+04,3.29e+04
totalareas,-1455.0624,2906.076,-0.501,0.617,-7151.002,4240.877

0,1,2,3
Omnibus:,138133.617,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20775083575.73
Skew:,32.189,Prob(JB):,0.0
Kurtosis:,3119.434,Cond. No.,1.6e+17


In [19]:
# Build the statsmodels design matrix for the test split and predict with the
# fitted OLS model.
X_test_statsmodels = pd.DataFrame(X_test, columns=cols)
# has_constant='add' forces the intercept column. The default ('skip') would
# silently omit it if any test feature happened to be constant, leaving the
# test matrix with different columns than the training matrix.
X_test_statsmodels = sm.add_constant(X_test_statsmodels, has_constant='add')
predictions = modelo_lr_statsmodels.predict(X_test_statsmodels)

In [20]:
# Score the statsmodels OLS predictions on the test split.
mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_squared_error, r2_score)
)
print("MSE: %.4f" % mse)
print("R2: %.4f" % r2)

MSE: 291844314888.1086
R2: 0.7119


In [21]:
# Backward elimination: repeatedly fit OLS and drop the feature with the
# largest p-value until every remaining feature is significant at the
# threshold below. On exit, modelo_lr_statsmodels is the model fitted on the
# surviving features, and X_train_statsmodels / X_test_statsmodels hold only
# those features (plus the intercept column).
P_VALUE_THRESHOLD = 0.05

# Derive the candidate list from the design matrix itself rather than from
# X.columns, so re-running this cell after features were already pruned does
# not fail with a length mismatch.
cols = [c for c in X_train_statsmodels.columns if c != 'const']

while True:
    # Fit first, so the stored model always matches the current column set,
    # even when every feature ends up eliminated.
    modelo_lr_statsmodels = sm.OLS(Y_train_statsmodels, X_train_statsmodels).fit()

    # Select p-values by name instead of by position; the intercept is never a
    # candidate for removal.
    p = modelo_lr_statsmodels.pvalues.drop('const', errors='ignore')

    # Series.max() skips NaN just like idxmax(), keeping the two consistent.
    # An empty or all-NaN series yields NaN, which fails the comparison and
    # ends the loop.
    pmax = p.max()
    if not pmax > P_VALUE_THRESHOLD:
        break

    feature_with_p_max = p.idxmax()
    cols.remove(feature_with_p_max)
    # Drop from both splits so the test matrix stays aligned with the model.
    X_train_statsmodels = X_train_statsmodels.drop(columns=[feature_with_p_max])
    X_test_statsmodels = X_test_statsmodels.drop(columns=[feature_with_p_max])

selected_features_BE = cols
print(selected_features_BE)

['bathrooms', 'bedrooms', 'parkingspaces', 'pricinginfos_monthlycondofee', 'pricinginfos_yearlyiptu', 'suites', 'usableareas', 'Zona_Centro', 'Zona_Leste', 'Zona_Norte', 'Zona_Oeste', 'Distancia']


In [22]:
# Display the summary of the model left in modelo_lr_statsmodels by the
# backward-elimination cell.
modelo_lr_statsmodels.summary()

0,1,2,3
Dep. Variable:,Preco_Venda,R-squared:,0.621
Model:,OLS,Adj. R-squared:,0.621
Method:,Least Squares,F-statistic:,7008.0
Date:,"Thu, 05 Sep 2019",Prob (F-statistic):,0.0
Time:,18:05:29,Log-Likelihood:,-759750.0
No. Observations:,51316,AIC:,1520000.0
Df Residuals:,51303,BIC:,1520000.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.636e+05,2874.448,265.666,0.000,7.58e+05,7.69e+05
bathrooms,-3.015e+04,4700.550,-6.413,0.000,-3.94e+04,-2.09e+04
bedrooms,-1.376e+05,4131.314,-33.315,0.000,-1.46e+05,-1.3e+05
parkingspaces,1.656e+05,5116.738,32.361,0.000,1.56e+05,1.76e+05
pricinginfos_monthlycondofee,6.841e+04,3268.814,20.928,0.000,6.2e+04,7.48e+04
pricinginfos_yearlyiptu,1.265e+04,2959.907,4.273,0.000,6845.982,1.84e+04
suites,2.298e+04,5082.565,4.521,0.000,1.3e+04,3.29e+04
usableareas,7.165e+05,5280.541,135.680,0.000,7.06e+05,7.27e+05
Zona_Centro,-5.75e+04,3417.695,-16.823,0.000,-6.42e+04,-5.08e+04

0,1,2,3
Omnibus:,138125.043,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20767335602.387
Skew:,32.184,Prob(JB):,0.0
Kurtosis:,3118.853,Cond. No.,4.43


In [23]:
# Display the fitted coefficients (intercept first) of the current OLS model.
modelo_lr_statsmodels.params

const                           763641.907086
bathrooms                       -30146.489783
bedrooms                       -137632.840001
parkingspaces                   165583.231879
pricinginfos_monthlycondofee     68411.040126
pricinginfos_yearlyiptu          12647.429323
suites                           22980.522463
usableareas                     716464.605566
Zona_Centro                     -57495.972145
Zona_Leste                      -36232.022849
Zona_Norte                      -36275.064654
Zona_Oeste                       -6241.465685
Distancia                      -100558.524276
dtype: float64

In [24]:
# Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
# Depth- and split-size-limited regression tree.
# random_state is fixed (same seed as the boosting and MLP cells) so repeated
# runs of the notebook produce the same tree and the same metrics.
dt = DecisionTreeRegressor(max_depth=50, min_samples_split=20, random_state=42)

In [27]:
# Fit the decision tree on the scaled training features.
modelo_dr = dt.fit(X=X_train, y=y_train)

In [28]:
# Predict with the decision tree and score it on the test split.
predictions = modelo_dr.predict(X=X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("MSE: %.4f" % mse)
print("R2: %.4f" % r2)

MSE: 147415641867.3716
R2: 0.8545


In [29]:
# Random Forest

In [30]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [31]:
# Random forest of 200 unbounded-depth trees.
# max_features is left at its default: for regressors the former 'auto' value
# meant "use all features", which is also what the default does, and 'auto'
# itself is rejected by recent scikit-learn releases.
# random_state is fixed (same seed as the boosting and MLP cells) so the
# bootstrap samples, and therefore the reported metrics, are reproducible.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=20,
    random_state=42,
)

In [32]:
# Fit the random forest on the scaled training features.
modelo_rf = rf.fit(X=X_train, y=y_train)

In [33]:
# Predict with the random forest and score it on the test split.
predictions = modelo_rf.predict(X=X_test)
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)
print("MSE: %.4f" % mse)
print("R2: %.4f" % r2)

MSE: 109489178245.6504
R2: 0.8919


In [34]:
# Boosting

In [35]:
from sklearn import ensemble
# Gradient-boosting hyperparameters.
# The loss is intentionally left at its default (least squares): the old
# spelling 'ls' is rejected by recent scikit-learn releases and the new
# spelling 'squared_error' by old ones, while the default is the same loss on
# both.
params = {'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 50,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingRegressor(**params, random_state=42)

In [36]:
# Fit the gradient-boosting regressor, then predict on the test split.
modelo_clf = clf.fit(X=X_train, y=y_train)
predictions = modelo_clf.predict(X=X_test)

In [37]:
# Score the gradient-boosting predictions on the test split.
# mean_squared_error and r2_score both come from the notebook's import cell;
# the duplicate local import of mean_squared_error was redundant and is removed.
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

MSE: 101635595220.5466
R2: 0.8997


In [38]:
# Neural Network

In [39]:
from sklearn.neural_network import MLPRegressor

In [40]:
# Multi-layer perceptron: three hidden layers of 18 ReLU units each, seeded
# for reproducibility, with a raised iteration cap.
mlp = MLPRegressor(
    hidden_layer_sizes=(18, 18, 18),
    activation='relu',
    max_iter=2000,
    random_state=42,
)

In [41]:
# Train the network in place (fit returns the estimator itself) and predict
# on the test split.
predictions = mlp.fit(X_train, y_train).predict(X_test)

In [42]:
# Score the neural-network predictions on the test split.
# mean_squared_error and r2_score both come from the notebook's import cell;
# the duplicate local import of mean_squared_error was redundant and is removed.
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

MSE: 191226893046.4591
R2: 0.8112


In [43]:
#
# Salvar os modelos 
#

In [44]:
# Persist the fitted linear-regression model. The context manager closes the
# file even if pickling raises.
# NOTE(review): the models are trained on features transformed by `scaler`,
# which is not saved in the visible cells; confirm the inference code can
# reproduce the same scaling.
with open('modelo_lr.pickle', 'wb') as outfile:
    pickle.dump(modelo_lr, outfile)

In [45]:
# Persist the fitted decision-tree model. The context manager closes the file
# even if pickling raises.
with open('modelo_dr.pickle', 'wb') as outfile:
    pickle.dump(modelo_dr, outfile)

In [46]:
# Persist the fitted random-forest model. The context manager closes the file
# even if pickling raises.
with open('modelo_rf.pickle', 'wb') as outfile:
    pickle.dump(modelo_rf, outfile)

In [47]:
# Persist the fitted gradient-boosting model. The context manager closes the
# file even if pickling raises.
with open('modelo_clf.pickle', 'wb') as outfile:
    pickle.dump(modelo_clf, outfile)

In [48]:
# Persist the fitted MLP model. The context manager closes the file even if
# pickling raises.
with open('modelo_mlp.pickle', 'wb') as outfile:
    pickle.dump(mlp, outfile)