In [1]:
#### Projeto: Desafio ZAP
#### Programa para Precificar o Dataset source-4-ds-test.json do ZAP com os Modelos gerados com hiperparâmetros iniciais
#### Autor: Rodolfo Bugarin

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import pickle

In [3]:
# Render floats with thousands separators and four decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

In [4]:
#
# Carregar o Dataframe
#

In [5]:
# Load the dataset to be priced. The context manager closes the file handle,
# which the bare open() call used to leak.
# NOTE: pickle.load executes arbitrary code; only load files from a trusted source.
with open('source-4-ds-test.pickle', 'rb') as arquivo_dados:
    df = pickle.load(arquivo_dados)

In [6]:
# Keep an untouched copy of the dataset (its 'id' column is needed when the CSVs
# are written). The context manager closes the file handle after loading.
with open('source-4-ds-test.pickle', 'rb') as arquivo_original:
    df_original = pickle.load(arquivo_original)

In [7]:
# Show up to 999 columns so the preview is not truncated, then display the first three rows.
pd.options.display.max_columns = 999
df.head(3)

Unnamed: 0,address_city,address_country,address_district,address_geolocation_location_lat,address_geolocation_location_lon,address_geolocation_precision,address_locationid,address_neighborhood,address_state,address_street,address_streetnumber,address_unitnumber,address_zipcode,address_zone,bathrooms,bedrooms,createdat,description,id,images,listingstatus,owner,parkingspaces,pricinginfos_businesstype,pricinginfos_monthlycondofee,pricinginfos_period,pricinginfos_price,pricinginfos_rentaltotalprice,pricinginfos_yearlyiptu,publicationtype,publisherid,suites,title,totalareas,unittypes,updatedat,usableareas
0,São Paulo,BR,,-23.5572,-46.6628,GEOMETRIC_CENTER,BR>Sao Paulo>NULL>Sao Paulo>Centro>Consolacao,Consolação,São Paulo,Rua Bela Cintra,,,1415000,Centro,1.0,1,2015-10-20T20:52:41Z,Apartamentos de 1 dormitório na Rua Bela Cintr...,89224365f8,[https://s3-sa-east-1.amazonaws.com/vr.images....,ACTIVE,False,1.0,SALE,,,,,,STANDARD,967d57ce20,0.0,Apartamento Bela Cintra,47.0,APARTMENT,2018-11-08T15:02:53.953Z,47.0
1,São Paulo,BR,,-23.5929,-46.5819,ROOFTOP,BR>Sao Paulo>NULL>Sao Paulo>Zona Leste>Quinta ...,Quinta da Paineira,São Paulo,Rua Bruno Cavalcanti Feder,100.0,,3152155,Zona Leste,0.0,2,2018-07-31T06:10:07.427Z,"Ótima localização, próximo ao shopping Central...",363731333f,[http://images.ingaiasites.com.br/AolwiwJLLpET...,ACTIVE,False,1.0,SALE,0.0,,,,0.0,STANDARD,bddebf057a,0.0,"Apartamento residencial à venda, Quinta da Pai...",55.0,APARTMENT,2018-11-08T16:10:49.374Z,55.0
2,São Paulo,,,-23.4936,-46.6385,ROOFTOP,BR>Sao Paulo>NULL>Sao Paulo>Zona Norte>Santa T...,Chora Menino,São Paulo,Rua Copacabana,313.0,,2461000,,3.0,3,2018-01-25T13:57:14.203Z,Apartamento maravilhoso com ampla sala ( abriu...,6e6283378a,[https://ssl-w08cnn0135.websiteseguro.com/mira...,ACTIVE,False,2.0,SALE,686.0,,,,,STANDARD,d7190e8f4c,1.0,"Apartamento em Santa Terezinha - São Paulo, SP",,APARTMENT,2019-02-12T18:29:26.933Z,92.0


In [8]:
#
# Tratamento de Dados
#

In [9]:
# O Desafio ZAP pede estimar um preço de venda para os apartamentos no dataset de teste 
# Desta forma garantimos termos no dataframe somente  os imóveis "apartamento" e que estejam à venda (ou ambos)

In [10]:
# Dimensions (rows, columns) before the filters below are applied.
df.shape

(16036, 37)

In [11]:
# Discard rental listings: the goal is to estimate a sale price.
indices_aluguel = df.loc[df['pricinginfos_businesstype'] == 'RENTAL'].index
df.drop(index=indices_aluguel, inplace=True)

In [12]:
# Keep apartments only: discard every listing whose unit type is anything else.
indices_nao_apartamento = df.loc[df['unittypes'] != 'APARTMENT'].index
df.drop(index=indices_nao_apartamento, inplace=True)

In [13]:
# Dimensions after the two filters; the recorded output equals the pre-filter
# shape, i.e. no rows were removed in that run.
df.shape

(16036, 37)

In [14]:
# Criar a coluna Zona com base na coluna address_locationid e address_zone

In [15]:
import re 
  
# Function that extracts the São Paulo zone from a location id
def Zona_names(Zona_name):
    """Return the zone segment of an ``address_locationid`` value.

    A location id such as ``'BR>Sao Paulo>NULL>Sao Paulo>Zona Sul>Brooklin'``
    yields ``'Zona Sul'``: the text that follows the fixed São Paulo prefix,
    up to (not including) the next ``'>'``.

    Args:
        Zona_name: the location id. Anything that is not a string (e.g. NaN
            or None for a missing value) is treated as "no zone".

    Returns:
        The zone name, or ``""`` when the value is not a string or does not
        start with the São Paulo prefix.
    """
    prefixo = 'BR>Sao Paulo>NULL>Sao Paulo>'
    # The prefix must be anchored at the start: the slice below is only
    # meaningful when the prefix occupies the first len(prefixo) characters.
    if not isinstance(Zona_name, str) or not Zona_name.startswith(prefixo):
        return ""
    # Keep only the first segment after the prefix.
    return Zona_name[len(prefixo):].split('>', 1)[0]
    
# Build the new "Zona" column from each listing's location id.
df['Zona'] = df['address_locationid'].map(Zona_names)

# Preview the source column next to the derived one.
df[['address_locationid', 'Zona']].head()

Unnamed: 0,address_locationid,Zona
0,BR>Sao Paulo>NULL>Sao Paulo>Centro>Consolacao,Centro
1,BR>Sao Paulo>NULL>Sao Paulo>Zona Leste>Quinta ...,Zona Leste
2,BR>Sao Paulo>NULL>Sao Paulo>Zona Norte>Santa T...,Zona Norte
3,BR>Sao Paulo>NULL>Sao Paulo>Zona Sul>Brooklin,Zona Sul
4,BR>Sao Paulo>NULL>Sao Paulo>Zona Oeste>Alto de...,Zona Oeste


In [16]:
# Number of listings per extracted zone (an empty label means no zone was extracted).
df.groupby('Zona').Zona.count()

Zona
                13
Centro        1665
Zona Leste    2882
Zona Norte    2838
Zona Oeste    2317
Zona Sul      6321
Name: Zona, dtype: int64

In [17]:
# For the rows whose "Zona" is empty, count the values found in "address_zone".
df[df['Zona'] ==""].groupby('address_zone').address_zone.count()

address_zone
    13
Name: address_zone, dtype: int64

In [18]:
# Where the new "Zona" column came out empty, fill it with the content of "address_zone".
# A single masked assignment replaces the row-by-row iterrows() loop.
zona_vazia = df['Zona'] == ""
df.loc[zona_vazia, 'Zona'] = df.loc[zona_vazia, 'address_zone']

In [19]:
# Rows whose "Zona" is exactly "Centro" are relabelled "Zona Centro".
df.loc[df['Zona'] == 'Centro', 'Zona'] = "Zona Centro"

In [20]:
# Recount per zone after the "address_zone" fallback and the "Centro" rename.
df.groupby('Zona').Zona.count()

Zona
                 13
Zona Centro    1665
Zona Leste     2882
Zona Norte     2838
Zona Oeste     2317
Zona Sul       6321
Name: Zona, dtype: int64

In [21]:
# Number of listings per publication type.
df.groupby('publicationtype').publicationtype.count()

publicationtype
PREMIUM       269
STANDARD    15767
Name: publicationtype, dtype: int64

In [22]:
#
# Converter as colunas categóricas em Dummies
#

In [23]:
import re

#### Function that replaces spaces and removes special characters.
def arrumar_string(v_string):
    """Return a cleaned-up copy of ``v_string``.

    Spaces become underscores, the characters ``. ( ) -`` are removed, and
    every digit is replaced by the letter ``x``.

    Args:
        v_string: the text to clean (must be a ``str``).

    Returns:
        The cleaned string.
    """
    novo_string = v_string.replace(' ', '_')
    for caractere in '.()-':
        novo_string = novo_string.replace(caractere, '')
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    return re.sub(r"\d", "x", novo_string)

In [24]:
# Clean the business-type labels, name the empty ones, and append one dummy column per label.
rotulos_negocio = df['pricinginfos_businesstype'].apply(arrumar_string)
df['pricinginfos_businesstype'] = rotulos_negocio.where(rotulos_negocio != "", "SemBusinessDefinido")
df_aux = pd.get_dummies(df['pricinginfos_businesstype'])
df = pd.concat([df, df_aux], axis=1)

In [25]:
# Clean the publication-type labels, name the empty ones, and append one dummy column per label.
rotulos_publicacao = df['publicationtype'].apply(arrumar_string)
df['publicationtype'] = rotulos_publicacao.where(rotulos_publicacao != "", "SemPublicationDefinido")
df_aux = pd.get_dummies(df['publicationtype'])
df = pd.concat([df, df_aux], axis=1)

In [26]:
# Clean the zone labels, name the empty ones, and append one dummy column per label.
rotulos_zona = df['Zona'].apply(arrumar_string)
df['Zona'] = rotulos_zona.where(rotulos_zona != "", "SemZonaDefinida")
df_aux = pd.get_dummies(df['Zona'])
df = pd.concat([df, df_aux], axis=1)

In [27]:
#
# Eliminar as colunas que não serão utilizadas no modelo
#

In [28]:
# Drop the target column, i.e. the variable to be explained.
df.drop(columns=['pricinginfos_price'], inplace=True)

In [29]:
# Every listing is in the city of São Paulo, so city, country and state can go.
df.drop(columns=['address_city', 'address_country', 'address_state'], inplace=True)

In [30]:
# Drop the columns whose values are almost all empty.
df.drop(columns=['address_district', 'pricinginfos_period'], inplace=True)

In [31]:
# "address_zone" and "address_locationid" are no longer needed: the "Zona" column replaced them.
df.drop(columns=['address_zone', 'address_locationid'], inplace=True)

In [32]:
# Latitude and longitude serve as the location vector, so the remaining address fields are removed.
campos_endereco = [
    'address_neighborhood',
    'address_street',
    'address_streetnumber',
    'address_unitnumber',
    'address_zipcode',
]
df.drop(columns=campos_endereco, inplace=True)

In [33]:
# Drop the control and id columns, which carry no information for the model.
campos_controle = [
    'createdat',
    'id',
    'owner',
    'publisherid',
    'updatedat',
    'address_geolocation_precision',
]
df.drop(columns=campos_controle, inplace=True)

In [34]:
# Drop the description and link columns.
df.drop(columns=['description', 'images', 'title'], inplace=True)

In [35]:
# Drop the columns that hold a single value.
df.drop(columns=['listingstatus', 'unittypes'], inplace=True)

In [36]:
# Drop the categorical columns (their dummy columns were appended earlier).
df.drop(columns=['pricinginfos_businesstype', 'publicationtype', 'Zona'], inplace=True)

In [37]:
# DataFrame meant to hold the position measures for São Paulo.
# NOTE(review): this one-row placeholder is overwritten when df_medidas is loaded
# from 'df_medidas.pickle' further down, so it has no effect on the result.
df_medidas = pd.DataFrame({'Cidade': ['São Paulo']})

In [38]:
#
# Carregar o DataFrame de Medidas de Posição
#

In [39]:
# Load the stored position measures (the "<column>_mediana" / "<column>_p99" values
# read below). The context manager closes the file handle after loading.
with open('df_medidas.pickle', 'rb') as arquivo_medidas:
    df_medidas = pickle.load(arquivo_medidas)

In [40]:
#
# Corrigindo os Missing e Outliers
#

In [41]:
# The first 11 columns of df, taken by position, are the features to be cleaned;
# each one needs matching "<name>_mediana" and "<name>_p99" entries in df_medidas.
# NOTE(review): the positional slice depends on the current column order — confirm
# it still selects the intended columns if the drops above change.
features = df.columns.tolist()[0:11]

In [42]:
# Fix missing values: nulls in each feature receive the stored "<name>_mediana" value.
medianas = df_medidas.iloc[0]
for nome in features:
    faltantes = df[nome].isnull()
    df.loc[faltantes, nome] = medianas[nome + '_mediana']

In [43]:
# Fix outliers: values above the stored "<name>_p99" value are set to that value.
limites = df_medidas.iloc[0]
for nome in features:
    teto = limites[nome + '_p99']
    acima_do_teto = df[nome] > teto
    df.loc[acima_do_teto, nome] = teto

In [44]:
#
# Criar a coluna que mede a distância entre a geolocalização de referência (mediana da cidade) e o imóvel
#

In [45]:
# Funçao para calcular a distância entre dois pontos
from math import radians, degrees, sin, cos, asin, acos, sqrt, atan2

def great_circle(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, using the haversine formula.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The distance scaled by the sphere radius R = 6373.0 (presumably the
        Earth's radius in kilometres, which makes the result kilometres).
        NaN inputs propagate to a NaN result.
    """
    R = 6373.0
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt() raise ValueError.
    # Explicit comparisons are used so that a NaN value is left untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [46]:
# Reference point: the latitude/longitude medians stored in the first row of df_medidas.
medidas_referencia = df_medidas.iloc[0]
lat_mediana = medidas_referencia['address_geolocation_location_lat_mediana']
lon_mediana = medidas_referencia['address_geolocation_location_lon_mediana']

In [47]:
# Criar a nova coluna que mede a distância do imóvel para a mediana

def calculo_distancia (r):
    """Distance between the reference point and the listing in row ``r``.

    ``r`` must expose ``address_geolocation_location_lat`` and
    ``address_geolocation_location_lon``; the reference point is read from
    the module-level ``lat_mediana`` / ``lon_mediana``.
    """
    return great_circle(
        lat_mediana,
        lon_mediana,
        r.address_geolocation_location_lat,
        r.address_geolocation_location_lon,
    )

# New column: distance from every listing to the reference point, computed row by row.
df['Distancia'] = df.apply(calculo_distancia, axis=1)

In [48]:
# The raw coordinates are no longer needed once "Distancia" exists.
colunas_coordenadas = ['address_geolocation_location_lat', 'address_geolocation_location_lon']
df.drop(columns=colunas_coordenadas, inplace=True)

In [49]:
#
# Precificação
# 

In [50]:
# Scaling

In [51]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): a brand-new StandardScaler is fitted here on `df`, the dataset
# being priced. The pickled models loaded below were presumably trained on
# features scaled with the training set's statistics; if so, the scaler fitted
# at training time should be persisted and loaded here instead — TODO confirm.
scaler = StandardScaler()
# Fit the scaler on the dataset loaded above (not on training data).
scaler.fit(df)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [52]:
# Standardise the features with the scaler fitted above; X_prod is the matrix
# passed to every model's predict() below.
X_prod = scaler.transform(df)

In [53]:
# Linear Regression

In [54]:
from sklearn.linear_model import LinearRegression

In [55]:
# NOTE(review): `lr` is never used in the visible notebook — the predictions
# come from the pickled `modelo_lr` loaded in the next cell.
lr = LinearRegression()

In [56]:
# Load the pickled Linear Regression model and price the prepared feature matrix.
# The context manager closes the file handle that the bare open() call leaked.
with open('modelo_lr.pickle', 'rb') as arquivo_modelo:
    modelo_lr = pickle.load(arquivo_modelo)
predictions_lr = modelo_lr.predict(X_prod)

In [57]:
# Inspect the Linear Regression predictions.
# NOTE(review): the recorded output shows magnitudes around 1e14 (some negative),
# far from the other models' outputs — investigate before using these prices.
predictions_lr

array([-4.52352565e+14, -7.34289455e+13,  7.03593119e+14, ...,
       -7.34289456e+13, -7.11656253e+14, -4.52352563e+14])

In [58]:
# Decision Tree

In [59]:
from sklearn.tree import DecisionTreeRegressor

In [60]:
# NOTE(review): `dt` is never used in the visible notebook — the predictions
# come from the pickled `modelo_dr` loaded in the next cell.
dt = DecisionTreeRegressor(max_depth=20, min_samples_split=50)

In [61]:
# Load the pickled Decision Tree model and price the prepared feature matrix.
# The context manager closes the file handle that the bare open() call leaked.
with open('modelo_dr.pickle', 'rb') as arquivo_modelo:
    modelo_dr = pickle.load(arquivo_modelo)
predictions_dr = modelo_dr.predict(X_prod)

In [62]:
# Inspect the Decision Tree predictions.
predictions_dr

array([ 340088.25      ,  217315.        ,  530154.36363636, ...,
        265499.85714286,  522799.71428571, 1749299.5       ])

In [63]:
# Random Forest

In [64]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [65]:
# NOTE(review): `rf` is never used in the visible notebook — the predictions
# come from the pickled `modelo_rf` loaded in the next cell.
# NOTE(review): max_features='auto' was deprecated and later removed in newer
# scikit-learn releases — confirm against the installed version.
rf = RandomForestRegressor(n_estimators=100, max_depth=20, max_features='auto')

In [66]:
# Load the pickled Random Forest model and price the prepared feature matrix.
# The context manager closes the file handle that the bare open() call leaked.
with open('modelo_rf.pickle', 'rb') as arquivo_modelo:
    modelo_rf = pickle.load(arquivo_modelo)
predictions_rf = modelo_rf.predict(X_prod)

In [67]:
# Inspect the Random Forest predictions.
predictions_rf

array([ 348029.27544011,  265888.38490974,  499060.35114688, ...,
        300012.14199694,  478382.9836972 , 1713918.63229744])

In [68]:
# Boosting

In [69]:
from sklearn import ensemble
# Hyperparameters for a gradient-boosting regressor.
# NOTE(review): `clf` is never used in the visible notebook — the predictions
# come from the pickled `modelo_clf` loaded in the next cell.
# NOTE(review): loss='ls' was deprecated and later removed in newer scikit-learn
# releases — confirm against the installed version.
params = {'n_estimators': 200, 'max_depth': 50, 'min_samples_split': 20,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)

In [70]:
# Load the pickled Boosting model and price the prepared feature matrix.
# The context manager closes the file handle that the bare open() call leaked.
with open('modelo_clf.pickle', 'rb') as arquivo_modelo:
    modelo_clf = pickle.load(arquivo_modelo)
predictions_clf = modelo_clf.predict(X_prod)

In [71]:
# Inspect the Boosting predictions.
predictions_clf

array([ 364487.94723161,  276145.78469528,  487626.80366656, ...,
        295300.14417508,  500855.79022762, 1664310.69080683])

In [72]:
#
# Neural Network
#

In [73]:
from sklearn.neural_network import MLPRegressor

In [74]:
# NOTE(review): `mlp` is never used in the visible notebook — the predictions
# come from the pickled `modelo_mlp` loaded in the next cell.
mlp = MLPRegressor(hidden_layer_sizes=(17,17), random_state=42, max_iter=200, activation='relu')

In [75]:
# Load the pickled Neural Network model and price the prepared feature matrix.
# The context manager closes the file handle that the bare open() call leaked.
with open('modelo_mlp.pickle', 'rb') as arquivo_modelo:
    modelo_mlp = pickle.load(arquivo_modelo)
predictions_mlp = modelo_mlp.predict(X_prod)

In [76]:
# Inspect the Neural Network predictions.
predictions_mlp

array([ 335341.89877921,  207432.64309599,  519874.81862126, ...,
        314512.2159014 ,  473086.17131659, 1540690.79506351])

In [77]:
# Rotinas para gerar o arquivo de saída CSV com os Preços de Venda estimados para cada imóvel por cada modelo utilizado

In [78]:
# CSV with the sale prices estimated by the Linear Regression model.
# The ids are looked up through df.index so that each prediction stays paired
# with the listing it was computed from, even when rows were filtered out of df
# (df_original is never filtered, so pairing by a fresh 0..n-1 index could
# attach prices to the wrong ids).
df_preco_venda = pd.DataFrame(
    {'id': df_original.loc[df.index, 'id'].values, 'price': predictions_lr},
    columns=['id', 'price'],
)
df_preco_venda.to_csv('predictions_lr.csv', index=False, header=True)

In [79]:
# CSV with the sale prices estimated by the Decision Tree model.
# The ids are looked up through df.index so that each prediction stays paired
# with the listing it was computed from, even when rows were filtered out of df
# (df_original is never filtered, so pairing by a fresh 0..n-1 index could
# attach prices to the wrong ids).
df_preco_venda = pd.DataFrame(
    {'id': df_original.loc[df.index, 'id'].values, 'price': predictions_dr},
    columns=['id', 'price'],
)
df_preco_venda.to_csv('predictions_dr.csv', index=False, header=True)

In [80]:
# CSV with the sale prices estimated by the Random Forest model.
# The ids are looked up through df.index so that each prediction stays paired
# with the listing it was computed from, even when rows were filtered out of df
# (df_original is never filtered, so pairing by a fresh 0..n-1 index could
# attach prices to the wrong ids).
df_preco_venda = pd.DataFrame(
    {'id': df_original.loc[df.index, 'id'].values, 'price': predictions_rf},
    columns=['id', 'price'],
)
df_preco_venda.to_csv('predictions_rf.csv', index=False, header=True)

In [81]:
# CSV with the sale prices estimated by the Boosting model.
# The ids are looked up through df.index so that each prediction stays paired
# with the listing it was computed from, even when rows were filtered out of df
# (df_original is never filtered, so pairing by a fresh 0..n-1 index could
# attach prices to the wrong ids).
df_preco_venda = pd.DataFrame(
    {'id': df_original.loc[df.index, 'id'].values, 'price': predictions_clf},
    columns=['id', 'price'],
)
df_preco_venda.to_csv('predictions_clf.csv', index=False, header=True)

In [82]:
# CSV with the sale prices estimated by the Neural Network model.
# The ids are looked up through df.index so that each prediction stays paired
# with the listing it was computed from, even when rows were filtered out of df
# (df_original is never filtered, so pairing by a fresh 0..n-1 index could
# attach prices to the wrong ids).
df_preco_venda = pd.DataFrame(
    {'id': df_original.loc[df.index, 'id'].values, 'price': predictions_mlp},
    columns=['id', 'price'],
)
df_preco_venda.to_csv('predictions_mlp.csv', index=False, header=True)