In [46]:
import pandas as pd
# Load the processed test set: a gzip-compressed, semicolon-separated CSV.
# The compression is stated explicitly rather than left to inference from the
# '.gzip' file extension.
dataset = pd.read_csv(
    'data/processed/test.csv.gzip',
    sep=';',
    compression='gzip',
)

### Cleaning

##### garantindo os dtypes

In [47]:
# Drop rows whose usableAreas contains alphabetic text, so the column can be
# cast to a float dtype below.
usable_as_text = dataset['usableAreas'].astype(str)
has_letters = usable_as_text.str.contains('[A-Za-z]', na=False)
# `astype(str)` can render a missing value as the text 'nan', which itself
# matches the letter pattern. Restricting the filter to non-missing values keeps
# rows with a missing usableAreas, so the fill logic applied to that column
# later in the notebook can handle them instead of losing the row entirely.
invalid_usable_area = dataset['usableAreas'].notna() & has_letters
# `.copy()` gives the filtered frame its own data before columns are rewritten.
dataset = dataset[~invalid_usable_area].copy()

# Cast every numeric feature in one pass; the target dtypes are unchanged
# (float32 for the first two columns, the default float for the others).
dataset = dataset.astype({
    'usableAreas': 'float32',
    'parkingSpaces': 'float32',
    'suites': 'float',
    'bathrooms': 'float',
    'totalAreas': 'float',
    'bedrooms': 'float',
    'pricingInfos_yearlyIptu': 'float',
    'pricingInfos_monthlyCondoFee': 'float',
})

##### Funções para auxiliar na limpeza

In [48]:
def usableArea(x):
    """
    Keep totalAreas from drifting too far away from usableAreas.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the keys 'totalAreas' and 'usableAreas'.

    Returns
    -------
    The row's usableAreas when three times totalAreas is greater than
    usableAreas; otherwise the row's totalAreas.
    """
    total, usable = x['totalAreas'], x['usableAreas']
    return usable if total * 3 > usable else total
    
def usableArea2(x):
    """
    Scale down implausibly large usable areas on cheap listings.

    Author's rationale: looking at prices on real-estate sites, few properties
    with more than 1500 m2 cost less than 2 million.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing 'usableAreas', 'totalAreas' and 'pricingInfos_price'.

    Returns
    -------
    usableAreas divided by 10 when usableAreas is above 1500 and the price is
    below 2,000,000; otherwise the row's totalAreas.

    NOTE(review): the fallback returns totalAreas rather than usableAreas, while
    the visible caller stores the result in the usableAreas column. Confirm this
    is intended before changing it.
    """
    oversized_but_cheap = x['usableAreas'] > 1500 and x['pricingInfos_price'] < 2000000
    if not oversized_but_cheap:
        return x['totalAreas']
    return x['usableAreas'] / 10


def randomParkingSpaces():
    """
    Draw a replacement number of parking spaces (0 to 6) at random.

    Author's rationale: most apartments have between 1 and 3 parking spaces, so
    an empirical distribution is used as sampling probabilities to bring
    outliers back to a more realistic value. Running
    >>dataset['parkingSpaces'].value_counts()<< gives a good approximation of
    that distribution.

    Returns
    -------
    A single value drawn from [0, 1, 2, 3, 4, 5, 6] using the weights below.
    The draw uses NumPy's global random state, so results are only repeatable
    if the caller seeds it.
    """
    # Imported locally: none of the visible cells import numpy, so on a fresh
    # kernel the first call to this function raised NameError on `np`.
    import numpy as np

    values = [0, 1, 2, 3, 4, 5, 6]
    # One probability per entry of `values`, in the same order.
    p = [0.051824852372388, 0.473013194963949, 0.286341859391663,
         0.104668375061677, 0.060483550066054, 0.017317395387334,
         0.006350772756936]
    return np.random.choice(values, 1, p=p)[0]

def valueCondoFee1(x):
    """
    Divide the monthly condo fee by 10 on expensive listings with a very high fee.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing 'pricingInfos_monthlyCondoFee' and 'pricingInfos_price'.

    Returns
    -------
    The fee divided by 10 when the fee is above 5000 and the price is above
    2,000,000; otherwise the fee unchanged.
    """
    fee = x['pricingInfos_monthlyCondoFee']
    if fee > 5000 and x['pricingInfos_price'] > 2000000:
        return fee / 10
    return fee
    

def valueCondoFee2(x):
    """
    Divide the monthly condo fee by 100 on small listings with a very high fee.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing 'pricingInfos_monthlyCondoFee' and 'usableAreas'.

    Returns
    -------
    The fee divided by 100 when the fee is at least 10000 and usableAreas is
    below 200; otherwise the fee unchanged.
    """
    fee = x['pricingInfos_monthlyCondoFee']
    if fee >= 10000 and x['usableAreas'] < 200:
        return fee / 100
    return fee
    
def zeroUsableAreas(x):
    """
    Replace a zero usableAreas with an area taken from the listing text columns.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing 'usableAreas', 'usableAreasFromTitle' and
        'usableAreasFromDescription'.

    Returns
    -------
    usableAreas unchanged when it is not equal to 0. When it is 0, the first
    fallback that is present and positive is returned, trying
    'usableAreasFromTitle' before 'usableAreasFromDescription'. If neither
    qualifies, usableAreas itself (0) is returned.
    """
    area = x['usableAreas']
    if area != 0:
        return area
    # Title first, description second.
    for fallback in ('usableAreasFromTitle', 'usableAreasFromDescription'):
        candidate = x[fallback]
        if pd.notna(candidate) and candidate > 0:
            return candidate
    return area
    
def iptu(x):
    """
    Replace an out-of-range yearly IPTU with 1% of the listing price.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing 'pricingInfos_yearlyIptu' and 'pricingInfos_price'.

    Returns
    -------
    The price multiplied by 0.01 when the yearly IPTU is at most 100 or at
    least 50000; otherwise the yearly IPTU unchanged.
    """
    yearly = x['pricingInfos_yearlyIptu']
    # Written as two explicit comparisons: a NaN IPTU fails both and is
    # therefore returned unchanged.
    out_of_range = yearly <= 100 or yearly >= 50000
    return x['pricingInfos_price'] * 0.01 if out_of_range else yearly

In [49]:
# Every fill below uses `col = col.fillna(value)` instead of calling
# `fillna(..., inplace=True)` on a column selection. The in-place form is
# chained assignment: recent pandas versions warn about it, and with
# Copy-on-Write enabled it leaves the DataFrame unchanged.

# --- Areas: reconcile totalAreas and usableAreas ----------------------------
dataset['totalAreas'] = dataset['totalAreas'].fillna(0)
dataset['totalAreas'] = dataset[['totalAreas', 'usableAreas']].apply(usableArea, axis=1)
dataset['usableAreas'] = dataset[['usableAreas', 'totalAreas', 'pricingInfos_price']].apply(usableArea2, axis=1)

# --- Parking: counts above 29 are replaced by a random draw -----------------
dataset['parkingSpaces'] = dataset['parkingSpaces'].apply(lambda x: randomParkingSpaces() if x > 29 else x)

# --- Condo fee: successive rescaling of implausibly large values ------------
# The order of these steps matters: each one works on the previous result.
dataset['pricingInfos_monthlyCondoFee'] = dataset['pricingInfos_monthlyCondoFee'].apply(lambda x: x/1000 if x > 1000000 else x)
dataset['pricingInfos_monthlyCondoFee'] = dataset[['pricingInfos_monthlyCondoFee', 'pricingInfos_price']].apply(valueCondoFee1, axis=1)
dataset['pricingInfos_monthlyCondoFee'] = dataset[['pricingInfos_monthlyCondoFee', 'usableAreas']].apply(valueCondoFee2, axis=1)
dataset['pricingInfos_monthlyCondoFee'] = dataset['pricingInfos_monthlyCondoFee'].apply(lambda x: x/100 if x > 100000.00 else x)
dataset['pricingInfos_monthlyCondoFee'] = dataset['pricingInfos_monthlyCondoFee'].apply(lambda x: x/10 if x > 10000.00 else x)

# --- Areas mentioned in the title / description -----------------------------
# Raw string so `\d` and `\s` reach the regex engine without relying on
# Python's handling of invalid escape sequences; the pattern text is unchanged.
# The pattern has two capture groups, so `extract` yields two columns; the
# second (the unit text) lands in the scratch column 'a'.
area_pattern = r'(\d{1,4}\s?(m²|metros quadrados|metros quadrado),?)'
dataset[['usableAreasFromTitle', 'a']] = dataset['title'].str.extract(area_pattern, expand=False)
dataset['usableAreasFromTitle'] = dataset['usableAreasFromTitle'].str.extract(r'(\d+)', expand=False)
dataset[['usableAreasFromDescription', 'a']] = dataset['description'].str.extract(area_pattern, expand=False)
dataset['usableAreasFromDescription'] = dataset['usableAreasFromDescription'].str.extract(r'(\d+)', expand=False)
dataset['usableAreasFromTitle'] = dataset['usableAreasFromTitle'].fillna(0)
dataset['usableAreasFromDescription'] = dataset['usableAreasFromDescription'].fillna(0)
# Extracted areas of 1000 or more are discarded (set to 0).
dataset['usableAreasFromTitle'] = dataset['usableAreasFromTitle'].astype('float').apply(lambda x: 0 if x >= 1000 else x)
dataset['usableAreasFromDescription'] = dataset['usableAreasFromDescription'].astype('float').apply(lambda x: 0 if x >= 1000 else x)
# Fill missing usableAreas: description first, then title, then totalAreas.
dataset['usableAreas'] = dataset['usableAreas'].fillna(dataset['usableAreasFromDescription'])
dataset['usableAreas'] = dataset['usableAreas'].fillna(dataset['usableAreasFromTitle'])
dataset['usableAreas'] = dataset['usableAreas'].fillna(dataset['totalAreas'])
dataset['usableAreas'] = dataset[['usableAreas', 'usableAreasFromTitle', 'usableAreasFromDescription']].apply(zeroUsableAreas, axis=1)
dataset['usableAreas'] = dataset.usableAreas.astype('float')

# --- Parking spaces mentioned in the title ----------------------------------
# Second capture group goes to the scratch column 'b'.
dataset[['vagaFromTitle', 'b']] = dataset['title'].str.extract(r'(\s?\d{1,2}\s?(vaga|vagas|estacionamento))', expand=False)

# --- Points-of-interest flag from the listing text --------------------------
interesting_string = 'trem|estação|metrô|museu|avenida|av|parques|parque|marginal|shopping|perto do metro'
dataset['descriptionAndTitle'] = dataset['description'] + ' ' + dataset['title']
dataset['interestingFlag'] = dataset['descriptionAndTitle'].str.contains(interesting_string, regex=True)

# --- Zone: fifth '>'-separated component of the location id -----------------
dataset['address_zone'] = dataset['address_locationId'].str.split('>').str[4]

# --- Yearly IPTU -------------------------------------------------------------
dataset['pricingInfos_yearlyIptu'] = dataset['pricingInfos_yearlyIptu'].apply(lambda x: x/10 if x > 100000 else x)
# Fill with the median (the author's comment; 2380 is presumably the median
# computed elsewhere, TODO confirm).
dataset['pricingInfos_yearlyIptu'] = dataset.pricingInfos_yearlyIptu.fillna(2380)
dataset['pricingInfos_yearlyIptu'] = dataset[['pricingInfos_yearlyIptu', 'pricingInfos_price']].apply(iptu, axis=1)
# Filled again because `iptu` can return a missing value when the price is missing.
dataset['pricingInfos_yearlyIptu'] = dataset.pricingInfos_yearlyIptu.fillna(2380)

# --- Binary flags -------------------------------------------------------------
dataset['usableAreas_flg'] = dataset['usableAreas'].apply(lambda x: 1 if x > 0 else 0)
# NOTE(review): where the combined text is missing, `str.contains` may yield
# NaN, and NaN is truthy, so such rows get flag 1 here and the fill below finds
# nothing to fill. Kept as is; confirm against the training pipeline before
# changing it.
dataset['interestingFlag'] = dataset['interestingFlag'].apply(lambda x: 1 if x else 0)
dataset['interestingFlag'] = dataset['interestingFlag'].fillna(0)

# --- Remaining missing values -------------------------------------------------
# Fill with the median to keep distortion low (the author's comment; the
# constants are presumably medians computed elsewhere, TODO confirm).
dataset['parkingSpaces'] = dataset['parkingSpaces'].fillna(1)
dataset['suites'] = dataset['suites'].fillna(1)
dataset['bedrooms'] = dataset['bedrooms'].fillna(3)
dataset['bathrooms'] = dataset['bathrooms'].fillna(2)
dataset['pricingInfos_monthlyCondoFee'] = dataset['pricingInfos_monthlyCondoFee'].fillna(482)

# The original cell recomputed the two binary flags a second time at this
# point. Nothing they depend on changes after the first computation, so the
# repeat was removed.



In [50]:
from sklearn import preprocessing
# One-hot encode the zone extracted from the location id.
dataset = pd.get_dummies(dataset, columns=["address_zone"])

# Feature columns passed to the model, in this order.
x_col = [
    'usableAreas', 'parkingSpaces', 'suites', 'bedrooms',
    'bathrooms', 'pricingInfos_yearlyIptu', 'pricingInfos_monthlyCondoFee', 'interestingFlag',
    'address_zone_Centro', 'address_zone_Zona Leste', 'address_zone_Zona Oeste', 'address_zone_Zona Sul',
]

# `get_dummies` only creates a column for zones that occur in this file. If a
# zone listed in x_col is absent from the data, selecting dataset[x_col] would
# raise KeyError, so any missing zone indicator is added as all zeros.
for zone_column in x_col:
    if zone_column.startswith('address_zone_') and zone_column not in dataset.columns:
        dataset[zone_column] = 0

In [51]:
import pickle
# Load the fitted model from disk.
# NOTE(review): unpickling executes arbitrary code from the file, so this must
# only ever be pointed at a trusted model artifact.
with open('model/RandomForestRegressor.sav', 'rb') as model_file:
    rfg = pickle.load(model_file)

In [52]:
# Score every cleaned row with the loaded model and write id/price pairs to
# 'predictions.csv' without the DataFrame index.
predicted_prices = rfg.predict(dataset[x_col])
results_df = pd.DataFrame({
    'id': dataset['id'].values,
    'price': predicted_prices,
})
results_df.to_csv('predictions.csv', index=False)