In [196]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tools.tools import add_constant

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = '{:,.2f}'.format

from sklearn.model_selection import train_test_split

seed=42
np.random.seed = 42

In [197]:
def calculate_vif_(df, thresh=5):
    '''
    Calculates VIF each feature in a pandas dataframe
    A constant must be added to variance_inflation_factor or the results will be incorrect

    :param df: the pandas dataframe containing only the predictor features, not the response variable
    :param thresh: the max VIF value before the feature is removed from the dataframe
    :return: dataframe with features removed
    '''
    const=add_constant(df)
    cols=const.columns
    variables=np.arange(const.shape[1])
    vif_df=pd.Series([variance_inflation_factor(const.values, i) 
               for i in range(const.shape[1])], 
              index=const.columns).to_frame()

    vif_df=vif_df.sort_values(by=0, ascending=False).rename(columns={0: 'VIF'})
    vif_df=vif_df.drop('const')
    vif_df=vif_df[vif_df['VIF'] > thresh]

    print('Features above VIF threshold:\n')
    print(vif_df[vif_df['VIF'] > thresh])

    col_to_drop = list(vif_df.index)

    for i in col_to_drop:
        print('Dropping: {}'.format(i))
        df = df.drop(columns=i)

    return 'done'

In [198]:
#columns
dtype_list = ['usableAreas','totalAreas','parkingSpaces','suites','bedrooms'
,'pricingInfos_price','pricingInfos_yearlyIptu','pricingInfos_monthlyCondoFee']

dataset['interestingFlag'] = dataset['interestingFlag'].map({True:1, False:0}).astype('float')

In [199]:
#reading
dataset = pd.read_csv('data/processed/train_cleaned.csv.gzip',compression='gzip', sep =';')
dataset[[x for x in dtype_list]] = dataset[[x for x in dtype_list]].astype('float')

  interactivity=interactivity, compiler=compiler, result=result)


In [200]:
#olhando as features 
_=dataset.address_neighborhood.nunique()/dataset[dataset.usableAreas!=0].shape[0]
print(f'razão: # de bairros/# de linhas = {_:.2f}')

razão: # de bairros/# de linhas = 0.03


In [201]:
_=dataset[dataset.usableAreas==0]['id'].count()/dataset.shape[0]
print(f'razão: # de área igual à 0/# de linhas = {_:.2f}')

razão: # de área igual à 0/# de linhas = 0.37


### olhando a colireariedade, nos dados contínuos
e torcendo

In [202]:
aux_list = ['usableAreas','parkingSpaces','suites','bedrooms'
,'pricingInfos_yearlyIptu','pricingInfos_monthlyCondoFee']
aux_df=dataset[(dataset['usableAreas']>0)&(dataset['pricingInfos_yearlyIptu']>0)
               &(dataset['pricingInfos_monthlyCondoFee']>0)]

calculate_vif_(aux_df[aux_list],5)

Features above VIF threshold:

Empty DataFrame
Columns: [VIF]
Index: []


'done'

In [203]:
dataset['usableAreas_flg']=dataset['usableAreas'].apply(lambda x: 1 if x>0 else 0)
dataset.interestingFlag.fillna(0, inplace=True)

#### Train/test

In [204]:
from sklearn import preprocessing

dataset=dataset[(dataset['usableAreas']>5)&(dataset['pricingInfos_price']<1000000)]
dataset=pd.get_dummies(dataset, columns=["address_zone"])

x_col=['usableAreas', 'parkingSpaces', 'suites', 'bedrooms'
,'bathrooms', 'pricingInfos_yearlyIptu', 'pricingInfos_monthlyCondoFee'
,'usableAreas_flg', 'address_zone_Centro'
,'address_zone_Zona Leste','address_zone_Zona Oeste', 'address_zone_Zona Sul']

x=dataset[x_col]
y =dataset['pricingInfos_price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

x_train_sc = preprocessing.scale(x_train)
x_test_sc = preprocessing.scale(x_test)

#### Regression

In [205]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error,mean_absolute_error

x = sm.add_constant(pd.DataFrame(x_train_sc, columns=x_col))
est = sm.OLS(pd.DataFrame(y_train.values, columns=['price']), x)
est = est.fit()
est.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.658
Model:,OLS,Adj. R-squared:,0.658
Method:,Least Squares,F-statistic:,4448.0
Date:,"Tue, 13 Oct 2020",Prob (F-statistic):,0.0
Time:,13:29:07,Log-Likelihood:,-336540.0
No. Observations:,25484,AIC:,673100.0
Df Residuals:,25472,BIC:,673200.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.32e+05,824.284,524.054,0.000,4.3e+05,4.34e+05
usableAreas,8.12e+04,1372.582,59.160,0.000,7.85e+04,8.39e+04
parkingSpaces,2.043e+04,1142.092,17.886,0.000,1.82e+04,2.27e+04
suites,2.773e+04,1184.032,23.422,0.000,2.54e+04,3.01e+04
bedrooms,-6755.6300,1127.813,-5.990,0.000,-8966.208,-4545.052
bathrooms,2.979e+04,1205.210,24.718,0.000,2.74e+04,3.22e+04
pricingInfos_yearlyIptu,3.484e+04,867.891,40.146,0.000,3.31e+04,3.65e+04
pricingInfos_monthlyCondoFee,5.365e+04,1005.918,53.337,0.000,5.17e+04,5.56e+04
usableAreas_flg,-2.216e-11,6.27e-13,-35.335,0.000,-2.34e-11,-2.09e-11

0,1,2,3
Omnibus:,2510.609,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18379.848
Skew:,0.158,Prob(JB):,0.0
Kurtosis:,7.148,Cond. No.,2.57e+16


In [217]:
_ = sm.add_constant(pd.DataFrame(x_test_sc, columns=x_col))
y_pred = est.predict(exog=_)

mse =mean_squared_error(y_test, y_pred,squared=False)
mae=mean_absolute_error(y_test, y_pred)
print(f'mse:{mse}\nmae:{mae}')

mse:132123.1869151812
mae:96695.73010973803
