In [268]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tools.tools import add_constant

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = '{:,.2f}'.format

from sklearn.model_selection import train_test_split


seed=42
np.random.seed = 42

In [269]:
def calculate_vif_(df, thresh=5):
    '''
    Calculates VIF each feature in a pandas dataframe
    A constant must be added to variance_inflation_factor or the results will be incorrect

    :param df: the pandas dataframe containing only the predictor features, not the response variable
    :param thresh: the max VIF value before the feature is removed from the dataframe
    :return: dataframe with features removed
    '''
    const=add_constant(df)
    cols=const.columns
    variables=np.arange(const.shape[1])
    vif_df=pd.Series([variance_inflation_factor(const.values, i) 
               for i in range(const.shape[1])], 
              index=const.columns).to_frame()

    vif_df=vif_df.sort_values(by=0, ascending=False).rename(columns={0: 'VIF'})
    vif_df=vif_df.drop('const')
    vif_df=vif_df[vif_df['VIF'] > thresh]

    print('Features above VIF threshold:\n')
    print(vif_df[vif_df['VIF'] > thresh])

    col_to_drop = list(vif_df.index)

    for i in col_to_drop:
        print('Dropping: {}'.format(i))
        df = df.drop(columns=i)

    return 'done'

In [270]:
#columns
dtype_list = ['usableAreas','totalAreas','parkingSpaces','suites','bedrooms'
,'pricingInfos_price','pricingInfos_yearlyIptu','pricingInfos_monthlyCondoFee']

dataset['interestingFlag'] = dataset['interestingFlag'].map({True:1, False:0}).astype('float')

In [271]:
#reading
dataset = pd.read_csv('data/processed/train_cleaned.csv.gzip',compression='gzip', sep =';')
dataset[[x for x in dtype_list]] = dataset[[x for x in dtype_list]].astype('float')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [272]:
#olhando as features 
_=dataset.address_neighborhood.nunique()/dataset[dataset.usableAreas!=0].shape[0]
print(f'razão: # de bairros/# de linhas = {_:.2f}')

razão: # de bairros/# de linhas = 0.03


In [273]:
_=dataset[dataset.usableAreas==0]['id'].count()/dataset.shape[0]
print(f'razão: # de área igual à 0/# de linhas = {_:.2f}')

razão: # de área igual à 0/# de linhas = 0.37


In [274]:
#olhando a colireariedade, nos dados contínuos, e torcendo
aux_list = ['usableAreas','parkingSpaces','suites','bedrooms'
,'pricingInfos_yearlyIptu','pricingInfos_monthlyCondoFee']
aux_df=dataset[(dataset['usableAreas']>0)&(dataset['pricingInfos_yearlyIptu']>0)
               &(dataset['pricingInfos_monthlyCondoFee']>0)]

calculate_vif_(aux_df[aux_list],5)

Features above VIF threshold:

Empty DataFrame
Columns: [VIF]
Index: []


'done'

In [275]:
dataset['usableAreas_flg']=dataset['usableAreas'].apply(lambda x: 1 if x>0 else 0)

#### Train/test

In [276]:
from sklearn import preprocessing
dataset.interestingFlag.fillna(0, inplace=True)
dataset=pd.get_dummies(dataset, columns=["address_zone"])

x_col=['usableAreas', 'totalAreas', 'parkingSpaces', 'suites', 'bedrooms'
,'bathrooms', 'pricingInfos_yearlyIptu', 'pricingInfos_monthlyCondoFee'
,'interestingFlag', 'usableAreas_flg', 'address_zone_Centro'
,'address_zone_Zona Leste','address_zone_Zona Oeste', 'address_zone_Zona Sul']

x=dataset[x_col]
y =dataset['pricingInfos_price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=seed)

x_train = preprocessing.scale(x_train)
x_test = preprocessing.scale(x_test)

In [266]:
x_test = pd.DataFrame(x_test, columns=x_col)
x_train = pd.DataFrame(x_train, columns=x_col)

#### Regression

In [288]:
import statsmodels.api as sm

x = x_train
y = y_train

x2 = sm.add_constant(pd.DataFrame(x_train, columns=x_col))
est = sm.OLS(pd.DataFrame(y_train.values, columns=['price']), x2)
est = est.fit()

In [289]:
est.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.717
Model:,OLS,Adj. R-squared:,0.717
Method:,Least Squares,F-statistic:,7783.0
Date:,"Mon, 12 Oct 2020",Prob (F-statistic):,0.0
Time:,23:28:46,Log-Likelihood:,-619590.0
No. Observations:,42942,AIC:,1239000.0
Df Residuals:,42927,BIC:,1239000.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.408e+05,2155.890,343.639,0.000,7.37e+05,7.45e+05
usableAreas,2.89e+05,6430.499,44.934,0.000,2.76e+05,3.02e+05
totalAreas,-7.739e+04,5626.499,-13.755,0.000,-8.84e+04,-6.64e+04
parkingSpaces,1.564e+05,3666.376,42.647,0.000,1.49e+05,1.64e+05
suites,7.754e+04,3782.025,20.503,0.000,7.01e+04,8.5e+04
bedrooms,-2.229e+04,3022.267,-7.374,0.000,-2.82e+04,-1.64e+04
bathrooms,4.779e+04,3503.097,13.642,0.000,4.09e+04,5.47e+04
pricingInfos_yearlyIptu,2.651e+05,2419.132,109.565,0.000,2.6e+05,2.7e+05
pricingInfos_monthlyCondoFee,2.758e+05,2968.358,92.921,0.000,2.7e+05,2.82e+05

0,1,2,3
Omnibus:,38356.499,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3872808.261
Skew:,3.888,Prob(JB):,0.0
Kurtosis:,48.87,Cond. No.,7.95


In [230]:
est.params

const    740,848.95
x1       288,950.38
x2       -77,390.09
x3       156,358.57
x4        77,541.93
x5       -22,287.05
x6        47,789.09
x7       265,052.60
x8       275,823.72
x9         5,840.71
x10     -121,513.68
x11       78,511.96
x12       12,714.68
x13       44,286.24
x14       86,403.71
dtype: float64