In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# Apply seaborn's default theme with the 'Set2' colour palette to all plots below.
sns.set(palette='Set2')

In [None]:
# Load the raw rental listings; `df` is kept as the unfiltered reference frame
# (later row-loss reports compare against df.shape[0]).
df = pd.read_csv('/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv')
df.head()

In [None]:
# Replace the original headers with short snake_case names.
# The assignment is positional, so the 13 names must follow the CSV column order.
short_names = [
    'city', 'area', 'rooms', 'bathroom', 'parking', 'floor', 'animal',
    'furniture', 'hoa', 'rent_amount', 'property_tax', 'fire_insurance',
    'total',
]
df.columns = short_names
df.info()

In [None]:
# Correct floor variable: the raw column stores '-' for some rows
# (presumably properties without a floor number, e.g. houses -- TODO confirm).
# Those rows are set to 0 so the whole column can be cast to int.
no_floor = df['floor'] == '-'
df.loc[no_floor, 'floor'] = 0
df['floor'] = df['floor'].astype(int)

df.describe()

# Let's have a closer look to each variable

## Rent amount

I chose to focus on the rent amount and not the total price. The total price is simply the rent amount plus the condominium fee, property tax and fire insurance, which are themselves largely explained by the rent and are thus endogenous. Using them could cause multicollinearity and violate the OLS assumptions.

In [None]:
# Summary statistics, histogram and boxplot of the raw rent amount.
print(df.rent_amount.describe())

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(20,5))

# NOTE: distplot is deprecated in recent seaborn releases; it is kept here so
# the notebook still runs on the seaborn version it was written for.
sns.distplot(df.rent_amount, ax=ax_hist)

# Pass the series as `x=` explicitly: from seaborn 0.12 on, the only valid
# positional argument is `data`, so a bare positional series is no longer `x`.
sns.boxplot(x=df.rent_amount, ax=ax_box)
plt.show()

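There are extreme values and, as expected, we reject normality since the test statistic is higher than the 1% critical value (the normality test itself is not shown in this notebook; the skewed histogram above points the same way).
Let's try a log transformation.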

In [None]:
# Work with the natural log of the rent to reduce the right skew.
df['log_rent'] = np.log(df.rent_amount)

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(20,5))

# Histogram with a fitted normal density overlaid for visual comparison.
sns.distplot(df.log_rent, fit=norm, ax=ax_hist)

# Pass the series as `x=` explicitly: from seaborn 0.12 on, the only valid
# positional argument is `data`, so a bare positional series is no longer `x`.
sns.boxplot(x=df.log_rent, ax=ax_box)
plt.show()

In [None]:
# Show the rows whose log rent exceeds 10 (rent above exp(10), about 22,026).
df.loc[df.log_rent>10]

In [None]:
# We should remove observations with log rent > 10
# .copy() makes dfclean an independent frame, so columns added to it later
# do not modify df.
dfclean = df.loc[df.log_rent<=10].copy()

## Continuous variables

We will consider as outliers the observations that lie more than 1.5 times the interquartile range above the 3rd quartile.

In [None]:
# For continuous variables we look at the interquartile range and boxplots.
# `mins` / `maxs` hold the lower / upper 1.5*IQR fences, one row per variable.
dfclean['log_area'] = np.log(dfclean.area)
cols = ['area','log_area','floor'] # continuous variables

quartiles = dfclean[cols].quantile([0.25, 0.75])
q1 = quartiles.loc[0.25]
q3 = quartiles.loc[0.75]
itq = q3 - q1

mins = (q1 - 1.5*itq).to_frame()
maxs = (q3 + 1.5*itq).to_frame()
maxs

In [None]:
def look_cont_variable(variable):
    """Describe and plot one continuous column of the global `dfclean`.

    Prints `describe()` for the column, then draws a 2x2 figure:
    boxplot and histogram (with a fitted normal density) of the full column
    on the left, and the same two plots restricted to values inside the
    fences stored in the global `mins` / `maxs` frames on the right.

    Parameters
    ----------
    variable : str
        Column name; it must be a row label of both `mins` and `maxs`.
    """
    values = dfclean[variable]
    print(values.describe())

    # Look the fences up by label. The previous float(<2-D array>) conversion
    # is deprecated in recent NumPy releases.
    lower = float(mins.loc[variable].iloc[0])
    upper = float(maxs.loc[variable].iloc[0])
    # Compute the inlier subset once and reuse it for both right-hand plots.
    inliers = values[(values <= upper) & (values >= lower)]

    fig, axes = plt.subplots(2, 2, figsize=(20,10))

    # `x=` is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
    sns.boxplot(x=values, ax=axes[0, 0])
    axes[0, 0].set_title(f'Distribution of {variable}',fontsize=20)

    sns.boxplot(x=inliers, ax=axes[0, 1])
    axes[0, 1].set_title(f'Distribution of {variable} without outliers',fontsize=20)

    sns.distplot(values, fit=norm, ax=axes[1, 0])
    axes[1, 0].set_title(f'Distribution of {variable}',fontsize=20)

    sns.distplot(inliers, fit=norm, ax=axes[1, 1])
    axes[1, 1].set_title(f'Distribution of {variable} without outliers',fontsize=20)

    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of the raw area, with and without values outside the IQR fences.
look_cont_variable('area')

In [None]:
# Same view for the log-transformed area.
look_cont_variable('log_area')

In [None]:
# Same view for the floor number.
look_cont_variable('floor')

In [None]:
# We remove outliers and see how many rows we have left
def remove_out(df, col, factor=1.5):
    """Drop the rows whose `col` value lies above the upper IQR fence.

    The fence is Q3 + factor * (Q3 - Q1), with the quartiles computed on the
    frame that is passed in. Only the upper tail is trimmed; low values are
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame (not modified).
    col : str
        Name of the numeric column to filter on.
    factor : float, default 1.5
        Multiple of the interquartile range added to Q3 to get the fence.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the rows with `df[col] <= fence`.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    upper_fence = q3 + (q3 - q1) * factor
    # Return a copy so callers can add or overwrite columns on the result
    # without a SettingWithCopyWarning.
    return df.loc[df[col] <= upper_fence].copy()

# Trim the upper tail of each continuous variable in turn.
# NOTE: the quartiles are recomputed on the already-filtered frame at every
# step, so the fences used here can differ from the ones stored in `maxs`,
# and re-running this cell trims the data further.
for col in maxs.index:
    dfclean = remove_out(dfclean,col)

# Report the loss relative to the ORIGINAL number of rows
# (it was previously divided by the number of rows that remain).
n_lost = df.shape[0] - dfclean.shape[0]
print(f'Dataframe has now {dfclean.shape[0]} rows\nWe lose {n_lost} rows ({n_lost/df.shape[0]:.0%})')

## Categorical variables

In [None]:
def look_cat_variable(variable):
    """Print the frequency table of a `dfclean` column and draw it as a bar chart."""
    counts = dfclean[variable].value_counts()
    print(counts)
    counts.plot.bar()
    plt.show()

In [None]:
# Number of listings per city.
look_cat_variable('city')

More than half of observations are in Sao Paulo.

In [None]:
# Number of listings per room count.
look_cat_variable('rooms')

In [None]:
# Number of listings per bathroom count.
look_cat_variable('bathroom')

In [None]:
# Number of listings per parking-spot count.
look_cat_variable('parking')

In [None]:
# Number of listings per value of the `animal` column.
look_cat_variable('animal')

In [None]:
# Number of listings per value of the `furniture` column.
look_cat_variable('furniture')

In [None]:
# We keep only the classes that have at least 100 occurrences.
# The filters are applied one after the other, so each count is taken on the
# frame left by the previous filter.
for class_col in ['rooms', 'bathroom', 'parking']:
    dfclean = dfclean.groupby(class_col).filter(lambda x: len(x)>=100)

# Report the loss relative to the ORIGINAL number of rows
# (it was previously divided by the number of rows that remain).
n_lost = df.shape[0] - dfclean.shape[0]
print(f'Dataframe has now {dfclean.shape[0]} rows\nWe lose {n_lost} rows in total ({n_lost/df.shape[0]:.0%})')

# Relationship of each variable with rent amount

## Continuous variables

In [None]:
# Pairwise plots of the target (log_rent) and the two continuous predictors.
sns.pairplot(dfclean[['log_rent','log_area','floor']])
plt.show()

In [None]:
# Pearson correlation matrix of the target and the continuous predictors.
cols = ['log_rent','log_area','floor']
# rowvar=False: each column of the frame is one variable.
corr_matrix = np.corrcoef(dfclean[cols], rowvar=False)

plt.figure(figsize=(7,7))
sns.heatmap(corr_matrix, annot=True, xticklabels=cols, yticklabels=cols)
plt.show()

There is a positive correlation between the rent and the area. On the contrary, there doesn't seem to be any relationship with the floor. 

## Discrete variables

We will visualise the relationship with a boxplot. Since ANOVA requires normally distributed data within all groups and homoskedasticity, we will rely on the non-parametric Kruskal-Wallis H-test, which tests the equality of medians across groups.
If the p-value is < 0.01, we can consider that at least one group has a different median from the others.

In [None]:
from scipy.stats import kruskal

def plot_relation_discrete(x,y):
    """Boxplot of `dfclean[y]` for each level of `dfclean[x]`, ordered by median.

    Boxes are sorted by increasing group median, and each one is annotated
    at its median with the number of rows in the group ("n=...").

    Parameters
    ----------
    x : str
        Name of the grouping (discrete) column of the global `dfclean`.
    y : str
        Name of the numeric column whose distribution is plotted.
    """
    plt.figure(figsize=(15,5))

    # One row per group: column 0 is the group median of y (sorted ascending),
    # column 1 is the group size. They are read by position below because the
    # name of the value_counts column depends on the pandas version.
    group_medians = dfclean.groupby(x)[y].median().sort_values().to_frame()
    group_sizes = dfclean[x].value_counts().to_frame()
    medians = group_medians.merge(group_sizes, left_index=True, right_index=True)

    # x / y are passed by keyword: seaborn >= 0.12 only accepts `data`
    # positionally, so the former positional call raises a TypeError there.
    sns.boxplot(x=dfclean[x], y=dfclean[y], order=medians.index)
    for i in range(medians.shape[0]):
        plt.text(x=i,y=medians.iloc[i,0],s='n='+str(medians.iloc[i,1]),ha='center')
    plt.xlabel(x,fontsize=20)
    plt.xticks(fontsize=15)
    plt.show()

In [None]:
# Kruskal-Wallis H-test: does log rent differ across cities?
# One 1-D sample per city is built from the data itself, so every city present
# in dfclean is tested and kruskal receives 1-D arrays rather than one-column
# DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('city', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('city','log_rent')

Rent amounts are quite dispersed within each group. Even though Sao Paulo has the highest median rent, it doesn't differ much from Belo Horizonte and Rio de Janeiro. Campinas and Porto Alegre seem to be cheaper than the other cities, but we can see some outliers among them.

In [None]:
# Kruskal-Wallis H-test: does log rent differ across room counts?
# One 1-D sample per room count present in dfclean (no hard-coded values), so
# kruskal receives 1-D arrays rather than one-column DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('rooms', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('rooms','log_rent')

As we could expect, the median rent increases with the number of rooms; the places with one or two rooms are the cheapest ones. There is no major difference in rent amount between 4 and 5 rooms.

In [None]:
# Kruskal-Wallis H-test: does log area differ across room counts?
# One 1-D sample per room count present in dfclean (no hard-coded values), so
# kruskal receives 1-D arrays rather than one-column DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('rooms', observed=True)['log_area']]
print(kruskal(*samples))

plot_relation_discrete('rooms','log_area')

Rooms and area are also correlated: the bigger the area, the more rooms there should be. We note a lot of outliers.

In [None]:
# Kruskal-Wallis H-test: does log rent differ across bathroom counts?
# One 1-D sample per bathroom count present in dfclean (no hard-coded values),
# so kruskal receives 1-D arrays rather than one-column DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('bathroom', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('bathroom','log_rent')

In [None]:
# Grouped bar chart: for each bathroom count, how many listings have each room count.
rooms_by_bathroom = pd.crosstab(dfclean.bathroom, dfclean.rooms)
ax = rooms_by_bathroom.plot.bar(figsize=(15,5))
plt.xticks(rotation=0)
ax.set_ylabel('Count')
plt.show()

The numbers of rooms and bathrooms are strongly related: the places with fewer bathrooms are the ones with fewer rooms.
Let's look at the relationship between area, rent and number of rooms and bathrooms.

In [None]:
# log_area vs log_rent, coloured by bathroom count (top) and room count (bottom).
fig, (ax_bath, ax_rooms) = plt.subplots(2, 1, figsize=(15,20))

# x / y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the former positional call raises a TypeError there.
# The hue is cast to category so each count gets its own discrete colour.
sns.scatterplot(x=dfclean.log_area, y=dfclean.log_rent,
                hue=dfclean.bathroom.astype('category'), alpha=.7, ax=ax_bath)
sns.scatterplot(x=dfclean.log_area, y=dfclean.log_rent,
                hue=dfclean.rooms.astype('category'), alpha=.7, ax=ax_rooms)
plt.show()

We can see that the three variables are positively correlated: the higher the area and the rent, the higher the number of rooms and bathrooms.

In [None]:
# Kruskal-Wallis H-test: does log rent differ across parking-spot counts?
# One 1-D sample per parking count present in dfclean. The previous hard-coded
# list 1..5 left out the parking == 0 group (the reference level in the
# regression interpretation) and could include levels that were filtered out.
samples = [grp.values for _, grp in dfclean.groupby('parking', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('parking','log_rent')

In [None]:
# Kruskal-Wallis H-test: does log rent differ between the levels of `animal`?
# One 1-D sample per level present in dfclean, so kruskal receives 1-D arrays
# rather than one-column DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('animal', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('animal','log_rent')

In [None]:
# Kruskal-Wallis H-test: does log rent differ between the levels of `furniture`?
# One 1-D sample per level present in dfclean, so kruskal receives 1-D arrays
# rather than one-column DataFrames.
samples = [grp.values for _, grp in dfclean.groupby('furniture', observed=True)['log_rent']]
print(kruskal(*samples))

plot_relation_discrete('furniture','log_rent')

# OLS Regression

In [None]:
import statsmodels.api as sm

# Treat the discrete predictors as categorical so that get_dummies later
# expands each of them into indicator columns.
categorical_cols = ['rooms', 'bathroom', 'parking', 'city', 'animal', 'furniture']
for name in categorical_cols:
    dfclean[name] = dfclean[name].astype('category')

# Regression frame: the target plus every candidate predictor.
reg_cols = ['log_rent','log_area','floor','bathroom','rooms','parking','animal','furniture','city']
dfreg = dfclean[reg_cols].copy()

# Interaction terms: room / bathroom count (as integers) times log area.
dfreg['int_area_room'] = dfreg['rooms'].astype(int) * dfreg['log_area']
dfreg['int_area_bathroom'] = dfreg['bathroom'].astype(int) * dfreg['log_area']

dfreg.info()

In [None]:
# Design matrix: every column of dfreg except the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = dfreg.drop(columns='log_rent')
# drop_first=True drops one level per categorical variable (the reference level).
# dtype=float keeps the design matrix numeric: pandas >= 2 would otherwise
# produce boolean dummies, which statsmodels rejects once mixed with floats.
X_dummies = pd.get_dummies(X, drop_first=True, dtype=float)
Y = dfreg.log_rent

ols = sm.OLS(Y, sm.add_constant(X_dummies)).fit()
# Summary with heteroskedasticity-robust standard errors.
print(ols.get_robustcov_results().summary())

As expected, we have multicollinearity among exogenous variables. This is probably due to the presence of both rooms and bathrooms. Multicollinearity gives false estimations of the parameters, the effect of one variable being captured by the other.
To solve that, we will remove the number of rooms and keep only the number of bathrooms.

In [None]:
# Reduced model: the room count and the two interaction terms are left out.
dfreg = dfclean[['log_rent','log_area','floor','bathroom','parking','animal','furniture','city']]

# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = dfreg.drop(columns='log_rent')
# drop_first=True drops one level per categorical variable (the reference level).
# dtype=float keeps the design matrix numeric: pandas >= 2 would otherwise
# produce boolean dummies, which statsmodels rejects once mixed with floats.
X_dummies = pd.get_dummies(X, drop_first=True, dtype=float)
Y = dfreg.log_rent

ols = sm.OLS(Y, sm.add_constant(X_dummies)).fit()
# Summary with heteroskedasticity-robust standard errors.
print(ols.get_robustcov_results().summary())

In [None]:
# Diagnostics for the fitted model `ols`.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

# Residuals against fitted values, with a dashed zero line.
ax1.scatter(ols.fittedvalues,ols.resid)
ax1.axhline(y=0,ls='--',color='black')
ax1.set_xlabel('Fitted values')
ax1.set_ylabel('Residuals')

# True against fitted values, with the 45-degree line y = x.
ax2.scatter(ols.fittedvalues,Y)
# Draw the line over the full range of both series. The former
# np.arange(min(Y), max(fitted)) advanced in steps of 1 and stopped before
# the upper end, so the line covered only part of the cloud.
line_lo = min(Y.min(), ols.fittedvalues.min())
line_hi = max(Y.max(), ols.fittedvalues.max())
ax2.plot([line_lo, line_hi], [line_lo, line_hi], color='black')
ax2.set_xlabel('Fitted values')
ax2.set_ylabel('True values')
plt.show()

fig, (ax3, ax4) = plt.subplots(1, 2, figsize=(20,10))

# Histogram of the residuals with a fitted normal density.
sns.distplot(ols.resid,ax=ax3,fit=norm)
ax3.set_xlabel('Residuals')

# Q-Q plot of the residuals; line='s' draws the standardized reference line.
sm.qqplot(ols.resid,ax=ax4,line='s')

plt.show()

The fitted values are too big for low rents and too small for high rents.
There is also heteroskedasticity among the errors: their variance is not constant.

## Interpretation

We interpret the effect of one exogenous variable on the dependent variable by keeping all other variables constant.
All variables except the Porto Alegre dummy are significant at 1%.
- Area: increasing the area by 1% leads on average to an increase of 0.45% in the rent amount.
- Floor: the higher the apartment, the more expensive it is. Each additional floor adds about 1% to the rent amount.
This interpretation can be biased since we didn't control for the presence of houses.
- Bathroom: We interpret with respect to having one bathroom. As we could have expected, the more bathrooms there are, the higher the difference with having one bathroom. Places with two bathrooms are on average 17% more expensive than places with a single bathroom. The difference is 46% with five bathrooms. We note that the difference is decreasing with the number of bathrooms.
- Parking: Same reasoning as for the bathroom variable. The more parking spots there are, the higher the rent difference with not having any parking spot.
- Animal: Refusing animals increases rent amount by 4% in average.
- Furniture: An unfurnished place decreases the rent amount by 38%.
- City: we compare with the city of Belo Horizonte. Living in Campinas decreases on average the rent by 16% with respect to Belo Horizonte. On the contrary, living in Rio de Janeiro or Sao Paulo increases the rent by 27% and 33% respectively.