Hello everyone :)

The first part of this notebook contains data exploration and data engineering. The second and last part contains the machine learning models, model evaluation and total price predictions.

I hope you enjoy it, and feel free to leave a comment; any feedback is welcome!


Models used:

* Decision Tree Regression;
* Random Forest Regression;
* Linear Regression;
* SVM Regression;
* K Nearest Neighbour Regression;
* Lasso Regression;
* Ridge Regression;

In [None]:
#import packages and dataset
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats
from scipy.stats import norm, skew
import sklearn.metrics as metrics
import os

# Load the rental listings CSV into the working DataFrame
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv'
)

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
#total price correlation matrix
k = 10  # number of variables shown in the heatmap
plt.figure(figsize=(16,8))
# Correlate numeric columns only: recent pandas releases raise when
# DataFrame.corr meets text columns (presumably 'city', 'animal', ... here).
corrmat = df.select_dtypes(include=np.number).corr()
# pick the k features most correlated with the total price
cols = corrmat.nlargest(k, 'total (R$)')['total (R$)'].index
# reuse the matrix computed above instead of recomputing the coefficients
cm = corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()


In [None]:
#finding outliers: 'hoa (R$)' against 'total (R$)'
fig, ax = plt.subplots()
ax.scatter(x=df['hoa (R$)'], y=df['total (R$)'])
ax.set_ylabel('price', fontsize=13)
# label the axis with the column that is actually plotted
ax.set_xlabel('hoa (R$)', fontsize=13)
plt.show()

In [None]:
# Remove the rows whose hoa and total values are both above the thresholds
extreme_rows = (df['hoa (R$)'] > 400000) & (df['total (R$)'] > 800000)
df = df.drop(index=df.index[extreme_rows])


In [None]:
#checking for outliers again: 'hoa (R$)' against 'total (R$)'
fig, ax = plt.subplots()
ax.scatter(x=df['hoa (R$)'], y=df['total (R$)'])
ax.set_ylabel('price', fontsize=13)
# label the axis with the column that is actually plotted
ax.set_xlabel('hoa (R$)', fontsize=13)
plt.show()

In [None]:
# Remove the rows whose hoa and total values are both above the thresholds
extreme_rows = (df['hoa (R$)'] > 100000) & (df['total (R$)'] > 200000)
df = df.drop(index=df.index[extreme_rows])

In [None]:
#finding outliers: 'hoa (R$)' against 'total (R$)'
fig, ax = plt.subplots()
ax.scatter(x=df['hoa (R$)'], y=df['total (R$)'])
ax.set_ylabel('price', fontsize=13)
# label the axis with the column that is actually plotted
ax.set_xlabel('hoa (R$)', fontsize=13)
plt.show()

In [None]:
# Remove the rows whose hoa and total values are both above the thresholds
extreme_rows = (df['hoa (R$)'] > 60000) & (df['total (R$)'] > 90000)
df = df.drop(index=df.index[extreme_rows])

In [None]:
#finding outliers: 'hoa (R$)' against 'total (R$)'
fig, ax = plt.subplots()
ax.scatter(x=df['hoa (R$)'], y=df['total (R$)'])
ax.set_ylabel('price', fontsize=13)
# label the axis with the column that is actually plotted
ax.set_xlabel('hoa (R$)', fontsize=13)
plt.show()

In [None]:
# Remove rows with an extreme total first, then rows with an extreme hoa.
# The second mask is evaluated on the already-filtered frame.
high_total = df['total (R$)'] > 300000
df = df.drop(index=df.index[high_total])
high_hoa = df['hoa (R$)'] > 30000
df = df.drop(index=df.index[high_hoa])

In [None]:
#finding outliers: 'hoa (R$)' against 'total (R$)'
fig, ax = plt.subplots()
ax.scatter(x=df['hoa (R$)'], y=df['total (R$)'])
ax.set_ylabel('price', fontsize=13)
# label the axis with the column that is actually plotted
ax.set_xlabel('hoa (R$)', fontsize=13)
plt.show()

In [None]:
#target variable - total (R$)
# Missing values are dropped up front so the density estimates below are defined.
target = df['total (R$)'].dropna()

# Fit a normal distribution to the target and report its parameters
(mu, sigma) = norm.fit(target)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Density histogram with a kernel density estimate and the fitted normal curve.
# Drawn with matplotlib/scipy directly because seaborn's distplot is deprecated.
grid = np.linspace(target.min(), target.max(), 200)
plt.hist(target, bins=50, density=True, alpha=0.4)
plt.plot(grid, stats.gaussian_kde(target)(grid), label='KDE')
# Raw string: '\m' and '\s' are invalid escape sequences in a normal string.
plt.plot(grid, norm.pdf(grid, mu, sigma), color='k',
         label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
plt.legend(loc='best')
plt.ylabel('Density')
plt.title('total (R$) distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(target, plot=plt)
plt.show()


In [None]:
# Pairwise relationship grid over the columns of df
sns.pairplot(data=df)

In [None]:
# hoa (R$) x total (R$)
sns.lmplot(x='hoa (R$)',y='total (R$)',data=df) # scatter of total (R$) against hoa (R$) with a fitted regression line


In [None]:
# Spread of the total price for each bathroom count
fig, ax = plt.subplots(figsize=(13, 8))
sns.boxplot(data=df, x='bathroom', y='total (R$)', ax=ax)
plt.show()

In [None]:
# Spread of the total price for each room count
fig, ax = plt.subplots(figsize=(13, 8))
sns.boxplot(data=df, x='rooms', y='total (R$)', ax=ax)
plt.show()

In [None]:
#histogram of the number of rooms
plt.figure(figsize=(10,8))
# Plot the 'rooms' column so the data matches the comment and the title
plt.hist(df['rooms'])
plt.title("number of rooms")
# x carries the room count, y the number of listings in each bin
plt.xlabel("number of rooms")
plt.ylabel("quantity")
plt.grid()
plt.show()


In [None]:
# area x price
area_values = df['area']
price_values = df['total (R$)']
plt.scatter(area_values, price_values)
plt.xlabel("area")
plt.ylabel("price")
plt.title("area x price")
plt.show()

In [None]:
# Number of rooms per city
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df, x="city", y='rooms', palette=["m", "g"], ax=ax)
ax.set_title('City and number of rooms')

In [None]:
# Spread of the total price within each city
fig, ax = plt.subplots(figsize=(13, 8))
sns.boxplot(data=df, x='city', y='total (R$)', ax=ax)
plt.show()

In [None]:
# Count of listings per 'animal' value, split by city.
# Columns are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(x='animal', hue='city', data=df)
ax.set_title('animals allowed per city')

In [None]:
# Rent amount by furniture status, split by city
ax = sns.violinplot(x='furniture', y='rent amount (R$)', hue='city', data=df)
# set_title must be called; assigning to it only overwrites the method.
# The title names the column that is actually on the y axis.
ax.set_title("furniture per city and rent amount")

In [None]:
#parking spaces
plt.figure(figsize=(6,6))
plt.subplot(2,1,1)
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally
ax = sns.regplot(x='parking spaces', y='rent amount (R$)', data=df)
plt.subplot(2,1,2)
# Plain histogram of the counts (replaces the deprecated distplot(kde=False))
plt.hist(df['parking spaces'].dropna())
plt.xlabel('parking spaces')

In [None]:
# Distribution of fire insurance (R$) within each city
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, x='city', y='fire insurance (R$)', hue='city', ax=ax)


In [None]:
# fire insurance (R$) against rent amount (R$), with a fitted regression line
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally
ax = sns.regplot(x='fire insurance (R$)', y='rent amount (R$)', data=df)


In [None]:
# Replace every object-dtype column with integer labels
from sklearn.preprocessing import LabelEncoder

# Boolean mask over the columns: True where the dtype is object
categorical_feature_mask = df.dtypes == object
# Names of the masked columns as a plain list
categorical_cols = list(df.columns[categorical_feature_mask])

# One encoder instance, refit on each column in turn (values are cast to str first)
labelencoder = LabelEncoder()
for column_name in categorical_cols:
    df[column_name] = labelencoder.fit_transform(df[column_name].astype(str))


In [None]:
# Features: every column except the target. Target: 'total (R$)' kept as a one-column DataFrame.
# NOTE(review): X still holds the other R$ columns; if 'total (R$)' is their sum
# this leaks the target into the features -- confirm against the dataset description.
target_column = "total (R$)"
X = df.drop(columns=[target_column])
y = df[[target_column]]


In [None]:
#split the dataset: 70% train / 30% test
from sklearn.model_selection import train_test_split as tts
# Fixed seed so the split, and any model ranking computed from it, is reproducible
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

In [None]:
#building the machine learning models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# (label, estimator) pairs, evaluated in this order. The tree-based models get a
# fixed seed so their scores are repeatable; all other settings are the defaults.
candidates = [
    ('DTR', DecisionTreeRegressor(random_state=42)),   # Decision Tree Regression
    ('RFN', RandomForestRegressor(random_state=42)),   # Random Forest Regression
    ('LIR', LinearRegression()),                       # Linear regression
    ('SVM', SVR()),                                    # SVM Regression
    ('KNNR', KNeighborsRegressor()),                   # K Nearest Neighbour Regression
    ('LaR', Lasso()),                                  # Lasso Regression
    ('RiR', Ridge()),                                  # Ridge Regression
]

# y_train is a one-column frame; flatten it once so the estimators receive the
# 1-D target they expect (avoids a conversion warning on every fit).
y_train_1d = np.ravel(y_train)

# acc collects [label, R^2 on the test split] for every model
acc = []
for label, model in candidates:
    model.fit(X_train, y_train_1d)
    score = r2_score(y_test, model.predict(X_test))  # predict once, reuse the score
    print(score)
    acc.append([label, score])

# `regr` stays bound to Ridge and `model` to the fitted Ridge instance, the
# state the original cell left behind, in case later cells rely on either name.
regr = Ridge

#Different Algorithms and their performance, best R^2 first
acc.sort(key=lambda entry: entry[1], reverse=True)


In [None]:
# Print the collected [model label, R^2 score] pairs held in `acc`
print(acc)

In [None]:
# Final model: Ridge regression (RiR), chosen because it topped the score list,
# refit on the training split.
from sklearn.linear_model import Ridge as regr
model = regr()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the test features with the fitted model
y_pred = model.predict(X_test)

In [None]:
# Compare the density of the actual test targets with the density of the
# model's predictions to judge how well the predictions track the real values.
# kdeplot replaces the deprecated distplot(hist=False, kde=True); both inputs
# are flattened to 1-D because y_test is a one-column frame.
ax1 = sns.kdeplot(np.ravel(y_test), color="r", label="Actual Value")
sns.kdeplot(np.ravel(model.predict(X_test)), color="b", label="Predicted Value", ax=ax1)
# The labels only show up once a legend is drawn
ax1.legend()
plt.show()
