# Problem Statement - Predicting the House Price

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the housing dataset from Maison.csv (path is relative to the working directory).
df=pd.read_csv("Maison.csv")

In [3]:
# The column names are in French; translate them to English so they are easier to read.
# Only the columns are renamed: also passing `index=str` would turn the integer
# row labels into strings, which nothing in this notebook needs.
df = df.rename(columns={
    'PRIX': 'price',
    'SUPERFICIE': 'area',
    'CHAMBRES': 'rooms',
    'SDB': 'bathroom',
    'ETAGES': 'floors',
    'ALLEE': 'driveway',
    'SALLEJEU': 'game_room',
    'CAVE': 'cellar',
    'GAZ': 'gas',
    'AIR': 'air',
    'GARAGES': 'garage',
    'SITUATION': 'situation',
})

In [4]:
# Preview the first rows to check the translated column names.
df.head()

Unnamed: 0,price,area,rooms,bathroom,floors,driveway,game_room,cellar,gas,air,garage,situation
0,42000,5850,3,1,2,1,0,1,0,0,1,0
1,38500,4000,2,1,1,1,0,0,0,0,0,0
2,49500,3060,3,1,1,1,0,0,0,0,0,0
3,60500,6650,3,1,2,1,1,0,0,0,0,0
4,61000,6360,2,1,1,1,0,0,0,0,0,0


In [5]:
# (rows, columns) of the loaded dataset.
df.shape

(546, 12)

# Automatic EDA using Pandas Profiling

In [7]:
!pip install pandas-profiling



In [8]:
import pandas_profiling as pp

In [9]:
# Build the automated profiling report for the whole DataFrame
# and write it to output.html (relative to the working directory).
profile = pp.ProfileReport(df)
profile.to_file("output.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




# Checking For Null Values

In [6]:
# Number of missing values in each column.
df.isnull().sum()  

price        0
area         0
rooms        0
bathroom     0
floors       0
driveway     0
game_room    0
cellar       0
gas          0
air          0
garage       0
situation    0
dtype: int64

# Checking Data Types

In [7]:
# Data type of every column.
df.dtypes

price        int64
area         int64
rooms        int64
bathroom     int64
floors       int64
driveway     int64
game_room    int64
cellar       int64
gas          int64
air          int64
garage       int64
situation    int64
dtype: object

# Outlier Analysis

In [12]:
# Summary statistics with additional tail percentiles (1%, 5%, ..., 99%)
# so that extreme values are visible before any outlier treatment.
df.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

Unnamed: 0,price,area,rooms,bathroom,floors,driveway,game_room,cellar,gas,air,garage,situation
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,68121.59707,5150.265568,2.965201,1.285714,1.807692,0.858974,0.177656,0.349817,0.045788,0.31685,0.692308,0.234432
std,26702.670926,2168.158725,0.737388,0.502158,0.868203,0.348367,0.382573,0.477349,0.209216,0.465675,0.861307,0.424032
min,25000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,26725.0,1972.5,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,35000.0,2565.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10%,40500.0,3000.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49125.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,82000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [13]:
def clipping(x, lower_q=0.01, upper_q=0.99):
    """Clip the values of ``x`` to the range spanned by two of its own quantiles.

    Values below the ``lower_q`` quantile are raised to it and values above the
    ``upper_q`` quantile are lowered to it; everything in between is unchanged.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to clip (any object exposing ``quantile`` and ``clip``
        in the same way works).
    lower_q : float, default 0.01
        Quantile, in [0, 1], used as the lower bound.
    upper_q : float, default 0.99
        Quantile, in [0, 1], used as the upper bound.

    Returns
    -------
    pandas.Series
        A clipped copy of ``x``; the input is not modified.

    Raises
    ------
    ValueError
        If the quantiles are outside [0, 1] or ``lower_q`` exceeds ``upper_q``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q!r}, upper_q={upper_q!r}"
        )
    # Both bounds are taken from the unclipped data before anything is changed.
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower=lower_bound, upper=upper_bound)

In [14]:
# Outlier treatment: clip every column at its own 1st / 99th percentile.
# NOTE(review): this also clips the target `price`, and the quantiles are computed on the
# full dataset before the train/test split - confirm that this is intended.
df = df.apply(clipping)

In [15]:
# Same summary as before clipping, to check the effect on min / max and the tails.
df.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

Unnamed: 0,price,area,rooms,bathroom,floors,driveway,game_room,cellar,gas,air,garage,situation
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,67898.346154,5135.637363,2.965201,1.283883,1.807692,0.858974,0.177656,0.349817,0.045788,0.31685,0.692308,0.234432
std,25811.063006,2099.31104,0.717205,0.493997,0.868203,0.348367,0.382573,0.477349,0.209216,0.465675,0.861307,0.424032
min,26725.0,1972.5,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,26848.75,1984.875,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,35000.0,2565.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10%,40500.0,3000.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49125.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,82000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


# Removing Zero Variance Columns

In [16]:
from sklearn.feature_selection import VarianceThreshold

# Identify columns whose variance is exactly zero (constant columns).
var = VarianceThreshold(threshold=0)
var.fit(df)
cols = var.get_support(indices=True)  # positional indices of the columns that are kept
print(cols)

# Actually apply the selection: fitting alone only reports what would be kept.
# Indexing by position keeps `df` a DataFrame with its column names.
df = df.iloc[:, cols]

[ 0  1  2  3  4  5  6  7  8  9 10 11]


# Removing Correlated Variables

In [17]:
from feature_engine.selection import SmartCorrelatedSelection
# Configure (but do not yet fit) a selector for correlated features:
# Pearson correlation, a 0.8 threshold, and the "variance" selection method.
# NOTE(review): presumably one feature is kept per correlated group, chosen by
# variance - confirm against the feature_engine documentation.
tr = SmartCorrelatedSelection(
    method="pearson",
    threshold=0.8,
    selection_method="variance",
)

In [18]:
# Shape before the correlated-feature filter is applied.
df.shape

(546, 12)

In [19]:
# Run the correlation filter on the predictors only. If the target `price` took part,
# a predictor strongly correlated with it could be dropped in its favour (or `price`
# itself could be dropped), which is the opposite of what feature selection should do.
selected_features = tr.fit_transform(df.drop(columns=["price"]))

# Rebuild `df` from the surviving predictors plus the target, keeping the original column order.
kept_columns = set(selected_features.columns) | {"price"}
df = df[[col for col in df.columns if col in kept_columns]]

In [20]:
# Shape after the correlated-feature filter; compare with the shape shown before it.
df.shape

(546, 12)

# Separating X & Y

In [21]:
# Target vector: the house price. Predictor matrix: every remaining column.
Y = df["price"]
X = df.drop(columns=["price"])

# Splitting Data and Modelling

In [22]:
# Import the libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [24]:
# Linear regression estimator with its default settings.
model = LinearRegression()

In [25]:
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train,y_train)

LinearRegression()

# Predictions

In [31]:
# Predicted prices for the held-out test rows.
price_predicted=model.predict(X_test)

In [33]:
# score() on a scikit-learn regressor returns R^2 (coefficient of determination),
# evaluated here on the test split.
r_square = model.score(X_test,y_test)
r_square

0.6376838540397625

In [34]:
print(model.coef_)   # one coefficient (slope) per feature, in the column order of X_train

[3.88253823e+00 2.65143868e+03 1.34912698e+04 6.07753001e+03
 5.11310288e+03 4.92996097e+03 3.41259940e+03 1.45514387e+04
 1.25543902e+04 4.44378582e+03 9.27063071e+03]


In [35]:
print(model.intercept_)  # intercept: the constant term c of the fitted linear model

-4010.8107282767305


In [48]:
# Error metrics on the held-out test set:
#   MAE  - Mean Absolute Error
#   MSE  - Mean Squared Error
#   RMSE - Root Mean Squared Error (square root of the MSE)

from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, price_predicted)
mse = metrics.mean_squared_error(y_test, price_predicted)  # computed once, reused for the RMSE
rmse = np.sqrt(mse)

print('MAE :', mae)
print('MSE :', mse)
print('RMSE :', rmse)

MAE : 10181.281518279398
MSE : 187631252.73713514
RMSE : 13697.855771511655
