## Multivariate Linear Regression

In [88]:
# Import the libraries 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk


In [95]:
# Load the real-estate data set. header=0 marks the first row of the file as
# its header, and `names` replaces those labels with the ones used below.
df = pd.read_csv(
    'Real estate.csv',
    header=0,
    names=[
        'no',
        'trans_date',
        'house_age',
        'distance from mrt',
        '# of convenience_stores',
        'latitude',
        'longitude',
        'house_price',
    ],
)
df.head()

Unnamed: 0,no,trans_date,house_age,distance from mrt,# of convenience_stores,latitude,longitude,house_price
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [96]:
# Extract the fields used for modelling: the five predictors followed by the
# target (house_price). The 'no' and 'trans_date' columns are left out.
dataset = df.loc[
    :,
    [
        'house_age',
        'distance from mrt',
        '# of convenience_stores',
        'latitude',
        'longitude',
        'house_price',
    ],
]
dataset.head()

Unnamed: 0,house_age,distance from mrt,# of convenience_stores,latitude,longitude,house_price
0,32.0,84.87882,10,24.98298,121.54024,37.9
1,19.5,306.5947,9,24.98034,121.53951,42.2
2,13.3,561.9845,5,24.98746,121.54391,47.3
3,13.3,561.9845,5,24.98746,121.54391,54.8
4,5.0,390.5684,5,24.97937,121.54245,43.1


In [None]:
# EDA 



In [97]:
# Collinearity check: pairwise correlations between all columns of `dataset`
# (features and target), rounded to two decimals for readability.
# A seaborn heatmap of this matrix (annot=True) is an optional visual check.
corr_matrix = dataset.corr().round(decimals=2)
corr_matrix

Unnamed: 0,house_age,distance from mrt,# of convenience_stores,latitude,longitude,house_price
house_age,1.0,0.03,0.05,0.05,-0.05,-0.21
distance from mrt,0.03,1.0,-0.6,-0.59,-0.81,-0.67
# of convenience_stores,0.05,-0.6,1.0,0.44,0.45,0.57
latitude,0.05,-0.59,0.44,1.0,0.41,0.55
longitude,-0.05,-0.81,0.45,0.41,1.0,0.52
house_price,-0.21,-0.67,0.57,0.55,0.52,1.0


In [None]:
# pairplot to check spread of each feature 


In [98]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the last column
# (house_price) is the target.
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)
x_train.shape

(289, 5)

In [99]:
from sklearn.linear_model import LinearRegression

# Full model: fit a linear regression on the training split using all
# features. fit() returns the estimator itself, so it can be chained.
lnr = LinearRegression().fit(x_train, y_train)

# Report the fitted intercept and one coefficient per feature.
print(" Intercept ", lnr.intercept_)
print(" Co-eficient ", lnr.coef_)

# Predictions on the held-out rows; y_predict is reused by the evaluation cell.
y_predict = lnr.predict(x_test)

# R^2 of the full model on the test split (displayed as the cell output).
lnr.score(x_test, y_test)

 Intercept  -641.4752573542916
 Co-eficient  [-2.81980885e-01 -4.70761718e-03  1.08604498e+00  2.12564677e+02
 -3.80314555e+01]


0.659147396830877

In [100]:
# Evaluate the full model on the test split: R^2, RMSE and adjusted R^2.
import sklearn.metrics as sk_metrics
from sklearn.metrics import r2_score

# R^2. The result is stored as `r2` so that the imported r2_score function is
# not overwritten by a float; this keeps the cell re-runnable and leaves
# r2_score callable in later cells.
r2 = r2_score(y_test, y_predict)
print(" the r square value for the full model = ", r2)

# RMSE: square root of the mean squared error, in the units of house_price.
rmse_score = np.sqrt(sk_metrics.mean_squared_error(y_test, y_predict))
print("The RMSE value for the full model =  ", rmse_score)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of test rows and p the number of features.
adj_r2 = 1 - (1 - r2) * ((len(x_test) - 1) / (len(y_test) - len(x_test.columns) - 1))
# The label previously said "adj RMSE", but the value is the adjusted R^2.
print("The adj R square value for the full model =  ", adj_r2)


 the r square value for the full model =  0.659147396830877
The RMSE value for the full model =   6.911892612856426
The adj RMSE value for the full model =   0.6448258588825945


From the evaluation of the model, we find that taking all the features into consideration explains about 66% of the variability of the target variable (house price),
and the RMSE value is also not close to 0.

**Hence we have to tune the ML model:**

1. Find multicollinearity between the features and eliminate redundant features.
2. Use Ridge linear regression.
3. Use Lasso linear regression.
4. Use stepwise linear regression.

In [103]:
import statsmodels.api as sm

# Feature engineering: find the relevant features for the model.
# The regression is refitted with OLS from statsmodels because its summary
# reports a p-value for every coefficient.

# Add an explicit intercept column (shown as 'const' in the summary).
x = sm.add_constant(x)

# Fit y on all columns of x and display the regression summary.
osl_model = sm.OLS(endog=y, exog=x).fit()
osl_model.summary()

0,1,2,3
Dep. Variable:,house_price,R-squared:,0.571
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,108.7
Date:,"Tue, 25 May 2021",Prob (F-statistic):,9.34e-73
Time:,13:09:10,Log-Likelihood:,-1492.4
No. Observations:,414,AIC:,2997.0
Df Residuals:,408,BIC:,3021.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4945.5951,6211.157,-0.796,0.426,-1.72e+04,7264.269
house_age,-0.2689,0.039,-6.896,0.000,-0.346,-0.192
distance from mrt,-0.0043,0.001,-5.888,0.000,-0.006,-0.003
# of convenience_stores,1.1630,0.190,6.114,0.000,0.789,1.537
latitude,237.7672,44.948,5.290,0.000,149.409,326.126
longitude,-7.8055,49.149,-0.159,0.874,-104.422,88.811

0,1,2,3
Omnibus:,240.068,Durbin-Watson:,2.149
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3748.747
Skew:,2.129,Prob(JB):,0.0
Kurtosis:,17.114,Cond. No.,23500000.0


In [104]:
# Drop the feature whose p-value in the OLS summary is above 0.05; here that
# is longitude (p = 0.874).
# x is rebound to a new frame instead of using `del`, and errors='ignore'
# makes a second run of this cell a no-op rather than a KeyError.
x = x.drop(columns=['longitude'], errors='ignore')

In [105]:
# OLS again, this time on the current columns of x, to see how the
# coefficients and p-values change.
osl_model = sm.OLS(endog=y, exog=x).fit()
osl_model.summary()

0,1,2,3
Dep. Variable:,house_price,R-squared:,0.571
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,136.2
Date:,"Tue, 25 May 2021",Prob (F-statistic):,7.6e-74
Time:,13:09:24,Log-Likelihood:,-1492.5
No. Observations:,414,AIC:,2995.0
Df Residuals:,409,BIC:,3015.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5916.0065,1112.732,-5.317,0.000,-8103.394,-3728.619
house_age,-0.2687,0.039,-6.903,0.000,-0.345,-0.192
distance from mrt,-0.0042,0.000,-8.473,0.000,-0.005,-0.003
# of convenience_stores,1.1648,0.190,6.141,0.000,0.792,1.538
latitude,238.6357,44.561,5.355,0.000,151.039,326.233

0,1,2,3
Omnibus:,240.761,Durbin-Watson:,2.15
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3777.481
Skew:,2.136,Prob(JB):,0.0
Kurtosis:,17.168,Cond. No.,4210000.0


In [113]:
# Reduced model with sklearn: linear regression on two features only
# ('distance from mrt' and '# of convenience_stores').
from sklearn.linear_model import LinearRegression

# Rebuild the plain feature frame (x was given a 'const' column for OLS above)
# and select the two predictors.
x = dataset.iloc[:, :-1]
x1 = x[['distance from mrt', '# of convenience_stores']]

# This split gets its own target names (y1_train / y1_test). Reusing
# y_train / y_test here would leave them with a different number of rows than
# x_train / x_test and y_predict from the full model, so the full-model cells
# would fail if re-run.
# NOTE(review): test_size is 0.2 here but 0.3 for the full model, so the two
# scores are not measured on the same held-out rows -- confirm this is intended.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y, test_size=0.2, random_state=100
)

# A separate estimator keeps the fitted full model in `lnr` intact.
lnr1 = LinearRegression().fit(x1_train, y1_train)

# Report the fitted intercept and one coefficient per feature.
print(" Intercept ", lnr1.intercept_)
print(" Co-eficient ", lnr1.coef_)

# Predictions of the reduced model on its held-out rows.
y1_predict = lnr1.predict(x1_test)

# R^2 of the reduced model on its test split (displayed as the cell output).
lnr1.score(x1_test, y1_test)

 Intercept  39.79028153385126
 Co-eficient  [-0.00564355  1.04523382]


0.5988216107013469

In [None]:
## Ridge Regression

