In [48]:
#Import libraries
import pandas as pd
import numpy as np
from numpy import math

from statsmodels.formula.api import ols
from scipy import stats
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

In [49]:
#Assign the settings of row and column for the csv file
pd.set_option('display.max_columns', 27)  # or 1000
pd.set_option('display.max_rows', 1200)  # or 1000

In [50]:
#Load the data
dataLondon = pd.read_csv('London_HP_Final.csv')

In [51]:
#View the data
dataLondon.head()

Unnamed: 0,area,year,median_salary,life_satisfaction,mean_salary,population_size,number_of_jobs,area_size,no_of_houses,average_price,houses_sold,no_of_crimes
0,barking and dagenham,1999,21480.0,,23620.0,162444.0,,,,65320.833333,223.833333,
1,barking and dagenham,2000,22618.0,,24696.0,163893.0,57000.0,,,77549.5,234.416667,
2,barking and dagenham,2001,22323.0,,26050.0,165654.0,54000.0,3780.0,68298.0,88664.0,266.916667,1706.333333
3,barking and dagenham,2002,24813.0,,26653.0,166357.0,52000.0,3780.0,68526.0,112221.916667,287.416667,1670.0
4,barking and dagenham,2003,25358.0,,27792.0,166210.0,55000.0,3780.0,68837.0,142499.0,301.25,1757.083333


In [55]:
#null values will be replaced with zero
dataLondon.replace(np.nan,'0',inplace = True)

In [56]:
dataLondon['houses_sold']=dataLondon['houses_sold'].astype('int64')
dataLondon['no_of_crimes']=dataLondon['no_of_crimes'].astype('int64')
dataLondon['mean_salary']=dataLondon['mean_salary'].astype('float64')
dataLondon['number_of_jobs']=dataLondon['number_of_jobs'].astype('int64')
dataLondon['median_salary']=dataLondon['median_salary'].astype('float64')

In [54]:
dataLondon

Unnamed: 0,area,year,median_salary,life_satisfaction,mean_salary,population_size,number_of_jobs,area_size,no_of_houses,average_price,houses_sold,no_of_crimes
0,barking and dagenham,1999,21480.0,0.0,23620.0,162444.0,0,0.0,0.0,65320.83,223,0
1,barking and dagenham,2000,22618.0,0.0,24696.0,163893.0,57000,0.0,0.0,77549.5,234,0
2,barking and dagenham,2001,22323.0,0.0,26050.0,165654.0,54000,3780.0,68298.0,88664.0,266,1706
3,barking and dagenham,2002,24813.0,0.0,26653.0,166357.0,52000,3780.0,68526.0,112221.9,287,1670
4,barking and dagenham,2003,25358.0,0.0,27792.0,166210.0,55000,3780.0,68837.0,142499.0,301,1757
5,barking and dagenham,2004,26089.0,0.0,29088.0,165610.0,53000,3780.0,68899.0,158176.0,301,1772
6,barking and dagenham,2005,26680.0,0.0,28728.0,166275.0,53000,3780.0,69261.0,163360.9,240,1726
7,barking and dagenham,2006,26549.0,0.0,28660.0,167157.0,51000,3780.0,69529.0,167853.2,304,1802
8,barking and dagenham,2007,30200.0,0.0,31994.0,169031.0,52000,3780.0,69835.0,184909.7,332,1637
9,barking and dagenham,2008,29396.0,0.0,31795.0,172452.0,53000,3780.0,70551.0,187356.8,140,1627


In [57]:
dataLondon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693 entries, 0 to 692
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   area               693 non-null    object 
 1   year               693 non-null    int64  
 2   median_salary      693 non-null    float64
 3   life_satisfaction  693 non-null    object 
 4   mean_salary        693 non-null    float64
 5   population_size    693 non-null    object 
 6   number_of_jobs     693 non-null    int64  
 7   area_size          693 non-null    object 
 8   no_of_houses       693 non-null    object 
 9   average_price      693 non-null    float64
 10  houses_sold        693 non-null    int64  
 11  no_of_crimes       693 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 65.1+ KB


In [58]:
#separate the other attributes from the predicting attribute
independent_variable = (['no_of_crimes','mean_salary', 'population_size','number_of_jobs','area_size',
                         'no_of_houses','houses_sold','no_of_crimes'])
dependent_variable = (['average_price'])

#Define X and Y
X=dataLondon[independent_variable].values #x axis - independent varuable
Y=dataLondon[dependent_variable].values # y axis - dependent varuable

In [59]:
##Split the dataset in training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=0)

In [60]:
# importing module  for multiple linear regression
from sklearn.linear_model import LinearRegression

# create an object of LinearRegression class
LR = LinearRegression()

# fitting the training data using the fit method
LR.fit(x_train,y_train)

LinearRegression()

In [61]:
#Assign prediction values in th y axis
y_prediction =  LR.predict(x_test)
y_prediction

array([[374908.19527295],
       [723160.37357032],
       [376071.95093389],
       [438195.55791125],
       [201145.12231958],
       [319366.20638527],
       [399870.50083376],
       [617192.91839176],
       [448207.52379743],
       [303888.37401484],
       [357615.20883626],
       [280167.07191633],
       [226518.64443077],
       [340333.04035425],
       [289123.86570227],
       [525127.13630121],
       [241001.44980414],
       [485130.03252118],
       [353235.33118789],
       [517953.50149251],
       [429486.59608795],
       [337649.51553157],
       [306145.69631688],
       [264425.43518151],
       [324426.87394244],
       [228288.80731074],
       [478039.80381684],
       [334872.04768359],
       [198841.95425925],
       [122275.15106122],
       [360712.49030564],
       [268581.27838483],
       [271105.99736946],
       [380500.61036017],
       [434644.17486244],
       [327396.59760841],
       [360078.12369006],
       [370516.82744559],
       [ 481

In [62]:
x_test

array([[2755, 43350.0, 314242.0, ..., 133598.0, 411, 2755],
       [0, 80655.0, 7654.0, ..., 6313.0, 18, 0],
       [956, 29407.0, 193630.0, ..., 80283.0, 213, 956],
       ...,
       [2198, 35481.0, 163827.0, ..., 83819.0, 366, 2198],
       [1082, 31404.0, 247130.0, ..., 88414.0, 258, 1082],
       [0, 18786.0, 225712.0, ..., '0', 361, 0]], dtype=object)

In [63]:
y_test

array([[ 362965.66666667],
       [ 849790.16666667],
       [ 238842.16666667],
       [ 312539.58333333],
       [ 136624.83333333],
       [ 299103.        ],
       [ 372669.        ],
       [ 696926.08333333],
       [ 466485.58333333],
       [ 274824.91666667],
       [ 173733.66666667],
       [ 118843.41666667],
       [ 219541.66666667],
       [ 264667.        ],
       [ 143764.08333333],
       [ 807119.        ],
       [  77549.5       ],
       [1009044.83333333],
       [ 291132.16666667],
       [1344539.75      ],
       [ 339622.41666667],
       [ 178822.58333333],
       [ 364072.25      ],
       [ 410266.        ],
       [ 314934.25      ],
       [ 145293.66666667],
       [ 495766.25      ],
       [ 298964.41666667],
       [ 172147.16666667],
       [  92887.33333333],
       [ 289668.16666667],
       [ 226469.        ],
       [ 203284.75      ],
       [ 326468.25      ],
       [ 267337.16666667],
       [ 318697.75      ],
       [ 262945.58333333],
 

In [64]:
len(x_train)

485

In [65]:
len(y_train)

485

In [57]:
len(x_test)

208

In [66]:
len(y_test)

208

In [67]:
len(X)

693

In [68]:
len(Y)

693

In [69]:
len(y_prediction)

208

In [70]:
y_prediction

array([[374908.19527295],
       [723160.37357032],
       [376071.95093389],
       [438195.55791125],
       [201145.12231958],
       [319366.20638527],
       [399870.50083376],
       [617192.91839176],
       [448207.52379743],
       [303888.37401484],
       [357615.20883626],
       [280167.07191633],
       [226518.64443077],
       [340333.04035425],
       [289123.86570227],
       [525127.13630121],
       [241001.44980414],
       [485130.03252118],
       [353235.33118789],
       [517953.50149251],
       [429486.59608795],
       [337649.51553157],
       [306145.69631688],
       [264425.43518151],
       [324426.87394244],
       [228288.80731074],
       [478039.80381684],
       [334872.04768359],
       [198841.95425925],
       [122275.15106122],
       [360712.49030564],
       [268581.27838483],
       [271105.99736946],
       [380500.61036017],
       [434644.17486244],
       [327396.59760841],
       [360078.12369006],
       [370516.82744559],
       [ 481

In [71]:
len(y_prediction)

208

In [72]:
#convert to numeric type
dataLondon['houses_sold']=dataLondon['houses_sold'].astype('int64')
dataLondon['no_of_crimes']=dataLondon['no_of_crimes'].astype('int64')
dataLondon['mean_salary']=dataLondon['mean_salary'].astype('float64')
dataLondon['number_of_jobs']=dataLondon['number_of_jobs'].astype('int64')
dataLondon['median_salary']=dataLondon['median_salary'].astype('float64')
dataLondon['population_size']=dataLondon['population_size'].astype('int64')
dataLondon['no_of_houses']=dataLondon['no_of_houses'].astype('int64')
dataLondon['area_size']=dataLondon['area_size'].astype('int64')
dataLondon['life_satisfaction']=dataLondon['life_satisfaction'].astype('int64')

In [73]:
# importing r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# predicting the accuracy score
score=r2_score(y_test,y_prediction)
print('r2 score is (House Price) : ',score)
print('mean_sqrd_error is==',mean_squared_error(y_test,y_prediction))
print('root_mean_squared error of is==',np.sqrt(mean_squared_error(y_test,y_prediction)))

r2 score is (House Price) :  0.4174324285168557
mean_sqrd_error is== 22169571464.775547
root_mean_squared error of is== 148894.49776528194


In [74]:
dataLondon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693 entries, 0 to 692
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   area               693 non-null    object 
 1   year               693 non-null    int64  
 2   median_salary      693 non-null    float64
 3   life_satisfaction  693 non-null    int64  
 4   mean_salary        693 non-null    float64
 5   population_size    693 non-null    int64  
 6   number_of_jobs     693 non-null    int64  
 7   area_size          693 non-null    int64  
 8   no_of_houses       693 non-null    int64  
 9   average_price      693 non-null    float64
 10  houses_sold        693 non-null    int64  
 11  no_of_crimes       693 non-null    int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 65.1+ KB


In [75]:
#import library for finding the summary of the regression
import statsmodels.api as sm
reg2=sm.OLS(dataLondon["average_price"], sm.add_constant(dataLondon[['number_of_jobs','mean_salary','median_salary',
                                                                     'no_of_crimes','life_satisfaction','population_size']])).fit()

#print the regression summary
print(reg2.summary())

                            OLS Regression Results                            
Dep. Variable:          average_price   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     120.9
Date:                Tue, 20 Sep 2022   Prob (F-statistic):          5.20e-104
Time:                        10:27:36   Log-Likelihood:                -9169.2
No. Observations:                 693   AIC:                         1.835e+04
Df Residuals:                     686   BIC:                         1.838e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              2.716e+05   3.34e+0