In [21]:
### House price prediction

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import xgboost as xgbHP

In [3]:
londonSalesData = pd.read_csv('London_House_Price_MonthWise.csv')

In [26]:
#load the data
londonSalesData

Unnamed: 0,date,area,average_price,code,houses_sold,no_of_crimes,borough_flag
0,1995-01-01,city of london,91449,E09000001,17.0,,1
1,1995-02-01,city of london,82203,E09000001,7.0,,1
2,1995-03-01,city of london,79121,E09000001,14.0,,1
3,1995-04-01,city of london,77101,E09000001,7.0,,1
4,1995-05-01,city of london,84409,E09000001,10.0,,1
...,...,...,...,...,...,...,...
13544,2019-09-01,england,249942,E92000001,64605.0,,0
13545,2019-10-01,england,249376,E92000001,68677.0,,0
13546,2019-11-01,england,248515,E92000001,67814.0,,0
13547,2019-12-01,england,250410,E92000001,,,0


In [4]:
londonSalesData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB


In [5]:
def process_input_HP(df_house_price):
    df_house_price = df_house_price.copy()
    
    df_house_price.head(2)
    
    # Drop the redundant column
    df_house_price = df_house_price.drop('code', axis=1)
    
    # Drop column contain missing values
    df_house_price = df_house_price.drop('no_of_crimes', axis=1)
    
    # Drop rows having missing target value
    missing_target_rows = df_house_price[df_house_price['houses_sold'].isna()].index
    df_house_price = df_house_price.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    # process date
    df_house_price['date'] = pd.to_datetime(df_house_price['date'])
    df_house_price['year'] = df_house_price['date'].apply(lambda x: x.year)
    df_house_price['month'] = df_house_price['date'].apply(lambda x: x.month)
    df_house_price = df_house_price.drop('date', axis=1)
    
    # #Seperate the columns based on area
    area_dummies = pd.get_dummies(df_house_price['area'], prefix='area')
    df_house_price = pd.concat([df_house_price, area_dummies], axis=1)
    df_house_price = df_house_price.drop('area', axis=1)
    
    # df_house_price will be split into X and y
    y = df_house_price['average_price']
    X = df_house_price.drop('average_price', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = process_input_HP(londonSalesData)

In [7]:
X_train

Unnamed: 0,houses_sold,borough_flag,year,month,area_barking and dagenham,area_barnet,area_bexley,area_brent,area_bromley,area_camden,...,area_south east,area_south west,area_southwark,area_sutton,area_tower hamlets,area_waltham forest,area_wandsworth,area_west midlands,area_westminster,area_yorks and the humber
10752,-0.005908,-1.657173,1.542734,1.315615,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
10236,0.222400,-1.657173,-0.963307,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
4512,-0.298115,0.603437,-1.380980,-1.300369,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
9208,-0.293663,0.603437,0.985836,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,6.754536,-0.152846,-0.152117,-0.152482,-0.151385
2672,-0.303063,0.603437,1.542734,-0.428374,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,-0.298033,0.603437,-1.659429,0.734285,-0.145406,-0.150649,6.720615,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
5192,-0.281790,0.603437,-0.545633,1.606280,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
12172,0.323815,-1.657173,0.707387,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
235,-0.318976,0.603437,0.985836,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385


In [8]:
#display y-train
y_train

10752    128885
10236    137190
4512      73211
9208     342013
2672     473140
          ...  
905       64510
5192     203931
12172    196204
235      761544
13349    173811
Name: average_price, Length: 9418, dtype: int64

In [10]:
print(len(X_train))
print(len(y_train))

#print(len(X_test))
#print(len(y_test))


9418
9418


In [11]:
#building the model
predictionModels = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor()
   # "                              LightGBM": LGBMRegressor(),
   # "                              CatBoost": CatBoostRegressor(verbose=0)
}

In [12]:
for name, m in predictionModels.items():
    m.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.


In [13]:
#Get RMSE value for all the models
print("           Root Mean Squared Error(RMSE) ")
print(" ------------------------------------------------------- ")
for name, m in predictionModels.items():
    y_pred = m.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(name + " RMSE: {:.4f}".format(rmse))

           Root Mean Squared Error(RMSE) 
 ------------------------------------------------------- 
                     Linear Regression RMSE: 72256.6418
 Linear Regression (L2 Regularization) RMSE: 72254.9678
 Linear Regression (L1 Regularization) RMSE: 72255.2946
                   K-Nearest Neighbors RMSE: 17393.5485
                        Neural Network RMSE: 286863.8589
Support Vector Machine (Linear Kernel) RMSE: 315162.8536
   Support Vector Machine (RBF Kernel) RMSE: 190058.0919
                         Decision Tree RMSE: 16212.3469
                         Random Forest RMSE: 11725.3718
                     Gradient Boosting RMSE: 39577.2622
                               XGBoost RMSE: 13540.8596


In [14]:
print('The best model is Random Forest with 11674.9879 RMSE. ')


The best model is Random Forest with 11674.9879 RMSE. 


In [15]:
#find R-Squred value - the square of the correlation
print("                        R-Squared ")
print(" ------------------------------------------------------- ")
for name, m in predictionModels.items():
    print(name + " : R-Squared: {:.4f}".format(m.score(X_test, y_test)))

                        R-Squared 
 ------------------------------------------------------- 
                     Linear Regression : R-Squared: 0.8482
 Linear Regression (L2 Regularization) : R-Squared: 0.8482
 Linear Regression (L1 Regularization) : R-Squared: 0.8482
                   K-Nearest Neighbors : R-Squared: 0.9912
                        Neural Network : R-Squared: -1.3929
Support Vector Machine (Linear Kernel) : R-Squared: -1.8883
   Support Vector Machine (RBF Kernel) : R-Squared: -0.0504
                         Decision Tree : R-Squared: 0.9924
                         Random Forest : R-Squared: 0.9960
                     Gradient Boosting : R-Squared: 0.9545
                               XGBoost : R-Squared: 0.9947


In [16]:
print('how best is the model? 99 percent accuracy when predicting the house price.')

how best is the model? 99 percent accuracy when predicting the house price.
