In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
df = pd.read_csv("RealEstate.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0_level_0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


# Split to Dependent and Independent Variables
For a typical problem in Machine Learning, we will have something we need to predict given a set of other values. The something we want to predict is called a dependent variable, while the set of values given are the independent variables. In this case, Y house price of unit area is the dependent variable. Others are regarded as independent ones.

In [4]:
# Let us list the column names, so that we can easily select the dependent and independent variables
df.columns

Index(['X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [5]:
# Stores the label for the independent variable
y_label = ['Y house price of unit area']

In [6]:
# Stores the labels for the 'other' dependent variables
x_labels = ['X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']

In [7]:
# Now the data is still in a dataframe format
df[x_labels]

Unnamed: 0_level_0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2012.917,32.0,84.87882,10,24.98298,121.54024
2,2012.917,19.5,306.59470,9,24.98034,121.53951
3,2013.583,13.3,561.98450,5,24.98746,121.54391
4,2013.500,13.3,561.98450,5,24.98746,121.54391
5,2012.833,5.0,390.56840,5,24.97937,121.54245
...,...,...,...,...,...,...
410,2013.000,13.7,4082.01500,0,24.94155,121.50381
411,2012.667,5.6,90.45606,9,24.97433,121.54310
412,2013.250,18.8,390.96960,7,24.97923,121.53986
413,2013.000,8.1,104.81010,5,24.96674,121.54067


In [8]:
# Let us convert them into matrices
X = np.asarray(df[x_labels])
y = np.asarray(df[y_label])

# Normalize the Data
Normalizing is an important step in machine learning. Its function is to make the variables smaller in value. If the values we input to the machine learning model has a range that is too big, then it will not *converge*, or give a proper result. Here we will normalize all the dependent data.

In [9]:
# Normalize the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X)
X = scaler.transform(X)

# Train and Test split
The data will be split into train and test because we need the data to train the model, but also spare some to test the model's validity and accuracy.

In [10]:
# Split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [11]:
X_train

array([[1.        , 0.15068493, 0.0103754 , 0.9       , 0.51211827,
        0.75016174],
       [0.90938865, 0.0913242 , 0.32855561, 0.3       , 0.37469704,
        0.42387319],
       [0.81877729, 0.30136986, 0.26130607, 0.2       , 0.38839069,
        0.46549493],
       ...,
       [0.        , 0.84703196, 0.1384846 , 0.1       , 0.48364033,
        0.83135648],
       [0.09061135, 0.29452055, 0.03515249, 0.7       , 0.41190015,
        0.74875997],
       [0.54585153, 0.2739726 , 0.20677973, 0.1       , 0.24200194,
        0.80752642]])

In [12]:
y_train[0:5]

array([[58.1],
       [33.4],
       [30.8],
       [21.7],
       [20.9]])

# Multiple Linear Regression Model

Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several (multiple) variables to predict the outcome of a (one) response variable. The `scikit-learn` library uses a linear regression algorithm called the OLS.

In [13]:
# Import the machine learning library Scikit-Learn
from sklearn import linear_model

regr = linear_model.LinearRegression()

In [14]:
# Fit the model to the training data
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Linear regression coefficients
print(f"Regression coefficients [x1 x2 x3 x4 x5 x6]: {regr.coef_}")

Regression coefficients [x1 x2 x3 x4 x5 x6]: [[  5.19752289 -11.42703184 -29.83092874  10.82105149  19.13379372
   -1.8207899 ]]


In [16]:
# Use the model to predict the result
prediction = regr.predict(X_test)
print(prediction[0:5])
print(y_test[0:5])

[[34.6920798 ]
 [47.30118939]
 [21.92669234]
 [12.62287202]
 [42.69787196]]
[[27.3]
 [54.4]
 [22. ]
 [11.6]
 [45.4]]


In [17]:
# Compare the values to get the score
score = regr.score(X_test, y_test)
print("The accuracy of the prediction is", score)

The accuracy of the prediction is 0.754459684843172


In [18]:
x_input = np.array([])
for x in x_labels:
    x_input = np.append(x_input, input(x + ": "))
    
x_input = x_input.reshape(1,-1)

x_input = scaler.transform(x_input)
print(x_input)

print("Your dream House price is", regr.predict(x_input))

X1 transaction date:  2012
X2 house age:  4
X3 distance to the nearest MRT station:  340
X4 number of convenience stores:  5
X5 latitude:  20
X6 longitude:  182


[[-7.28165939e-01  9.13242009e-02  4.89767799e-02  5.00000000e-01
  -5.97681774e+01  6.52646862e+02]]
Your dream House price is [[-2299.76059302]]


# SVM
This is another kind of a machine learning model. [Link to Wiki](https://en.wikipedia.org/wiki/Support_vector_machine#:~:text=In%20machine%20learning%2C%20support%2Dvector,for%20classification%20and%20regression%20analysis)

In [19]:
from sklearn import svm

regr = svm.SVR(gamma='scale', kernel='rbf')

In [20]:
regr.fit(X_train, y_train.ravel())

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

print(f"Regression coefficients [x1 x2 x3 x4 x5 x6]: {regr.coef_}")

In [21]:
# Use the model to predict the result
prediction = regr.predict(X_test)
print(prediction[0:5])
print(y_test[0:5])

[29.87875409 47.25816185 26.1199992  20.38545237 39.37594551]
[[27.3]
 [54.4]
 [22. ]
 [11.6]
 [45.4]]


In [22]:
# Compare the values to get the score
score = regr.score(X_test, y_test)
print("The accuracy of the prediction is", score)

The accuracy of the prediction is 0.8099065985687073


# Random Forest Regressor
This is also another kind of a learning model. [Link to wiki](https://en.wikipedia.org/wiki/Random_forest)

In [23]:
from sklearn import ensemble

regr = ensemble.RandomForestRegressor(n_estimators=100)

In [24]:
regr.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

print(f"Regression coefficients [x1 x2 x3 x4 x5 x6]: {regr.coef_}")

In [25]:
# Use the model to predict the result
prediction = regr.predict(X_test)
print(prediction[0:5])
print(y_test[0:5])

[25.179      48.8805     23.906      13.804      58.59908333]
[[27.3]
 [54.4]
 [22. ]
 [11.6]
 [45.4]]


In [26]:
# Compare the values to get the score
score = regr.score(X_test, y_test)
print("The accuracy of the prediction is", score)

The accuracy of the prediction is 0.7944296990283074
