In [2]:
# Import the necessary libraries, packages and modules.
import statsmodels.api as sm
import sklearn
from sklearn import datasets
import numpy as np
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read the house price data file
data = pd.read_csv('house_prices.csv')

# Show the column summary. DataFrame.info() prints the report itself and
# returns None, so wrapping it in print() emits a stray "None" line.
data.info()
# Display the first rows of the table
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rooms     506 non-null    float64
 1   Distance  506 non-null    float64
 2   Value     506 non-null    float64
dtypes: float64(3)
memory usage: 12.0 KB
None


Unnamed: 0,Rooms,Distance,Value
0,6.575,4.09,24.0
1,6.421,4.9671,21.6
2,7.185,4.9671,34.7
3,6.998,6.0622,33.4
4,7.147,6.0622,36.2


In [5]:
# Independent variables (predictors): number of rooms and distance
X = data.loc[:, ['Rooms', 'Distance']]

# Dependent variable (target): the house value
y = data.loc[:, 'Value']

In [7]:
# Build the multiple linear regression and fit it on every observation
# (fit() returns the estimator itself, so creation and fitting can be chained)
mlr = linear_model.LinearRegression().fit(X, y)

# In-sample predictions: the fitted value for each row the model was trained on
mlr.predict(X)

array([25.23262311, 24.30597474, 31.03025338, 29.9197274 , 31.23113776,
       24.92052548, 20.99628003, 22.59515685, 17.89792552, 21.43016488,
       24.59312806, 21.29554669, 19.86012857, 20.02480328, 21.19854962,
       18.91052046, 19.79946305, 20.16587486, 15.24036623, 17.62554884,
       16.24441157, 19.82577837, 21.36632302, 18.52848931, 19.65425152,
       16.82067934, 18.81534563, 20.76312523, 24.70679323, 26.17680132,
       17.71571146, 20.84706509, 19.68285587, 17.39216584, 20.85532906,
       19.22540394, 18.42427779, 18.77543693, 19.75391977, 26.04958067,
       29.82538634, 27.7461615 , 22.45651299, 22.82617229, 21.57637181,
       17.86689491, 18.78224174, 21.21771802, 15.7523132 , 17.64542212,
       21.17812468, 22.51593928, 26.00129836, 21.48617409, 20.7648873 ,
       33.41670435, 26.03470634, 29.42393915, 23.26887906, 20.91861579,
       19.42498135, 21.20638654, 25.71803969, 28.7805479 , 32.39778062,
       23.95685233, 19.52974218, 20.27518634, 17.77558538, 20.33

### Check the metrics 
* R squared
* Intercept
* Coefficients

In [12]:
# R-squared is computed on the same rows the model was fitted on
r_squared = mlr.score(X, y)
print('R-squared: ', r_squared)
print('Intercept: ', mlr.intercept_)
print('Coefficients: ')
# Pair every feature column name with its fitted coefficient
list(zip(X.columns, mlr.coef_))

R-squared:  0.4955246476058477
Intercept:  -34.63605017547333
Coefficients: 


[('Rooms', 8.801411828632595), ('Distance', 0.488848536567123)]

In [19]:
# Create variables holding hypothetical inputs to see what the model would predict
New_rooms = 5.75
New_distance = 10
# Wrap the inputs in a one-row DataFrame that reuses the training column names:
# the model was fitted on a DataFrame, and passing a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
new_observation = pd.DataFrame([[New_rooms, New_distance]], columns=X.columns)
print('Predicted Value: \n', mlr.predict(new_observation))

Predicted Value: 
 [20.8605532]




In [20]:
# Try it out with some more values (more rooms, same distance)
New_rooms = 8.75
New_distance = 10
# Use a one-row DataFrame with the training column names so the input matches
# what the model was fitted on; a bare nested list makes scikit-learn warn
# that X does not have valid feature names.
new_observation = pd.DataFrame([[New_rooms, New_distance]], columns=X.columns)
print('Predicted Value: \n', mlr.predict(new_observation))

Predicted Value: 
 [47.26478869]




### We can also create two subsets to see how the model will work on unknown data

In [32]:
# Hold out 20% of the rows as a test set (80:20 train/test split).
# random_state is fixed at 100 so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [33]:
# Train the model with statsmodels OLS on the training split

# add_constant supplies the intercept column (reported as `const` in the summary)
train_design = sm.add_constant(X_train)
model = sm.OLS(y_train, train_design).fit()

# Predicted response vector for the held-out test rows (same constant column added)
test_design = sm.add_constant(X_test)
y_pred = model.predict(test_design)

# Print the full regression summary table
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  Value   R-squared:                       0.492
Model:                            OLS   Adj. R-squared:                  0.489
Method:                 Least Squares   F-statistic:                     194.0
Date:                Sat, 23 Jul 2022   Prob (F-statistic):           1.15e-59
Time:                        18:24:42   Log-Likelihood:                -1325.0
No. Observations:                 404   AIC:                             2656.
Df Residuals:                     401   BIC:                             2668.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -33.0083      2.879    -11.466      0.0

In [34]:
# NOTE: this rebinds `mlr` (fitted on all rows earlier in the notebook) to a
# new model fitted on the training split only.
mlr = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows with the training-split model
y_pred_mlr = mlr.predict(X_test)
print('Prediction for test set: {}'.format(y_pred_mlr))

Prediction for test set: [32.84170204 24.79758581 20.579617   21.05490173 17.79774956 22.98757205
 25.86832879 23.31890801 23.68545938 22.87509946 23.70730711 16.9176725
 26.25045291 20.95413666 36.78002211 28.47689886 25.11424585 10.84676171
 31.40875594 39.86969287 29.78987503 27.44651286 18.54095161 19.40736987
 17.79028485 19.48178357 27.84521604 23.71129088 17.33936814 19.07334403
 18.10835374 20.03742329 43.2171108  24.26414697 28.17793387 29.88746758
 17.05386352 21.08365176 19.69451596 19.03684817 21.19127377 24.72802032
 19.41257797 19.79310205 21.46520067 25.98696879 21.11959823 15.7339515
 19.20682804 19.23190752 21.74447713 20.9262753  22.66455714 28.31416712
 13.49011791 14.5560643  30.87261834 29.79983541 18.47996362 22.8221721
 22.42561549 26.4742037  16.7407955  32.43112451 21.60544817 23.54688296
 18.90159908 30.42102804 19.2913621  20.54218546 23.86522513 21.31460455
  5.14798281 21.56242825 27.77069462 20.67873765 22.97418644 37.87067641
 17.91009163 23.50629655 35.7

In [35]:
# R-squared of the current `mlr` model evaluated on the held-out test set
test_score = mlr.score(X_test, y_test)
print(test_score)

0.5042821858062394


### Multicollinearity

In [37]:
# Variance inflation factor for every column of the training design matrix
# (the predictors plus the constant column added by add_constant)
x_temp = sm.add_constant(X_train)
vif = pd.DataFrame({
    'VIF Factor': [
        variance_inflation_factor(x_temp.values, col_idx)
        for col_idx in range(x_temp.shape[1])
    ],
    'features': x_temp.columns,
})
print(vif.round(2))

   VIF Factor  features
0       80.43     const
1        1.05     Rooms
2        1.05  Distance


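Since the VIF factors for `Rooms` and `Distance` are both 1.05, i.e. close to 1, we don't need to worry about multicollinearity. (The large value reported for `const` belongs to the intercept term and does not indicate collinearity between the predictors.)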

### Predicting values

We need to judge how poor or how good our model is

In [38]:
# Error metrics on the held-out test set; y_pred holds the statsmodels OLS predictions
mae_final = metrics.mean_absolute_error(y_test, y_pred)
mse_final = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (final): ', mae_final)
print('Mean Squared Error (final): ', mse_final)

Mean Absolute Error (final):  4.570493627532387
Mean Squared Error (final):  47.883530163510606


The smaller these values, the better: the closer they are to 0, the closer we are to a perfect model with perfect predictions.

Run other regression models with other variables and compare the MAE and MSE.