# MULTIPLE LINEAR REGRESSION WITH SCIKIT-LEARN

Import the necessary libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

Load the data with pandas `read_csv`, which returns it as a DataFrame

In [5]:
data = pd.read_csv('real_estate_price_size_year.csv')  # Relative path: the CSV is looked up in the current working directory.

First 5 records of the DataFrame

In [7]:
data.head()  # Preview the first rows (head() shows 5 by default).

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [8]:
data.describe() # Basic statistical summary (count, mean, std, min, quartiles, max) of the DataFrame columns.

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


In [9]:
# Predictor matrix (size, year) and target vector (price), both taken from the loaded frame.
x = data.loc[:, ['size', 'year']]
y = data.loc[:, 'price']

In [10]:
x.shape   # (n_samples, n_features); scikit-learn expects a 2-D feature matrix when fitting.

(100, 2)

In [11]:
reg = LinearRegression()  # Create an unfitted LinearRegression estimator with default settings.

In [12]:
reg.fit(x,y)  # Fit the model: regress price (y) on size and year (x).

In [13]:
reg.get_params()  # Constructor settings (hyperparameters) of the estimator, not the learned coefficients.

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [14]:
reg.coef_  # Learned coefficients, one per feature in column order of x (size, year).

array([ 227.70085401, 2916.78532684])

In [15]:
reg.intercept_  # Learned intercept (constant term) of the regression.

-5772267.017463279

R squared (coefficient of determination): $ R^2 = 1 - \frac{SS_{res}}{SS_{tot}} $

In [17]:
reg.score(x,y) # R squared, evaluated on the same x and y the model was fitted on.

0.7764803683276795

Equation for adjusted R squared: $ R^2_{adj.} = 1 - (1 - R^2)\cdot\frac{n-1}{n-p-1} $, where $n$ is the number of observations and $p$ is the number of features.

In [19]:
x.shape  # (n, p): number of observations and number of features, used for adjusted R squared below.

(100, 2)

In [20]:
# Inputs of the adjusted R-squared formula: training R squared, observation count n, feature count p.
r2 = reg.score(x, y)
n, p = x.shape

In [21]:
adj_R2 = 1 - ( 1 - r2 ) * (n - 1)/(n - p - 1)  # Adjusted R squared computed by hand from r2, n and p; it discounts R squared for the number of features.
adj_R2

0.7718717161282502

In [22]:
from sklearn.feature_selection import f_regression  # Importing f_regression to get p-value.

In [23]:
f_regression(x, y)  # Returns (F-statistics, p-values), one entry per feature; each feature is tested on its own (univariate test).

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [24]:
p_values = f_regression(x, y)[1]  # Element [1] of the returned tuple is the array of p-values.
p_values

array([8.12763222e-31, 3.57340758e-01])

In [25]:
p_values.round(3)  # Rounded to 3 decimals for readability; very small p-values display as 0.

array([0.   , 0.357])

Build a summary table with the coefficient and p-value of each feature; the p-value indicates whether a feature is a statistically significant predictor.


In [27]:
# One row per feature (independent variable), named after the columns of x.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,size
1,year


In [28]:
# Attach the fitted coefficients (weights) and the rounded p-values to the summary table.
reg_summary = reg_summary.assign(
    Weights=reg.coef_,
    p_values=p_values.round(3),
)
reg_summary

Unnamed: 0,Features,Weights,p_values
0,size,227.700854,0.0
1,year,2916.785327,0.357


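Using the conventional significance threshold of 0.05, `size` (p ≈ 0.000) is a significant predictor of price, but `year` (p = 0.357) is not. Note that `f_regression` tests each feature in a separate univariate regression, so these p-values do not account for the other feature in the model.
scikit-learn's `LinearRegression` is ordinary least squares: it does not penalize or shrink the weight of a weak feature, and it does not report p-values, which is why they are computed separately here. Once the features are standardized (below), the size of each weight can be compared directly, and a weak feature shows up with a comparatively small weight.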

---------------------------------------------------------------------------------------------------------------------------------------------------------

Creating a regression model with standardized features to remove the unequal magnitudes of the features.

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler() # Create an unfitted StandardScaler with default settings.

In [34]:
scaler.fit(x)  # Learn the mean and standard deviation of each feature from x; nothing is transformed yet.

In [35]:
scaler.get_params()  # Constructor settings of the scaler, not the means/scales learned by fit.

{'copy': True, 'with_mean': True, 'with_std': True}

In [36]:
x_scaled = scaler.transform(x) # Standardize the features with the statistics learned by scaler.fit; the result is a NumPy array.
x_scaled  # Scaled features (displays the whole array).

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

In [37]:
reg = LinearRegression()  # Fresh estimator for the standardized features; this rebinds `reg`, replacing the earlier model.

In [38]:
reg.fit(x_scaled,y) # Fit the new regression model on the standardized features.

In [39]:
reg.get_params()  # Constructor settings (hyperparameters) of the estimator, not the learned coefficients.

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [40]:
reg.coef_  # Coefficients for the standardized features: change in price per one standard deviation of each feature.

array([67501.57614152, 13724.39708231])

In [41]:
reg.intercept_  # With mean-centred features the intercept equals the mean price (compare with describe()).

292289.4701599997

Making a new summary for the standardized data to compare how strongly each feature contributes to the regression.

In [43]:
# Summary for the standardized model: 'Bias' is the intercept, the other rows are the feature coefficients.
reg_summary = pd.DataFrame({'Features': ['Bias', 'size', 'year']})
reg_summary['Weights'] = [reg.intercept_, reg.coef_[0], reg.coef_[1]]
reg_summary

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,size,67501.576142
2,year,13724.397082


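Creating new data to see how the model performs in prediction. Note that the year 2003 lies outside the range of the training data (2006–2018), so the first prediction is an extrapolation.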

In [45]:
# Two unseen observations, using the same column names the scaler and model were fitted on.
new_data = pd.DataFrame({
    'size': [750, 1300],
    'year': [2003, 2007],
})
new_data

Unnamed: 0,size,year
0,750,2003
1,1300,2007


Before making predictions we must scale the new data with the same fitted scaler, because the model was trained only on standardized features.

In [47]:
new_data_scaled = scaler.transform(new_data)  # Reuse the already-fitted scaler so the new rows are scaled with the training mean/std.
new_data_scaled

array([[-0.34752816, -2.04024548],
       [ 1.50776881, -1.1901432 ]])

In [48]:
reg.predict(new_data_scaled).round(3) # Predicted prices for the new data, rounded to 3 decimals.

array([240829.633, 377732.244])

Fit a simple linear regression that uses only `size`, and check how it performs on the new data.

In [50]:
# Single-feature model: keep only the first standardized column (size) as an (n_samples, 1) matrix.
reg_simple = LinearRegression()
reg_simple_matrix = x_scaled[:, :1]

In [51]:
reg_simple.fit(reg_simple_matrix, y)  # Fit price on the standardized size column only.

In [52]:
reg_simple.get_params()  # Constructor settings (hyperparameters) of the estimator, not the learned coefficients.

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [53]:
# Predict from the standardized size column of the new data only, rounded to 3 decimals.
reg_simple.predict(new_data_scaled[:, :1]).round(3)

array([269296.659, 392044.967])