In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#sklearn package for LR:
from sklearn.linear_model import LinearRegression

## Simple linear regression

In [2]:
# Load the single-feature (size -> price) housing dataset and preview the first rows.
data = pd.read_csv(filepath_or_buffer='real_estate_price_size_sklearn.csv')
data.head()

Unnamed: 0,price,size
0,234314.144,643.09
1,228581.528,656.22
2,281626.336,487.29
3,401255.608,1504.75
4,458674.256,1275.46


In [11]:
# Feature (input variable): house size, still a 1-D Series at this point.
x = data['size']
# Target (output variable): the price the model should predict.
y = data['price']
# Only the last expression of a cell is displayed, so a bare `x.shape` above
# `y.shape` is never shown. Check explicitly that feature and target have the
# same shape, then display the shared shape.
assert x.shape == y.shape, "feature and target must have the same number of observations"
y.shape

(100,)

In [6]:
# Create an unfitted LinearRegression estimator with its default settings;
# it is trained in a later cell via reg.fit(...).
reg = LinearRegression()

In [10]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features),
# so turn the 1-D size values into a single-column matrix.
# -1 lets NumPy infer the row count instead of hard-coding 100, so this cell
# keeps working when the dataset has a different number of observations.
x_matrix = x.values.reshape(-1, 1)
x_matrix.shape

(100, 1)

In [12]:
# Train the model on the single-column size matrix against price.
reg.fit(X=x_matrix, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# score() reports the R-squared of the fitted regression on the data passed in
# (here the same data the model was trained on).
reg.score(X=x_matrix, y=y)

0.7447391865847586

In [14]:
# coef_ is a NumPy array holding one coefficient per feature;
# with a single feature (size) it contains a single value.
reg.coef_

array([223.17874259])

In [15]:
# intercept_ is a plain float here: the model has exactly one intercept term.
reg.intercept_

101912.60180122906

In [16]:
# predict() takes a 2-D array of shape (n_samples, n_features). A bare scalar
# is not a valid feature matrix and is rejected by current scikit-learn, so
# wrap the single size value as one sample with one feature.
reg.predict([[750]])

array([269296.65874718])

## Multiple linear regression

In [18]:
# Load the two-feature (size, year -> price) housing dataset and preview the first rows.
data_two = pd.read_csv(filepath_or_buffer='real_estate_price_size_year.csv')
data_two.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [20]:
# Target: price. Features: size and year.
# The double brackets select a 2-D DataFrame, so no reshape is needed this time.
# Note: this rebinds x and y, replacing the simple-regression variables above.
y = data_two['price']
x = data_two[['size', 'year']]

In [21]:
# Fresh estimator for the two-feature model (rebinds reg), trained on the raw features.
reg = LinearRegression()
reg.fit(X=x, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# One coefficient per column of x, in column order (size, year).
reg.coef_

array([ 227.70085401, 2916.78532684])

In [24]:
# Intercept of the model fit on the raw (unscaled) size and year features.
reg.intercept_

-5772267.017463278

In [22]:
# Adjusted R-squared corrects R-squared for the number of predictors:
#     1 - (1 - R^2) * (n - 1) / (n - p - 1)
r2 = reg.score(x, y)

# x is 2-D, so its shape unpacks to (n observations, p predictors).
n, p = x.shape

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.77187171612825

In [25]:
from sklearn.feature_selection import f_regression

In [26]:
# f_regression runs a separate univariate F-test of each feature against y and
# returns (F-statistics, p-values). These are per-feature tests, not the
# p-values of the joint multiple regression, so read them as a screening aid.
# Call it once and unpack the pair, rather than recomputing it and leaving
# bare expressions whose values are never displayed.
f_statistics, p_values = f_regression(x, y)

# Three decimals are enough to judge significance at the usual thresholds.
p_values.round(3)


array([0.   , 0.357])

In [27]:
# Start a summary table with one row per predictor column of x.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,size
1,year


In [28]:
# Add the fitted coefficient and the rounded p-value next to each feature.
# assign() returns a new frame, so re-running this cell gives the same table.
reg_summary = reg_summary.assign(**{
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,227.700854,0.0
1,year,2916.785327,0.357


In [29]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Learn the scaling statistics of each feature column from x.
# The same fitted scaler must be reused for any data passed to the model later.
scaler = StandardScaler()
scaler.fit(X=x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [36]:
# Apply the fitted scaler to the features; the result is a NumPy array
# with the same two columns (size, year) in standardized form.
x_scaled = scaler.transform(X=x)
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

In [37]:
# Refit a fresh model on the standardized features (rebinds reg once more).
# From here on, reg expects scaled inputs.
reg = LinearRegression()
reg.fit(X=x_scaled, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
# Summary of the standardized model: the intercept ("Bias") followed by one
# weight per feature ("weight" is another term for coefficient).
# Labels and values are taken from x.columns and reg.coef_ rather than being
# hard-coded, so the table stays aligned if the feature set changes.
reg_summary = pd.DataFrame({
    'Features': ['Bias', *x.columns],
    'Weights': [reg.intercept_, *reg.coef_],
})

In [41]:
# Display the bias/weights table of the model trained on standardized features.
reg_summary

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,size,67501.576142
2,year,13724.397082


In [45]:
# Two unseen houses to predict prices for, using the training column names.
new_data = pd.DataFrame({
    'size': [600, 800],
    'year': [2015, 2012],
})
new_data

Unnamed: 0,size,year
0,600,2015
1,800,2012


In [46]:
# WARNING: reg was trained on standardized features, but new_data holds raw
# size/year values. The predictions below are therefore meaningless (tens of
# millions); the inputs must first go through the same fitted scaler.
reg.predict(X=new_data)

array([68447895.27592413, 81907037.3129816 ])

In [47]:
# Standardize the new observations with the scaler fitted on the training
# features (transform only, never re-fit on new data).
new_data_scaled = scaler.transform(X=new_data)
new_data_scaled

array([[-0.85351824,  0.51006137],
       [-0.1788648 , -0.12751534]])

In [48]:
# Predict from the standardized inputs, matching how the model was trained.
reg.predict(X=new_data_scaled)

array([241675.92851985, 278465.74334083])