In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression # module for F-statistics and P-value
import seaborn as sns
sns.set()

In [49]:
# Location of the real-estate dataset (price, size, year).
# NOTE(review): this is an absolute path on one machine; anyone else running the
# notebook has to point DATA_PATH at their own copy of the CSV.
DATA_PATH = r'C:\Users\prakh\Downloads\real_estate_price_size_year.csv'
raw_data = pd.read_csv(DATA_PATH)

In [50]:
raw_data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [51]:
# Predictors used by the regression, in the order the fitted coefficients will follow.
feature_columns = ['size', 'year']
x = raw_data[feature_columns]

# Target (dependent variable).
y = raw_data['price']


In [52]:
x.shape

(100, 2)

In [53]:
# Linear regression estimator created with scikit-learn's default settings
reg = LinearRegression()

In [54]:
# Fit price (y) on size and year (x) using every row of the loaded data
reg.fit(x,y)

LinearRegression()

In [55]:
# R-squared of the fitted model, scored on the same x and y it was fitted on (in-sample)
r2 = reg.score(x,y)

In [56]:
r2

0.7764803683276793

In [57]:
# Intercept (constant term) of the fitted regression
reg.intercept_

-5772267.01746328

In [58]:
# One coefficient per feature, in the column order of x: 'size' first, then 'year'
coef = reg.coef_

In [59]:
coef 

array([ 227.70085401, 2916.78532684])

In [60]:
# Coefficients as a plain Python list ('size' first, then 'year').
# Read from reg.coef_ rather than from `coef` itself: once `coef` is a list it has
# no .tolist(), so the self-referential form raised AttributeError when the cell
# was run a second time.
coef = reg.coef_.tolist()

In [61]:
coef

[227.7008540074764, 2916.785326838038]

In [62]:
# In-sample predictions: the model's predicted price for every row of x
reg.predict(x)

array([251487.55831903, 236976.55857112, 224762.12124518, 447688.27618312,
       377978.03540671, 209775.60239039, 235047.55665969, 220165.59235875,
       269156.95675102, 245697.50127961, 328999.58170971, 507095.80467165,
       263198.21324064, 343612.85900293, 375034.05120339, 392967.77046502,
       271042.3198222 , 220876.01902325, 233884.5688127 , 347827.60181061,
       246391.61320635, 230220.29855474, 379992.81228669, 223090.98481576,
       207282.09020001, 365028.12432233, 243797.91264021, 408831.12544674,
       260135.63675424, 445777.86601799, 308857.3520032 , 233884.5688127 ,
       223697.04476541, 224782.80216104, 309068.36244145, 255656.57311692,
       266249.96820132, 208650.76017159, 269890.15350093, 264056.64546025,
       337754.30386831, 269156.95675102, 275000.32418183, 391330.41348571,
       355250.64965125, 264056.64546025, 231732.60790333, 237805.57751871,
       249233.31986436, 245558.22808068, 252491.71908521, 337754.30386831,
       320253.59190728, 2

In [63]:
# In-sample predictions as a one-column DataFrame (the column is labelled 0 at this point).
# The frame reuses x's index because it is later combined with x via an index-based
# join; with a default 0..n-1 index the rows would silently misalign (or become NaN)
# if x ever carried a different index, e.g. after filtering rows.
predictiondf = pd.DataFrame(reg.predict(x), index=x.index)

In [64]:
predictiondf

Unnamed: 0,0
0,251487.558319
1,236976.558571
2,224762.121245
3,447688.276183
4,377978.035407
...,...
95,212744.633688
96,323780.678136
97,421437.208242
98,252671.602760


In [65]:
# Give the prediction column (default label 0) a descriptive name.
# rename ignores labels that are absent, so re-running this cell is harmless.
predictiondf = predictiondf.rename(columns = {0 : 'Predicted price'})

In [66]:
predictiondf

Unnamed: 0,Predicted price
0,251487.558319
1,236976.558571
2,224762.121245
3,447688.276183
4,377978.035407
...,...
95,212744.633688
96,323780.678136
97,421437.208242
98,252671.602760


In [67]:
# Put each row's features next to its predicted price.
# DataFrame.join matches rows by index label, so x and predictiondf must share an index.
joined = x.join(predictiondf)

In [68]:
joined

Unnamed: 0,size,year,Predicted price
0,643.09,2015,251487.558319
1,656.22,2009,236976.558571
2,487.29,2018,224762.121245
3,1504.75,2015,447688.276183
4,1275.46,2009,377978.035407
...,...,...,...
95,549.80,2009,212744.633688
96,1037.44,2009,323780.678136
97,1504.75,2006,421437.208242
98,648.29,2015,252671.602760


# Formula for adjusted $R^2$
 
 $R^2_{adj.} = 1 - (1-R^2)\cdot\frac{n-1}{n-p-1}$, where $n$ is the number of observations and $p$ is the number of predictors.
 

In [69]:
# Number of observations: one per row of the feature frame.
n = len(x)

In [70]:
# Number of predictors: one per column of the feature frame.
p = len(x.columns)

In [71]:
# In-sample R-squared used in the adjusted R-squared formula
# (same call, on the same data, as the earlier `r2` cell)
r = reg.score(x,y)

In [72]:
# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# The unexplained share (1 - R^2) is inflated by the degrees-of-freedom ratio,
# then subtracted from 1.
penalised_unexplained = (1 - r) * (n - 1) / (n - p - 1)
adjusted_r2 = 1 - penalised_unexplained

In [73]:
adjusted_r2

0.77187171612825

In [74]:
# Returns two arrays: F-statistics and p-values, one entry per column of x.
# Per the scikit-learn documentation these are univariate tests (each feature is
# tested against y on its own), so they are not the p-values of the coefficients
# in the joint size + year model fitted earlier.
f_regression(x,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [75]:
# f_regression returns (F-statistics, p-values); stack them as two rows with one
# column per feature. Rows and columns keep default integer labels at this point.
f_statistics, p_values = f_regression(x, y)
f = pd.DataFrame([f_statistics, p_values])

In [76]:
f

Unnamed: 0,0,1
0,285.9211,0.855258
1,8.127632000000001e-31,0.357341


In [77]:
# Label the columns with the feature names (same order as x) and the two rows
# with the statistic each one holds
f = f.rename(columns = {0 : 'size' , 1 : 'year'}, index = {0 : 'F-statistics', 1: 'P-value' })

In [78]:
f

Unnamed: 0,size,year
F-statistics,285.9211,0.855258
P-value,8.127632000000001e-31,0.357341


In [81]:
# Add the fitted coefficients as a third row of the summary table.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# row is added with pd.concat instead. ignore_index=True is kept to reproduce the
# original result: the row labels are reset to 0, 1, 2.
# NOTE: every execution of this cell adds one more row, so run it only once per
# freshly built `f`.
coefficient_row = pd.DataFrame([{'size': coef[0], 'year': coef[1]}])
f = pd.concat([f, coefficient_row], ignore_index=True)

In [82]:
f

Unnamed: 0,size,year
0,285.9211,0.855258
1,8.127632000000001e-31,0.357341
2,227.7009,2916.785327


In [83]:
# Restore descriptive row labels: adding the coefficient row with ignore_index=True
# reset the index to 0, 1, 2
f = f.rename(index = {0 : 'F-statistics', 1 : 'P-value', 2 : 'Coefficients'})

In [84]:
f

Unnamed: 0,size,year
F-statistics,285.9211,0.855258
P-value,8.127632000000001e-31,0.357341
Coefficients,227.7009,2916.785327
