In [26]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('homeprices.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      20 non-null     int64  
 1   bedrooms  17 non-null     float64
 2   age       20 non-null     int64  
 3   price     20 non-null     int64  
dtypes: float64(1), int64(3)
memory usage: 772.0 bytes


In [5]:
df.head()

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,,30,595000
4,4000,5.0,8,760000


In [6]:
df.describe()

Unnamed: 0,area,bedrooms,age,price
count,20.0,17.0,20.0,20.0
mean,4875.0,5.058824,15.85,929500.0
std,1236.665721,1.028992,7.589986,247508.1
min,2600.0,3.0,5.0,550000.0
25%,4075.0,4.0,9.75,775000.0
50%,4900.0,5.0,15.0,920000.0
75%,5850.0,6.0,20.0,1112500.0
max,6800.0,7.0,30.0,1350000.0


In [7]:
df.columns

Index(['area', 'bedrooms', 'age', 'price'], dtype='object')

### fill null values

In [8]:
df.bedrooms.median()

5.0

In [9]:
df.bedrooms = df.bedrooms.fillna(df.bedrooms.median())
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,5.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000
6,4200,4.0,12,780000
7,4400,5.0,10,830000
8,4600,5.0,25,860000
9,4800,4.0,7,900000


In [10]:
features = df.drop('price',axis='columns')
features

Unnamed: 0,area,bedrooms,age
0,2600,3.0,20
1,3000,4.0,15
2,3200,4.0,18
3,3600,5.0,30
4,4000,5.0,8
5,4100,6.0,8
6,4200,4.0,12
7,4400,5.0,10
8,4600,5.0,25
9,4800,4.0,7


In [15]:
target = df.price
target

0      550000
1      565000
2      610000
3      595000
4      760000
5      810000
6      780000
7      830000
8      860000
9      900000
10     940000
11     970000
12    1020000
13    1050000
14    1100000
15    1150000
16    1200000
17    1250000
18    1300000
19    1350000
Name: price, dtype: int64

In [16]:
import plotly.express as px
import plotly.graph_objects as go
fig = px.scatter_3d(features, x=features.area,y=features.bedrooms,z=target)
fig.show()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 0)

In [19]:
reg = linear_model.LinearRegression()
reg.fit(features,df.price)

In [20]:
print(f'Coefficient: {reg.coef_}')
print(f'Intercept: {reg.intercept_}')

Coefficient: [  201.72273056 -6609.61788557 -1339.28880018]
Intercept: 707.9863290111534


In [21]:
y_pred_test = reg.predict(X_test)     # predicted value of y_test
y_pred_train = reg.predict(X_train)

In [28]:
print('MSE = ', mean_squared_error(y_test, y_pred_test ))
print('R2 = ', r2_score(y_test, y_pred_test ))
print('RMSE = ', np.sqrt(mean_squared_error(y_test, y_pred_test )))

MSE =  722479363.6301377
R2 =  0.9931179127997486
RMSE =  26878.976238505395
