In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [3]:
# Read the house-price data from CSV into a DataFrame.
df = pd.read_csv('minihomeprices.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [4]:
# Summarize the frame: row count, per-column non-null counts, and dtypes.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      6 non-null      int64  
 1   bedrooms  5 non-null      float64
 2   age       6 non-null      int64  
 3   price     6 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 320.0 bytes


In [5]:
# Count the missing (NaN) values in each column.
df.isna().sum()

area        0
bedrooms    1
age         0
price       0
dtype: int64

* We can see that only the `bedrooms` column has null values (one missing entry).

In [6]:
# Fill the null values in bedrooms with the column mean (not the median).
bedrooms_mean = df['bedrooms'].mean()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_mean)

# The result is assigned back to the column on purpose. Calling
# fillna(..., inplace=True) on the selected column is chained assignment:
# recent pandas versions warn about it, and under Copy-on-Write it leaves
# df unchanged.

df.head()

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.2,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [7]:
# import model 
from sklearn.linear_model import LinearRegression

In [8]:
# Create an unfitted linear regression estimator with default settings.
mdl = LinearRegression()

In [9]:
# Split the frame into the target (price) and the features (every other column).
# X is a new DataFrame, so changes made to df afterwards are not reflected in it.
y = df['price']
X = df.drop(columns=['price'])

In [10]:
# Change the bedrooms data type from float to int64; the fractional part of
# any value is dropped by the cast.
# NOTE(review): X is built from df before this cast runs, so the features the
# model is fitted on keep the float bedrooms values. Confirm whether this cast
# should be moved ahead of the X / y split.
bedrooms_int = df['bedrooms'].astype('int64')
df['bedrooms'] = bedrooms_int

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   area      6 non-null      int64
 1   bedrooms  6 non-null      int64
 2   age       6 non-null      int64
 3   price     6 non-null      int64
dtypes: int64(4)
memory usage: 320.0 bytes


In [11]:
# Print the feature matrix and the target, separated by a dashed rule.
print(X, "-" * 25, y, sep="\n")

   area  bedrooms  age
0  2600       3.0   20
1  3000       4.0   15
2  3200       4.2   18
3  3600       3.0   30
4  4000       5.0    8
5  4100       6.0    8
-------------------------
0    550000
1    565000
2    610000
3    595000
4    760000
5    810000
Name: price, dtype: int64


In [12]:
# Fit the regression on the feature matrix X and the target y.
mdl.fit(X, y)

LinearRegression()

In [13]:
# Predict the price for a custom house: area=4000, bedrooms=2, age=50.
# The model was fitted on a DataFrame, so the query is also passed as a
# DataFrame with the same column names in the same order. A bare nested list
# carries no feature names, which makes scikit-learn warn that the input does
# not match the names seen during fit.
new_house = pd.DataFrame({'area': [4000], 'bedrooms': [2], 'age': [50]})

# Displays the predicted house price as a one-element array.
mdl.predict(new_house)

array([551989.04870256])

In [14]:
# Fitted coefficient for each feature, in the column order of X (area, bedrooms, age).
mdl.coef_

array([  116.66950551, 18756.28806982, -3675.75111708])

In [15]:
# Fitted intercept (constant term) of the regression.
mdl.intercept_

231586.00639409182

In [16]:
# R^2 of the fitted model, shown as a percentage. It is evaluated on the same
# X and y the model was trained on, so it describes the fit to the training
# rows rather than performance on unseen data.
score = mdl.score(X, y)
print(100 * score)

95.40926625396438
