In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Read the home-price records from the CSV file (path is relative to the
# current working directory).
csv_path = 'data/homeprices.csv'
df = pd.read_csv(csv_path)


In [3]:
# Preview the first rows to check that the CSV parsed into the expected columns.
df.head()

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [4]:
# Preview the last rows as well; the file is small, so this overlaps with head().
df.tail()

Unnamed: 0,area,bedrooms,age,price
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [5]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      6 non-null      int64  
 1   bedrooms  5 non-null      float64
 2   age       6 non-null      int64  
 3   price     6 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 324.0 bytes


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,area,bedrooms,age,price
count,6.0,5.0,6.0,6.0
mean,3416.666667,4.2,16.5,648333.333333
std,587.934237,1.30384,8.288546,109117.673484
min,2600.0,3.0,8.0,550000.0
25%,3050.0,3.0,9.75,572500.0
50%,3400.0,4.0,16.5,602500.0
75%,3900.0,5.0,19.5,722500.0
max,4100.0,6.0,30.0,810000.0


In [16]:
# List the column labels available for modelling.
df.columns

Index(['area', 'bedrooms', 'age', 'price'], dtype='object')

In [12]:
# Count the missing values in each column.
df.isna().sum()

area        0
bedrooms    1
age         0
price       0
dtype: int64

In [17]:
# Distinct bedroom counts in the data, including NaN for the missing entry.
df['bedrooms'].unique()

array([ 3.,  4., nan,  5.,  6.])

#### Data Preprocessing - Fill NaN Values with the Column Median

In [18]:
# Median of the observed bedroom counts; NaN entries are skipped by default.
df['bedrooms'].median()

4.0

In [19]:
# Mean of the observed bedroom counts, shown for comparison with the median.
df['bedrooms'].mean()

4.2

In [21]:
# Preview the imputation only: fillna returns a new Series, so df itself is
# not modified by this cell.
df['bedrooms'].fillna(df['bedrooms'].median())

0    3.0
1    4.0
2    4.0
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [22]:
# Confirm the column in df still contains the NaN, because the filled Series
# has not been assigned back yet.
df['bedrooms']

0    3.0
1    4.0
2    NaN
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [23]:
# Persist the imputation: replace the missing bedroom count with the column
# median and assign the result back into the dataset.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)
df['bedrooms']

0    3.0
1    4.0
2    4.0
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [24]:
# Re-check the non-null counts after imputing the bedrooms column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      6 non-null      int64  
 1   bedrooms  6 non-null      float64
 2   age       6 non-null      int64  
 3   price     6 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 324.0 bytes


In [27]:
# Display the whole frame; it has only six rows, so showing all of it is fine.
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [28]:
# Feature matrix: every column except the target column 'price'.
x = df.drop('price', axis='columns')
x

Unnamed: 0,area,bedrooms,age
0,2600,3.0,20
1,3000,4.0,15
2,3200,4.0,18
3,3600,3.0,30
4,4000,5.0,8
5,4100,6.0,8


In [30]:
# Target vector: the 'price' column.
y = df.loc[:, 'price']
y

0    550000
1    565000
2    610000
3    595000
4    760000
5    810000
Name: price, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the rows for testing; with six rows this yields four
# training rows and two test rows. random_state pins the shuffle so the
# split is reproducible.
splits = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = splits

In [33]:
# Inspect the training features (row labels show which original rows were selected).
x_train

Unnamed: 0,area,bedrooms,age
1,3000,4.0,15
3,3600,3.0,30
0,2600,3.0,20
4,4000,5.0,8


In [34]:
# Inspect the held-out test features.
x_test

Unnamed: 0,area,bedrooms,age
5,4100,6.0,8
2,3200,4.0,18


In [35]:
# Inspect the training targets; the index matches x_train row for row.
y_train

1    565000
3    595000
0    550000
4    760000
Name: price, dtype: int64

In [36]:
# Inspect the held-out test targets; the index matches x_test row for row.
y_test

5    810000
2    610000
Name: price, dtype: int64

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
# Linear regression estimator from scikit-learn, created with its default settings.
model = LinearRegression()

In [42]:
# Fit the model on the training split only; the test rows stay unseen.
model.fit(x_train, y_train)

In [43]:
# Predict prices for the held-out test rows and display them.
y_pred = model.predict(x_test)
y_pred

array([608500., 554875.])

In [46]:
# Side-by-side table of actual and predicted prices for the test rows,
# indexed by the original row labels of y_test.
y_test.to_frame(name='y_test').assign(y_predict=y_pred)

Unnamed: 0,y_test,y_predict
5,810000,608500.0
2,610000,554875.0


#### Question: What price does the model predict for each of these homes?
* 3000 sq ft area, 3 bedrooms, 40 years old
* 2500 sq ft area, 4 bedrooms, 5 years old

In [51]:
# Predict prices for the two homes listed in the question above.
# The model was fitted on a DataFrame, so the query rows are wrapped in a
# DataFrame carrying the same column names (area, bedrooms, age). Passing bare
# nested lists makes scikit-learn warn that X does not have valid feature
# names, and gives no protection against a wrong column order.
query_homes = pd.DataFrame(
    [[3000, 3, 40], [2500, 4, 5]],
    columns=x_train.columns,
)
# Predict one row at a time so each answer is printed on its own line.
print(model.predict(query_homes.iloc[[0]]))
print(model.predict(query_homes.iloc[[1]]))

[262000.]
[638125.]




In [52]:
# Fitted coefficients, in the order of the training columns: area, bedrooms, age.
model.coef_

array([    236.25, -175125.  ,  -19125.  ])

In [53]:
# Fitted intercept (the constant term of the regression).
model.intercept_

843624.9999999998

In [54]:
from sklearn.metrics import mean_squared_error, r2_score

In [61]:
# Score the held-out predictions. The MSE is computed once and the RMSE is
# derived from it. Note: the test split has only two rows, so these metrics
# are very noisy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2_Score:', r2)

MSE: 21820507812.499813
RMSE: 147717.66249335188
R2_Score: -1.1820507812499814
