In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib widget
import statsmodels.api as sm
import sklearn.linear_model as lm
from sklearn.metrics import mean_squared_error

## Вибір і підрахунок статистик 

In [2]:
sales = pd.read_csv('./home_data.csv')

In [44]:
sales

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [45]:
max_price = sales.groupby('zipcode')['price'].max().reset_index()

In [46]:
max_price

Unnamed: 0,zipcode,price
0,98001,850000
1,98002,389000
2,98003,950000
3,98004,7062500
4,98005,1960000
...,...,...
65,98177,3800000
66,98178,1700000
67,98188,707000
68,98198,1350000


In [47]:
min_max_price = max_price.loc[max_price['price'].idxmin()]
min_max_price

zipcode     98002
price      389000
Name: 1, dtype: int64

## Фільтрація 

In [48]:
print(sales['floors'] == 2)

two_floors = len(sales[sales['floors'] == 2])
print(two_floors)

total_records = len(sales)

percentage = (two_floors / total_records) * 100

print(percentage)

0        False
1         True
2        False
3        False
4        False
         ...  
21608    False
21609     True
21610     True
21611     True
21612     True
Name: floors, Length: 21613, dtype: bool
8241
38.12982926942119


## Регресійні моделі

In [49]:
training_set = sales.sample( frac=0.8, random_state = 0)
test_set = sales[ ~sales.id.isin(training_set.id) ]

In [50]:
training_sqft  = training_set[ ['sqft_living'] ]
training_price = training_set[ 'price' ]

sqft_model   = lm.LinearRegression()
sqft_model.fit( training_sqft, training_price)

In [51]:
test_price = test_set[ 'price' ]
test_sqft  = test_set[ ['sqft_living'] ]

predicted_price = sqft_model.predict( test_sqft )

In [52]:
print(test_set['price'].mean())

538921.2583294228


In [53]:
print ('MSE  = ', mean_squared_error( test_price, predicted_price))
print ('RMSE = ', np.sqrt( mean_squared_error( test_price, predicted_price) ))

MSE  =  63587361914.57968
RMSE =  252165.34637927488


In [54]:
intercept = sqft_model.intercept_
coefficient = sqft_model.coef_[0]

print(f"y = {intercept} + {coefficient}x")

y = -46927.7680322011 + 282.3404167134322x


In [55]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [56]:
sales[my_features]

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,zipcode
0,3,1.00,1180,5650,1.0,98178
1,3,2.25,2570,7242,2.0,98125
2,2,1.00,770,10000,1.0,98028
3,4,3.00,1960,5000,1.0,98136
4,3,2.00,1680,8080,1.0,98074
...,...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0,98103
21609,4,2.50,2310,5813,2.0,98146
21610,2,0.75,1020,1350,2.0,98144
21611,3,2.50,1600,2388,2.0,98027


In [57]:
training_features = training_set[ my_features ]
training_price = training_set[ 'price' ]

test_features = test_set[ my_features ]

my_features_model = lm.LinearRegression()
my_features_model.fit(training_features, training_price)

predicted_price_by_features = my_features_model.predict( test_features )

In [58]:
labs_features = ['sqft_living', 'sqft_above', 'sqft_basement']

In [59]:
train_features = training_set[ labs_features ]
train_price = training_set[ 'price' ]

t_features = test_set[ labs_features ]

labs_features_model = lm.LinearRegression()
labs_features_model.fit(train_features, train_price)

predicted_price_by_labs_features = labs_features_model.predict( t_features )

## Comparing the results of the models

In [60]:
print ("Their RMSE = " , np.sqrt( mean_squared_error( test_price, predicted_price_by_features) ))
print ("My RMSE = " , np.sqrt( mean_squared_error( test_price, predicted_price_by_labs_features) ))

Their RMSE =  248530.93398183392
My RMSE =  251992.11704221854
