In [107]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

In [108]:
sales = pd.read_csv(r"house_data.csv")

In [109]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Question 1

In [110]:
sales_1 = sales[sales['zipcode'] == 98039]

In [111]:
sales_1['price'].mean()

2160606.6

# Question 2

In [112]:
sales_2 = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)]

In [113]:
fraction = len(sales_2) / len(sales)

In [114]:
print("The fraction of all houses having ‘sqft_living’ in this range are : ", fraction)

The fraction of all houses having ‘sqft_living’ in this range are :  0.4215518437977143


# Question 3

In [115]:
training_set, test_set = train_test_split(sales, test_size=0.2, shuffle=False, random_state = 0) 

## with 6 features.

In [117]:
reg = LinearRegression(fit_intercept=True)

In [118]:
my_features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','zipcode']

In [119]:
reg.fit(sales.loc[:, my_features], sales['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [120]:
y_pred = reg.predict(test_set.loc[:, my_features])

In [130]:
RMSE_basic = np.sqrt(metrics.mean_squared_error(test_set['price'], y_pred))
print('Root Mean Squared Error of the predictions with basic features is : ', RMSE_basic)

Root Mean Squared Error of the predictions with basic features is :  255648.11680947174


In [131]:
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house				
'grade', # measure of quality of construction				
'waterfront', # waterfront property				
'view', # type of view				
'sqft_above', # square feet above ground				
'sqft_basement', # square feet in basement				
'yr_built', # the year built				
'yr_renovated', # the year renovated				
'lat', 'long', # the lat-long of the parcel				
'sqft_living15', # average sq.ft. of 15 nearest neighbors 				
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

In [132]:
reg_1 = LinearRegression(fit_intercept=True)

In [133]:
reg_1.fit(sales.loc[:, advanced_features], sales['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [134]:
y_pred_1 = reg_1.predict(test_set.loc[:, advanced_features])

In [135]:
RMSE_advanced = np.sqrt(metrics.mean_squared_error(test_set['price'], y_pred_1))
print('Root Mean Squared Error of the predictions with advanced features is : ', RMSE_advanced)

Root Mean Squared Error of the predictions with advanced features is :  203743.06930648003


In [136]:
RMSE_diff = RMSE_basic - RMSE_advanced

In [137]:
print("difference in RMSE between the model trained with my_features and the one trained with advanced_features is : ", RMSE_diff)

difference in RMSE between the model trained with my_features and the one trained with advanced_features is :  51905.047502991714
