# House Sales alias Bill Gates

In [None]:
import pandas as pd

# Load house sales data

In [None]:
# Read the house-sales CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
sales = pd.read_csv("data/home_data.csv")

In [None]:
# Show the loaded table as this cell's output.
sales

# Explore

In [None]:
# Preview the first rows of the table.
sales.head()

In [None]:
# Keep only the two columns used by the single-feature analysis:
# living area (input) and sale price (target).
sales_set = sales.loc[:, ['sqft_living', 'price']]
sales_set

In [None]:
# Show pandas' summary statistics for the table.
sales.describe()

In [None]:
import matplotlib.pyplot as plt
import pylab as pl
%matplotlib inline

In [None]:
# One histogram per column of the two-column subset (living area and price).
sales_set.hist()
plt.show()

In [None]:
# Scatter of price against living area for every sale.
plt.scatter(sales_set['sqft_living'], sales_set['price'], color='blue')
plt.xlabel("Sqft living")
plt.ylabel("Price")
plt.show()

# Simple regression model that predicts price from square feet

In [None]:
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix: living area as a single-column 2-D array.
x = sales[['sqft_living']].to_numpy()
# Target vector: sale price.
y = sales['price'].to_numpy()
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Train a simple regression model

In [None]:
# Fit an ordinary linear regression of price on living area using the training split.
sqft_model = LinearRegression().fit(x_train, y_train)

# Evaluate the quality of our model

In [None]:
# Model score on the training split, then on the held-out split (one value per line).
print(
    sqft_model.score(x_train, y_train),
    sqft_model.score(x_test, y_test),
    sep="\n",
)

# Explore model a little further

In [None]:
# Fitted intercept, then the coefficient array (one line each).
print(sqft_model.intercept_, sqft_model.coef_, sep="\n")

In [None]:
# All observations in blue, with the fitted line (intercept + slope * x)
# drawn in red over the training inputs.
plt.scatter(sales_set['sqft_living'], sales_set['price'], color='blue')
plt.plot(x_train, sqft_model.intercept_ + sqft_model.coef_[0] * x_train, '-r')
plt.xlabel("Sqft living")
plt.ylabel("Price")
plt.show()

In [None]:
# Predicted prices for the held-out split; the evaluation and plotting cells
# that follow read `price_pred`.
price_pred = sqft_model.predict(x_test)
price_pred

In [None]:
# Mean squared error between the held-out prices and the predictions
print("Mean squared error: %.2f" % (mean_squared_error(y_test, price_pred),))
# Coefficient of determination (R^2); 1 means a perfect prediction
print("Coefficient of determination: %.2f" % (r2_score(y_test, price_pred),))

In [None]:
# Held-out observations (black points) against the model's predictions for
# the same inputs (blue line).
plt.scatter(x_test, y_test, color="black")
plt.plot(x_test, price_pred, color="blue", linewidth=3)
plt.xlabel("Sqft living")
plt.ylabel("Price")
plt.show()

# Explore other features of the data

In [None]:
# Columns used as inputs to the multi-feature model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

In [None]:
# Preview the first rows of the selected feature columns.
sales[my_features].head()

In [None]:
# One histogram per selected feature column.
sales[my_features].hist()
plt.show()

In [None]:
# Feature matrix built from the `my_features` columns, and the price target.
x_multi = sales[my_features].to_numpy()
y_multi = sales['price'].to_numpy()
# Same 80/20 split parameters and seed as the single-feature split.
x_multi_train, x_multi_test, y_multi_train, y_multi_test = train_test_split(
    x_multi,
    y_multi,
    test_size=0.2,
    random_state=0,
)

# Build a model with these additional features

In [None]:
# Fit a linear regression on the multi-feature training split.
multi_model = LinearRegression().fit(x_multi_train, y_multi_train)

# Compare simple model with more complex one

In [None]:
# Reminder of which columns the multi-feature model uses.
print(my_features)

In [None]:
# Single-feature model: training score, then held-out score.
print(
    sqft_model.score(x_train, y_train),
    sqft_model.score(x_test, y_test),
    sep="\n",
)
# Multi-feature model: training score, then held-out score.
print(
    multi_model.score(x_multi_train, y_multi_train),
    multi_model.score(x_multi_test, y_multi_test),
    sep="\n",
)

# Apply learned models to make predictions

In [None]:
# Select the row(s) whose id matches the first example house.
house1 = sales.loc[sales['id'].eq(5309101200)]

In [None]:
# Show the selected record for the first example house.
house1

<img src="img/house1.png" width=400px>

In [None]:
# Recorded sale price and living area of the first example house.
print(house1['price'], house1['sqft_living'], sep="\n")

In [None]:
# Single-feature (living area) price estimate for the first example house.
# The result gets its own name so that `price_pred`, which holds the test-set
# predictions read by the evaluation cells, is not overwritten; those cells
# then stay correct if they are re-run after this one.
house1_sqft_pred = sqft_model.predict(house1['sqft_living'].to_numpy().reshape(-1, 1))
house1_sqft_pred[0]

In [None]:
# Multi-feature price estimate for the first example house; the id is cast to
# int for display and the price is shown with two decimals.
house1_id = int(house1['id'].iloc[0])
house1_multi_pred = multi_model.predict(house1[my_features].to_numpy())[0]
print(f"Price house ID {house1_id} is ${house1_multi_pred:.2f}")

## Prediction for a second house, a fancier one

In [None]:
# Select the row(s) whose id matches the second example house.
house2 = sales.loc[sales['id'].eq(1925069082)]

In [None]:
# Show the selected record for the second example house.
house2

<img src="img/house2.jpg" width=400px>

In [None]:
# Single-feature (living area) price estimate for the second example house.
# The result gets its own name so that `price_pred`, which holds the test-set
# predictions read by the evaluation cells, is not overwritten.
house2_sqft_pred = sqft_model.predict(house2['sqft_living'].to_numpy().reshape(-1, 1))
print(house2_sqft_pred[0])

In [None]:
# Multi-feature price estimate for the second example house; the id is cast
# to int for display and the price is shown with two decimals.
house2_id = int(house2['id'].iloc[0])
house2_multi_pred = multi_model.predict(house2[my_features].to_numpy())[0]
print(f"Price house ID {house2_id} is ${house2_multi_pred:.2f}")

## Prediction for a super fancy home

In [None]:
# Hand-entered attributes for a single hypothetical listing, stored as one-element
# lists so the dict converts directly into a one-row DataFrame.
# Every value is numeric. `zipcode` is an integer rather than a string because
# the models use it as a numeric feature, and a string here would turn the
# whole feature row into an object-dtype array.
bill_gates = {
    'bedrooms': [8],
    'bathrooms': [25],
    'sqft_living': [50000],
    'sqft_lot': [225000],
    'floors': [4],
    'zipcode': [98039],
    'condition': [10],
    'grade': [10],
    'waterfront': [1],
    'view': [4],
    'sqft_above': [37500],
    'sqft_basement': [12500],
    'yr_built': [1994],
    'yr_renovated': [2010],
    'lat': [47.627606],
    'long': [-122.242054],
    'sqft_living15': [5000],
    'sqft_lot15': [40000],
}

In [None]:
# Turn the attribute dict into a one-row DataFrame (keys become columns),
# then show the living-area column as a quick check.
df_bill_gates = pd.DataFrame(bill_gates)
df_bill_gates['sqft_living']

<img src="img/bill_gates.png" width=600px>

In [None]:
# Single-feature (living area) price estimate for the hand-entered listing.
# The result gets its own name so that `price_pred`, which holds the test-set
# predictions read by the evaluation cells, is not overwritten.
bill_gates_sqft_pred = sqft_model.predict(df_bill_gates['sqft_living'].to_numpy().reshape(-1, 1))
print(bill_gates_sqft_pred[0])

In [None]:
# Multi-feature price estimate for the hand-entered listing, shown with two decimals.
bill_gates_multi_pred = multi_model.predict(df_bill_gates[my_features].to_numpy())[0]
print(f"Price house ID Bill Gates is ${bill_gates_multi_pred:.2f}")

In [None]:
# Extended input columns for the richer model: the six basic features first,
# followed by the additional attributes.
adv_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    'condition',
    'grade',
    'waterfront',
    'view',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [None]:
# Feature matrix built from the `adv_features` columns, and the price target.
x_adv = sales[adv_features].to_numpy()
y_adv = sales['price'].to_numpy()
# Same 80/20 split parameters and seed as the earlier splits.
x_adv_train, x_adv_test, y_adv_train, y_adv_test = train_test_split(
    x_adv,
    y_adv,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a linear regression on the extended-feature training split.
adv_model = LinearRegression().fit(x_adv_train, y_adv_train)

In [None]:
# Training score then held-out score for each model, from the smallest
# feature set to the largest (one value per line).
print(
    sqft_model.score(x_train, y_train),
    sqft_model.score(x_test, y_test),
    sep="\n",
)
print(
    multi_model.score(x_multi_train, y_multi_train),
    multi_model.score(x_multi_test, y_multi_test),
    sep="\n",
)
print(
    adv_model.score(x_adv_train, y_adv_train),
    adv_model.score(x_adv_test, y_adv_test),
    sep="\n",
)

In [None]:
# Extended-feature price estimate for the hand-entered listing, shown with two decimals.
bill_gates_adv_pred = adv_model.predict(df_bill_gates[adv_features].to_numpy())[0]
print(f"Price house ID Bill Gates is ${bill_gates_adv_pred:.2f}")