## S14a: Lab 2 - Pricing, houses


### 1. Let's start by reading in our data

In [None]:
# Imports

import numpy as np
import pandas as pd

In [None]:
# Load the housing CSV (path is relative to the notebook's working directory)

data = pd.read_csv('./data/Boston_March2018.csv')

# Show the first five rows as a quick check that the file parsed as expected

data.head(5)

In [None]:
# Peek at a different part of the table: the five rows at positions 100-104

data.iloc[100:105]

### 2. Explore the data

In [None]:
# Restrict the preview to the two price columns

prices = ['LISTPRICE', 'SOLDPRICE']
data.loc[:, prices].head()

In [None]:
# !YOUR TURN!
# Look at sum


# Look at mean


In [None]:
# List the distinct values that occur in the PROPTYPE column

proptypes = data['PROPTYPE'].unique()
proptypes

In [None]:
# Example of combining two boolean conditions with `&`:
# rows whose sold price is under 1,000,000 AND whose property type is "SF"
# (each condition needs its own parentheses because `&` binds tighter than `<` / `==`)

data.loc[(data['SOLDPRICE'] < 1000000) & (data['PROPTYPE'] == "SF")]

In [None]:
# Look at everything
# Summary statistics (count, mean, std, min, quartiles, max) for the table;
# by default pandas reports these for the numeric columns.

data.describe()

In [None]:
# Group by prop type
# Bucket the rows by their PROPTYPE value so each property type can be
# summarized on its own; the grouped object is reused in the next cell.

data_by_proptype = data.groupby('PROPTYPE')
data_by_proptype.describe()

In [None]:
# Zoom in: average list price and sold price for each property type

data_by_proptype[['LISTPRICE', 'SOLDPRICE']].agg('mean')

### 3. Introducing [Linear Regression](https://scikit-learn.org/stable/modules/linear_model.html) with Scikit Learn

In [None]:
# Import sklearn

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Define model
# A scikit-learn LinearRegression estimator created with its default settings.
# It holds no coefficients yet; they are estimated when regr.fit(...) is
# called in a later cell.

regr = linear_model.LinearRegression()


In [None]:
# Define label and features (adjustable)
# Both are plain lists of column names. They are kept as lists (even the
# single-element label) so they can be concatenated as `features + label`
# when the rows are filtered and scaled below.

label = ['SOLDPRICE']
features = ['BEDS', 'BATHS', 'SQFT', 'AGE', 'LOTSIZE', 'GARAGE']


In [None]:
# Filter out empties, then rescale every modelling column

# Names of all columns the model uses (`features` and `label` are the lists
# of column names defined in the previous cell)
model_columns = features + label

# Row filtering happens in one place, BEFORE any scaling:
#   * Additional: keep only sales under 1,000,000
#   * Out empties - (v. imputation): drop rows missing any modelling column
# Doing this first means every column's mean/std/min/max below is computed
# over exactly the rows that remain in the final frame. The trailing .copy()
# makes filter_data an independent frame, so the column assignments in the
# loop do not write into a slice of `data`.
filter_data = (
    data[data['SOLDPRICE'] < 1000000]
    .dropna(subset=model_columns)
    .copy()
)

# Apply to all
# NOTE: the label column is rescaled too, so predictions made later are in
# scaled units rather than in the original price units.
for column in model_columns:
    # Interaction terms

    # Standardize (z-score: zero mean, unit standard deviation)
    filter_data[column] = (filter_data[column] - filter_data[column].mean()) / filter_data[column].std()

    # v. Normalize (min-max: rescale to the range [0, 1])
    # Min-max scaling is unaffected by a prior shift/rescale, so running it
    # after the z-score gives the same values as min-max alone; comment out
    # one of the two steps to compare the approaches.
    filter_data[column] = (filter_data[column] - filter_data[column].min()) / (filter_data[column].max() - filter_data[column].min())

# Renumber the rows 0..n-1; the previous index is kept as an 'index' column
filter_data = filter_data.reset_index()

print(filter_data[label + features].head())


In [None]:
# Pull the label and feature columns out of the filtered frame.
# From here on `label` and `features` are DataFrames, no longer lists of
# column names - re-run the "Define label and features" cell before
# re-running any earlier cell that expects the lists.

label, features = filter_data[label], filter_data[features]


In [None]:
# !YOUR TURN!

# Take a look at the shapes of 'label' and 'features'


In [None]:
# Set up testing and training (adjustable)

# Share of rows held out for testing; the rest is used to fit the model.
# (Previously only the first 25% of rows were used for training and 75% for
# testing, which starves the model of training data.)
test_fraction = 0.25
split = int(len(filter_data) * (1 - test_fraction))

# Rows are taken in their existing order - there is no shuffling here.
features_train = features[:split]
features_test = features[split:]
label_train = label[:split]
label_test = label[split:]

# Check shapes

print("Train shape")
print(features_train.shape, label_train.shape)
print("\nTest shape")
print(features_test.shape, label_test.shape)

In [None]:
# Fit training sets
# Estimate the regression coefficients from the training rows only; the test
# rows are not passed to the model here.

regr.fit(features_train, label_train)


In [None]:
# Predict the label for the held-out test rows

label_pred = regr.predict(features_test)

# Score the predictions with R^2 (coefficient of determination), which is what
# r2_score computes: 1 is a perfect prediction, and the value can be negative
# for a model that does worse than always predicting the mean.

r2 = r2_score(label_test, label_pred)
print('Variance score: %.2f' % r2)


### 4. Plot, aka "Visualize"

In [None]:
# Import
import matplotlib.pyplot as plt

In [None]:

# Plot outputs: predicted vs. measured label for the test rows
plt.figure(figsize=(20, 5))
plt.scatter(label_test, label_pred, color='black')

# Reference diagonal (predicted == measured) spanning the measured range.
# The line style is given by keyword only: the former 'k--' format string also
# set a colour (black) that conflicted with color='blue' and made Matplotlib
# emit a "redundantly defined" warning. The line is drawn blue and dashed,
# exactly as before.
measured_min, measured_max = label_test.min(), label_test.max()
plt.plot([measured_min, measured_max], [measured_min, measured_max], linestyle='--', lw=4, color='blue')

plt.ylabel('Predicted')
plt.xlabel('Measured')

# Hide the tick marks and tick labels on both axes
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Residual plot: residual (measured - predicted) against the predicted value

plt.figure(figsize=(10, 7))

# Reuse the predictions already stored in label_pred instead of calling
# regr.predict again, so the residuals and the x-axis values are guaranteed
# to come from the same set of predictions.
resid = label_test - label_pred

# Zero line: points above it were under-predicted, points below over-predicted
plt.axhline(y=0, linestyle='-', linewidth=2, color="r")
plt.scatter(x=label_pred, y=resid, alpha=0.5, s=3)

plt.title("Residual plot")
plt.ylabel(r"Y-$\hat Y$")
plt.xlabel(r'$\hat Y$')
plt.tight_layout()
plt.show()

### 5. Export

In [None]:
# Download model as pkl file
# Serialize the `regr` estimator with joblib to 'regr.pkl'; the path is
# relative, so the file is written to the current working directory.

import joblib
joblib.dump(regr, 'regr.pkl')

# HOMEWORK 1

### 6. Train Test Split

In [None]:
# YOUR TURN
# Use sklearn's train_test_split
# Ref. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html


# from sklearn.model_selection import train_test_split


# Initialize the x/y train and test sets with a 33% test size (sklearn)


# Determine linear_model (sklearn - above)


# Train linear_model (sklearn)


# Score Method 1: (sklearn)
# Ref. https://stackoverflow.com/questions/45529907/difference-between-model-score-vs-r2-score
#  https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Score Method 2 (sklearn)


# Plot outputs (matplotlib - above)

# Download model as pkl file


### 7. Decision Tree

In [None]:
# YOUR TURN
# Run DecisionTreeRegressor
# HINT: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html


# Create regressor


# Fit


# Predictions


# Score


# Plot outputs (matplotlib - above)


# Download model as pkl file


### Ref
+ https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame