In [1]:
# standard library
import os
import warnings

# third-party: data handling and statistics
import numpy as np
import pandas as pd
from scipy import stats

# third-party: plotting
import matplotlib.pyplot as plt
import seaborn as sns

# third-party: preprocessing, splitting, clustering
import sklearn.preprocessing
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# modeling methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# project-local modules
from env import host, user, password
import wrangle

# ignore warnings
warnings.filterwarnings("ignore")

In [3]:
# Pull the Zillow data through the project's wrangle module and display it.
# NOTE(review): the recorded output of this cell is a ValueError ("unsupported format
# character"), so `df` is not assigned by this call as recorded. Presumably an
# unescaped `%` in a query string inside wrangle.zillow17 -- confirm in wrangle.py.
df = wrangle.zillow17()
df

ValueError: unsupported format character ''' (0x27) at index 1427

In [None]:
df = df.set_index("parcelid")

In [None]:
df.to_csv("zillow.csv")

In [None]:
df = pd.read_csv('zillow.csv')
df

In [None]:
df = wrangle.wrangle_zillow()

In [None]:
df.info()

In [None]:
df.describe().T

### Data Prep Takeaways
- Dropped unnecessary/redundant columns: 'id', 'calculatedbathnbr', 'finishedsquarefeet12', 'fullbathcnt', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc', 'censustractandblock', 'propertylandusedesc', 'heatingorsystemdesc'
- Outlier cutoffs: kept only rows with taxvaluedollarcnt < 5_000_000 and calculatedfinishedsquarefeet < 12500
- Dropped 5756 rows

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='home_age', y='logerror',
               data=df, hue='county')
plt.title("How does a homes' age compare to logerror\nwithin each county?")
plt.show()

In [None]:
df['structure_dollar_per_sqft'] = df.structure_value / df.sqft

In [None]:
df['land_dollar_per_sqft'] = df.land_value / df.lot_sqft

In [None]:
df['bed_bath_ratio'] = df.bedrooms / df.bathrooms

In [None]:
df.head()

In [None]:
df.structure_dollar_per_sqft.describe()

In [None]:
# Scatterplot to visualize how the homes' cost per sqft affects the logerror per county

plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='structure_dollar_per_sqft', y='logerror',
               data=df, hue='county')
plt.title("How does structure value per sqft compare to logerror\nwithin each county?")
plt.show()

In [None]:
df.land_dollar_per_sqft.describe()

In [None]:
# Scatterplot to visualize how the lots' cost per sqft affects the logerror per county

plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='land_dollar_per_sqft', y='logerror',
               data=df, hue='county')
plt.title("How does a land cost per sqft compare to logerror\nwithin each county?")
plt.show()

In [None]:
### Takeaways

In [None]:
df.bed_bath_ratio.describe()

In [None]:
# Scatterplot to visualize how the bed/bath ratio affects the logerror per county

plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='bed_bath_ratio', y='logerror',
               data=df, hue='county')
plt.title("How does the ratio between bedrooms and bathrooms compare to logerror\nwithin each county?")
plt.show()

In [None]:
sns.relplot(x="land_dollar_per_sqft", y="logerror", col="county", data=df)

In [None]:
df[df['logerror']<-1]

In [None]:
df[df['logerror'] < -1].fips.value_counts()

In [None]:
df.logerror_quartiles.value_counts()

In [None]:
df.logerror_quartiles.max()

### Lower sqft and logerror

In [None]:
df.sqft.describe()

In [None]:
df[df['sqft'] < 1500].logerror.value_counts()

In [None]:
df['sqft_binned'] = pd.qcut(df.sqft, q=3, labels=['sm_sqft', 'med_sqft', 'lg_sqft'])

df.head()

In [None]:
df.sqft_binned.unique()

In [None]:
# Scatterplot of logerror (x) against home age (y), coloured by the sqft tercile bin.
# Tall, narrow canvas; note that rcParams changes the default for later figures too.
plt.rcParams['figure.figsize'] = (8.0, 15.0)
sns.scatterplot(x='logerror', y='home_age',
               data=df, hue='sqft_binned')
# The previous title was copied from the bed/bath-ratio plot and described the wrong
# variables; this one names what is actually drawn.
plt.title("How does a home's age compare to logerror\nwithin each sqft bin?")
plt.show()

In [None]:
def correlation_exploration(train, x_string, y_string):
    '''
    Plot one column against another and report their Pearson correlation.

    Parameters
    ----------
    train : DataFrame-like
        Data holding both columns. It is indexed by column name and plotted
        through its ``.plot.scatter`` accessor.
    x_string : str
        Name of the column placed on the x-axis.
    y_string : str
        Name of the column placed on the y-axis.

    Returns
    -------
    None
        Draws the scatter plot and prints the p-value and Pearson's r (the
        correlation coefficient itself, not r-squared).
    '''
    r, p = stats.pearsonr(train[x_string], train[y_string])
    # Plot the frame that was passed in. The earlier version plotted the
    # notebook-level `df`, so the picture could disagree with the statistics.
    train.plot.scatter(x_string, y_string)
    plt.title(f"{x_string}'s Relationship with {y_string}")
    # p is a probability between 0 and 1; scale it before labelling it a percentage.
    print(f'The p-value is: {p}. There is {round(p * 100, 3)}% chance that we see these results by chance.')
    print(f'r = {round(r, 2)}')
    plt.show()

In [None]:
correlation_exploration(df, 'sqft', 'logerror')

In [None]:
df[df.sqft_binned == 'sm_sqft'].logerror.median()

In [None]:
df['sm_sqft'] = df.sqft_binned == 'sm_sqft'

In [None]:
df[df.sqft_binned == 'med_sqft'].logerror.median()

In [None]:
df[df.sqft_binned == 'lg_sqft'].logerror.median()

In [None]:
df['lg_sqft'] = df.sqft_binned == 'lg_sqft'

In [None]:
df.info()

In [None]:
df.assessmentyear.value_counts()

In [None]:
df.describe().T

### Split and scale

In [None]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the split
# repeatable. (train_test_split is already imported in the notebook's first cell,
# so it is not imported again here.)
train_validate, test = train_test_split(df, test_size=.2, random_state=123)

# Split the remaining 80% again: 30% of it becomes validate and 70% becomes train,
# i.e. roughly 56% / 24% / 20% of df for train / validate / test.
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

# Take a look at your split datasets

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

In [None]:
# 

# Columns used as model inputs, and the column every model tries to predict.
feature_cols = ['structure_dollar_per_sqft', 'home_age', 'home_value']
target_col = 'logerror'

# Same column selection applied to each of the three splits.
X_train, y_train = train[feature_cols], train[target_col]
X_validate, y_validate = validate[feature_cols], validate[target_col]
X_test, y_test = test[feature_cols], test[target_col]

In [None]:
X_train.shape, X_validate.shape, X_test.shape

In [None]:
# Robust scaler

def Robust_Scaler(X_train, X_validate, X_test):
    """
    Scale three feature frames with one RobustScaler fitted on the training frame.

    The scaler is fit on X_train only and then applied to all three inputs, so the
    validate and test frames are transformed with what was learned from train.

    Returns the fitted scaler followed by the scaled train, validate and test
    DataFrames, each carrying the index and column labels of its input.
    """
    scaler = sklearn.preprocessing.RobustScaler()
    scaler.fit(X_train)

    def _rescale(frame):
        # Re-wrap the transformed values so the original row/column labels are kept.
        return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)

    scaled_frames = [_rescale(frame) for frame in (X_train, X_validate, X_test)]
    return (scaler, *scaled_frames)

In [None]:
# Use the Robust_Scaler function defined above (fit on train, applied to all splits)
# NOTE(review): the model cells further down are fit on X_train / X_validate / X_test,
# so none of the visible models reads these scaled frames -- confirm that is intended.

scaler, X_train_scaled, X_validate_scaled, X_test_scaled = Robust_Scaler(X_train, X_validate, X_test)

X_train_scaled.head()

In [None]:
X_train_scaled.describe()

### Quick check

In [None]:
import matplotlib
import sklearn

print('Versions')
print('Pandas:', pd.__version__)
print('Numpy:', np.__version__)
print('Matplotlib:', matplotlib.__version__)
print('Seaborn:', sns.__version__)
print('Scikit-Learn:', sklearn.__version__)

In [None]:
# Make sure y is what I think it is

y_train.head()

In [None]:
# Assign y to a dataset

y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

In [None]:
# Establish my baseline

# 1. Constant predictions taken from the training target: its mean and its median
logerror_pred_mean = y_train.logerror.mean()
logerror_pred_median = y_train.logerror.median()

# 2. Attach both constants as prediction columns on train and validate
for target_frame in (y_train, y_validate):
    target_frame['logerror_pred_mean'] = logerror_pred_mean
    target_frame['logerror_pred_median'] = logerror_pred_median

# 3. RMSE of each constant prediction, mean first and median second.
#    After the loop rmse_train / rmse_validate hold the median baseline's values.
baseline_reports = (
    ("RMSE using Mean\nTrain/In-Sample: ", 'logerror_pred_mean'),
    ("RMSE using Median\nTrain/In-Sample: ", 'logerror_pred_median'),
)
for report_header, pred_column in baseline_reports:
    rmse_train = mean_squared_error(y_train.logerror, y_train[pred_column]) ** .5
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate[pred_column]) ** .5
    print(report_header, round(rmse_train, 5),
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 5))

In [None]:
# Create a dataframe for comparison of metrics for model selection

from sklearn.metrics import explained_variance_score

# The baseline cell leaves `rmse_validate` holding the RMSE of the *median* baseline
# (it is computed last there). This row is labelled 'mean_baseline', so compute the
# mean baseline's validate RMSE explicitly instead of reusing that variable.
baseline_rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_mean) ** .5

# NOTE: the 'r^2_validate' column stores explained_variance_score, which matches R^2
# only when the prediction errors average to zero.
metric_df = pd.DataFrame(data=[{
    'model': 'mean_baseline', 
    'RMSE_validate': baseline_rmse_validate,
    'r^2_validate': explained_variance_score(y_validate.logerror, y_validate.logerror_pred_mean)}])

metric_df

In [None]:
# First model: Linear Regression (OLS)

# Make the thing.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# where passing it raises a TypeError. Ordinary least squares predictions do not
# depend on how the features are scaled, so the default estimator fits the same model.
lm = LinearRegression()

# Fit the model to our training data 
lm.fit(X_train, y_train.logerror)

# Predict train 
y_train['logerror_pred_lm'] = lm.predict(X_train)

# Evaluate: rmse (square root of the mean squared error)
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm) ** .5

# Predict validate
y_validate['logerror_pred_lm'] = lm.predict(X_validate)

# Evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm) ** .5

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
# Add OLS Regressor to my metric dataframe

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame does the
# same job on old and new pandas alike.
ols_row = pd.DataFrame([{
    'model': 'OLS Regressor', 
    'RMSE_validate': rmse_validate,
    'r^2_validate': explained_variance_score(y_validate.logerror, y_validate.logerror_pred_lm)}])

# Drop any earlier row for this model first so re-running the cell does not stack duplicates.
metric_df = pd.concat([metric_df[metric_df['model'] != 'OLS Regressor'], ols_row], ignore_index=True)

metric_df

In [None]:
# LassoLars

# Build the model and fit it to the training data in one step (fit hands back the estimator)
lars = LassoLars(alpha=1).fit(X_train, y_train.logerror)

# Predict on both splits
y_train['logerror_pred_lars'] = lars.predict(X_train)
y_validate['logerror_pred_lars'] = lars.predict(X_validate)

# Evaluate: rmse is the square root of the mean squared error
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lars) ** .5
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lars) ** .5

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
# Add LassoLars to my metric dataframe

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame does the
# same job on old and new pandas alike.
lars_row = pd.DataFrame([{
    'model': 'Lasso alpha 1', 
    'RMSE_validate': rmse_validate,
    'r^2_validate': explained_variance_score(y_validate.logerror, y_validate.logerror_pred_lars)}])

# Drop any earlier row for this model first so re-running the cell does not stack duplicates.
metric_df = pd.concat([metric_df[metric_df['model'] != 'Lasso alpha 1'], lars_row], ignore_index=True)

metric_df

In [None]:
# Polynomial features

# Make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

# Fit and transform X_train (the unscaled training features, not X_train_scaled)
X_train_degree2 = pf.fit_transform(X_train)

# Transform X_validate & X_test (also unscaled) with the transformer fitted on train
# NOTE(review): the model cell that follows is given X_train / X_validate directly, so
# these *_degree2 arrays are not read anywhere in the visible notebook -- confirm.
X_validate_degree2 = pf.transform(X_validate)
X_test_degree2 = pf.transform(X_test)

In [None]:
# Create the model object.
# The earlier version fit a plain LinearRegression on the raw X_train, so the
# "quadratic" model was just a second copy of the OLS model. Wrapping the degree-2
# expansion and the regression in one pipeline makes lm2 a real quadratic model while
# still accepting the same raw X_train / X_validate / X_test frames as every other model.
# StandardScaler stands in for `normalize=True`, which scikit-learn removed in 1.2;
# include_bias=False because LinearRegression fits its own intercept.
lm2 = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    LinearRegression(),
)

# Fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series 
lm2.fit(X_train, y_train.logerror)

# Predict train
y_train['logerror_pred_lm2'] = lm2.predict(X_train)

# Evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm2) ** .5

# Predict validate
y_validate['logerror_pred_lm2'] = lm2.predict(X_validate)

# Evaluate: rmse. The square root was missing here, so the validate figure was an MSE
# being compared against the other models' RMSE values.
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm2) ** .5

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
# Add Polynomial model to my metric df

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame does the
# same job on old and new pandas alike.
quadratic_row = pd.DataFrame([{
    'model': 'Quadratic Model', 
    'RMSE_validate': rmse_validate,
    'r^2_validate': explained_variance_score(y_validate.logerror, y_validate.logerror_pred_lm2)}])

# Drop any earlier row for this model first so re-running the cell does not stack duplicates.
metric_df = pd.concat([metric_df[metric_df['model'] != 'Quadratic Model'], quadratic_row], ignore_index=True)

metric_df

In [None]:
# Create my regressor object and fit it with x and y data in one step
# (fit hands back the estimator; the fixed random_state keeps the forest repeatable)
rf = RandomForestRegressor(n_estimators=4, random_state=0).fit(X_train, y_train.logerror)

# Predict on both splits
y_train['logerror_pred_rf'] = rf.predict(X_train)
y_validate['logerror_pred_rf'] = rf.predict(X_validate)

# Evaluate: rmse is the square root of the mean squared error
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_rf) ** .5
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_rf) ** .5

print("RMSE for RandomForestRegressor\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
# Add RandomForestRegressor model to my metric df

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame does the
# same job on old and new pandas alike.
rf_row = pd.DataFrame([{
    'model': 'RandomForestRegressor', 
    'RMSE_validate': rmse_validate,
    'r^2_validate': explained_variance_score(y_validate.logerror, y_validate.logerror_pred_rf)}])

# Drop any earlier row for this model first so re-running the cell does not stack duplicates.
metric_df = pd.concat([metric_df[metric_df['model'] != 'RandomForestRegressor'], rf_row], ignore_index=True)

metric_df

In [None]:
# Plot to visualize actual vs predicted

# One overlaid histogram per series: (column in y_validate, colour, legend label)
histogram_specs = [
    ('logerror', 'yellow', "Actual Logerror"),
    ('logerror_pred_lm', 'blue', "Model: LinearRegression"),
    ('logerror_pred_lars', 'red', "Model: LassoLars"),
    ('logerror_pred_lm2', 'orange', "Model: 2nd degree Polynomial"),
    ('logerror_pred_rf', 'green', "Model: RandomForestRegressor"),
]

plt.figure(figsize=(12,8))
plt.xlim(-1.5, 1.5)
for column_name, colour, legend_label in histogram_specs:
    plt.hist(y_validate[column_name], bins=50, color=colour, alpha=.5, label=legend_label)
plt.xlabel("Logerror")
plt.ylabel("")
plt.title("Comparing the Distribution of Actual Logerror to Distributions of Predicted Logerror for the Top Models")
plt.legend()
plt.show()

#### Takeaways:

In [None]:
# Pick the Quadratic Model (lm2) as my best performer and run it on my test data

# Convert the test target to a DataFrame so a prediction column can sit next to it
y_test = pd.DataFrame(y_test)

# predict on test (lm2 is handed the same X_test columns it was trained on)
y_test['logerror_pred_lm2'] = lm2.predict(X_test)

# evaluate: rmse (square root of the mean squared error)
rmse_test = mean_squared_error(y_test.logerror, y_test.logerror_pred_lm2) ** .5

print("RMSE for the Quadratic Model using Polynomial Features\nOut-of-Sample Performance: ", rmse_test)