In [1]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from env import user, password, host
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env
import os
import csv
import wrangle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# the target will be taxamount

In [3]:
zillow_train,zillow_validate,zillow_test=wrangle.wrangled_file()

In [5]:
x_train, y_train = zillow_train.drop(columns='logerror'),zillow_train.logerror
x_validate, y_validate = zillow_validate.drop(columns='logerror'),zillow_validate.logerror
x_test, y_test = zillow_test.drop(columns='logerror'),zillow_test.logerror

In [6]:
x_train=x_train.drop(columns=['propertylandusedesc','county','propertycountylandusecode'])
x_validate=x_validate.drop(columns=['propertylandusedesc','county','propertycountylandusecode'])
x_test=x_test.drop(columns=['propertylandusedesc','county','propertycountylandusecode'])             

In [7]:
scaler=MinMaxScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train), index=x_train.index, columns=x_train.columns)
x_validate_scaled = pd.DataFrame(scaler.fit_transform(x_validate), index=x_validate.index, columns=x_validate.columns)
x_test_scaled = pd.DataFrame(scaler.fit_transform(x_test), index=x_test.index, columns=x_test.columns)

In [8]:
y_test

18656    0.012716
59685    0.014778
15991    0.024403
70854   -0.040040
25383    0.095056
           ...   
62489    0.267016
8931     0.063113
17683    0.049941
56270    0.072429
37917    0.003813
Name: logerror, Length: 14596, dtype: float64

In [9]:
# We need y_train and y_validate to be dataframes to append the new columns with predicted values. 
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test=pd.DataFrame(y_test)

# predict mean
y_train['baseline'] = y_train['logerror'].mean()
y_validate['baseline'] = y_validate['logerror'].mean()

# predict median
y_train['logerror_med'] = y_train['logerror'].median()
y_validate['logerror_med'] = y_validate['logerror'].median()

y_test['baseline'] = y_test['logerror'].mean()
y_test['logerror_med'] = y_test['logerror'].median()


# RMSE of mean
rmse_train = mean_squared_error(y_train.logerror, y_train.baseline)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.baseline)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# RMSE of median
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_med)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_med)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))



RMSE using Mean
Train/In-Sample:  0.16 
Validate/Out-of-Sample:  0.17
RMSE using Median
Train/In-Sample:  0.16 
Validate/Out-of-Sample:  0.17


In [10]:
y_train

Unnamed: 0,logerror,baseline,logerror_med
39641,0.112968,0.015682,0.006333
59493,-0.036052,0.015682,0.006333
9448,-0.001262,0.015682,0.006333
69877,0.079616,0.015682,0.006333
22602,-0.043681,0.015682,0.006333
...,...,...,...
66181,0.048336,0.015682,0.006333
9172,0.037807,0.015682,0.006333
61419,0.021510,0.015682,0.006333
36522,0.035537,0.015682,0.006333


In [11]:
models = pd.DataFrame(
[
    {
        'model': 'baseline',
        'rmse': mean_squared_error(y_train['logerror'], y_train.baseline,squared=False),
        'r^2': explained_variance_score(y_train['logerror'], y_train.baseline)
    
    }
])
models

Unnamed: 0,model,rmse,r^2
0,baseline,0.16153,0.0


In [12]:
def modeling(model, 
                  x_train, 
                  y_train, 
                  x_validate, 
                  y_validate, 
                  scores=models):
    model.fit(x_train, y_train.logerror)
    in_sample_pred = model.predict(x_train)
    out_sample_pred = model.predict(x_validate)
    model_name = input('model name?')
    y_train[model_name] = in_sample_pred
    y_validate[model_name] = out_sample_pred
    print(y_validate.shape)
    print(out_sample_pred.shape)
    rmse_val = mean_squared_error(
    y_validate.logerror, out_sample_pred, squared=False)
    r_squared_val = explained_variance_score(
        y_validate.logerror, out_sample_pred)
    return models.append({
        'model': model_name,
        'rmse': rmse_val,
        'r^2': r_squared_val
    
    }, ignore_index=True)

In [13]:
models = modeling(LinearRegression(normalize=True), 
                  x_train_scaled, 
                  y_train, 
                  x_validate_scaled, 
                  y_validate, 
                  scores=models)

model name?
(17508, 4)
(17508,)


In [14]:
models = modeling(LassoLars(alpha=1.0), 
                  x_train_scaled, 
                  y_train, 
                  x_validate_scaled, 
                  y_validate, 
                  scores=models)

model name?
(17508, 4)
(17508,)


In [15]:
polyfeats = PolynomialFeatures(degree=2)
x_train_quad = polyfeats.fit_transform(x_train_scaled)
x_val_quad = polyfeats.transform(x_validate_scaled)
models = modeling(LinearRegression(), 
                  x_train_quad, 
                  y_train, 
                  x_val_quad, 
                  y_validate, 
                  scores=models)

model name?
(17508, 4)
(17508,)


In [16]:
models = modeling(TweedieRegressor(power=0, alpha=0), 
                  x_train_scaled, 
                  y_train, 
                  x_validate_scaled, 
                  y_validate, 
                  scores=models)

model name?
(17508, 4)
(17508,)


In [17]:
models

Unnamed: 0,model,rmse,r^2
0,baseline,0.1615296,0.0
1,,17207250000.0,-6.599791e+21
2,,0.1695256,-2.220446e-16
3,,78771580.0,-1.122519e+17
4,,0.1693422,0.002174739


In [18]:
x_train_scaled

Unnamed: 0,bathroom,bedroom,calculatedbathnbr,sqtft,finishedsquarefeet12,fips,fullbathcnt,latitude,longitude,rawcensustractandblock,...,yearbuilt,structuretaxvaluedollarcnt,taxvalue,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,taxrate,month,age
39641,0.117647,0.214286,0.117647,0.075621,0.075621,0.0,0.117647,0.372515,0.687789,0.006126,...,0.826087,0.019125,0.003579,0.0,0.000762,0.004272,1.071292e-05,0.027144,0.500,0.173913
59493,0.058824,0.214286,0.058824,0.092913,0.092913,0.0,0.058824,0.323974,0.600220,0.007440,...,0.623188,0.022796,0.004870,0.0,0.001469,0.005273,1.301030e-05,0.024986,0.750,0.376812
9448,0.058824,0.214286,0.058824,0.056624,0.056624,0.0,0.058824,0.602521,0.491123,0.000191,...,0.920290,0.032316,0.007142,0.0,0.002223,0.007517,3.334176e-07,0.024603,0.125,0.079710
69877,0.000000,0.214286,0.000000,0.033568,0.033568,0.0,0.000000,0.452751,0.732064,0.005403,...,0.579710,0.011140,0.006716,0.0,0.005182,0.007571,9.447188e-06,0.026307,0.875,0.420290
22602,0.000000,0.142857,0.000000,0.025914,0.025914,0.0,0.000000,0.419238,0.645406,0.005875,...,0.289855,0.011918,0.005557,0.0,0.003896,0.006673,1.027283e-05,0.027854,0.250,0.710145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66181,0.117647,0.142857,0.117647,0.051275,0.051275,0.0,0.117647,0.570883,0.456759,0.000460,...,0.717391,0.021776,0.007542,0.0,0.004309,0.007964,8.045278e-07,0.024718,0.875,0.282609
9172,0.117647,0.214286,0.117647,0.074607,0.074607,0.0,0.117647,0.814834,0.732162,0.010954,...,0.833333,0.019112,0.003577,0.0,0.000762,0.005325,1.915506e-05,0.033849,0.125,0.166667
61419,0.000000,0.357143,0.000000,0.128003,0.128003,0.0,0.000000,0.446702,0.615127,0.001774,...,0.202899,0.021677,0.012914,0.0,0.009707,0.013554,3.101554e-06,0.024845,0.750,0.797101
36522,0.058824,0.214286,0.058824,0.072762,0.072762,0.0,0.058824,0.366012,0.561663,0.007033,...,0.615942,0.043999,0.034871,0.0,0.028140,0.032592,1.229812e-05,0.022328,0.500,0.384058


In [19]:
kbest = SelectKBest(f_regression, k=3)
kbest.fit(x_train_scaled, y_train.logerror)
mask = x_train_scaled.columns[kbest.get_support()].to_list()

In [20]:
models = modeling(LinearRegression(), 
                  x_train_scaled[mask], 
                  y_train, 
                  x_validate_scaled[mask], 
                  y_validate, 
                  scores=models)

model name?
(17508, 4)
(17508,)


In [21]:
models

Unnamed: 0,model,rmse,r^2
0,baseline,0.1615296,0.0
1,,17207250000.0,-6.599791e+21
2,,0.1695256,-2.220446e-16
3,,78771580.0,-1.122519e+17
4,,0.1693422,0.002174739
5,,0.1693065,0.002315965


In [22]:
x_test.shape,y_test.shape

((14596, 22), (14596, 3))

In [23]:
kbest = SelectKBest(f_regression, k=20)
kbest.fit(x_test_scaled, y_test.logerror)


SelectKBest(k=20, score_func=<function f_regression at 0x14f816a60>)

In [26]:
mask = x_train_scaled.columns[kbest.get_support()].to_list()
models = modeling_test(LinearRegression(), 
                  x_test_scaled[mask], 
                  y_test, 
                  scores=models)

model name?


In [27]:
models

Unnamed: 0,model,rmse,r^2
0,,0.164591,0.005052


In [None]:
polyfeats = PolynomialFeatures(degree=2)
x_test_quad = polyfeats.fit_transform(x_test_scaled)

In [None]:
def modeling(model, 
                  x_test, 
                  y_test, 
                  scores=models):
    model.fit(x_test, y_test)
    in_sample_pred = model.predict(x_test)
    model_name = input('model_name?')
    y_test[model_name] = in_sample_pred
    rmse_val = mean_squared_error(
    y_test, in_sample_pred, squared=False)**(1/2)
    r_squared_val = explained_variance_score(
        y_test, in_sample_pred)
    return models.append({
        'model': model_name,
        'rmse': rmse_val,
        'r^2': r_squared_val
    
    }, ignore_index=True)

In [None]:
modeling(TweedieRegressor(power=0, alpha=0), 
                  x_test_scaled,
                  y_test,
                  scores=models)

In [None]:
polyfeats = PolynomialFeatures(degree=2)
modeltest=LinearRegression()
modeltest.fit(x_test_scaled, y_test)
x_test = polyfeats.fit_transform(x_test_scaled)

In [None]:
rmse_val = mean_squared_error(
    y_test, modeltest.predict(x_test_scaled), squared=False)
r_squared_val = explained_variance_score(
        y_test, modeltest.predict(x_test_scaled))

In [None]:
rmse_val,r_squared_val

In [None]:
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

# predict mean
y_train['baseline'] = y_train['logerror'].mean()
y_validate['baseline'] = y_validate['logerror'].mean()

# predict median
y_train['logerror_med'] = y_train['logerror'].median()
y_validate['logerror_med'] = y_validate['logerror'].median()

# RMSE of mean
rmse_train = mean_squared_error(y_train.logerror, y_train.baseline)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.baseline)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# RMSE of median
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_med)**(1/2)
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_med)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))



In [None]:
def x_y_split(zillow_train,zillow_validate,zillow_test):
    x_train, y_train = zillow_train.select_dtypes('float').drop(columns='logerror'),zillow_train.logerror
    x_validate, y_validate = zillow_validate.select_dtypes('float').drop(columns='logerror'),zillow_validate.logerror
    x_test, y_test = zillow_test.select_dtypes('float').drop(columns='logerror'),zillow_test.logerror
    return x_train, y_train,x_validate,y_validate,x_test,y_test



In [None]:
def models(y_train):
    models= pd.DataFrame(
    [
        {
            'model': 'baseline',
            'rmse': mean_squared_error(zillow_train['logerror'], y_train.baseline,squared=False),
            'r^2': explained_variance_score(zillow_train['logerror'], y_train.baseline)

        }
    ])
    return models

In [None]:
def modeling_train(model, 
                  x_train, 
                  y_train, 
                  x_validate, 
                  y_validate, 
                  scores=models):
    model.fit(x_train, y_train.logerror)
    in_sample_pred = model.predict(x_train)
    out_sample_pred = model.predict(x_validate)
    model_name = input('model name?')
    y_train[model_name] = in_sample_pred
    y_validate[model_name] = out_sample_pred
    rmse_val = mean_squared_error(
    y_train.logerror, in_sample_pred, squared=False)
    r_squared_val = explained_variance_score(
        y_train.logerror, in_sample_pred)
    return pd.DataFrame([{
        'model': model_name,
        'rmse': rmse_val,
        'r^2': r_squared_val
    
    }])




In [None]:
def modeling_validate(model, 
                  x_train, 
                  y_train, 
                  x_validate, 
                  y_validate, 
                  scores=models):
    model.fit(x_train, y_train.logerror)
    in_sample_pred = model.predict(x_train)
    out_sample_pred = model.predict(x_validate)
    model_name = input('model name?')
    y_train[model_name] = in_sample_pred
    y_validate[model_name] = out_sample_pred
    rmse_val = mean_squared_error(
    y_validate.logerror, out_sample_pred, squared=False)
    r_squared_val = explained_variance_score(
        y_validate.logerror, out_sample_pred)
    return pd.DataFrame([{
        'model': model_name,
        'rmse': rmse_val,
        'r^2': r_squared_val
    
    }])



In [25]:
def modeling_test(model, 
                  x_test, 
                  y_test,  
                  scores=models):
    model.fit(x_test, y_test.logerror)
    in_sample_pred = model.predict(x_test)
    model_name = input('model name?')
    y_test[model_name] = in_sample_pred
    rmse_val = mean_squared_error(
    y_test.logerror, in_sample_pred, squared=False)
    r_squared_val = explained_variance_score(
        y_test.logerror, in_sample_pred)
    return pd.DataFrame([{
        'model': model_name,
        'rmse': rmse_val,
        'r^2': r_squared_val}])
    

In [None]:
x_train,y_train,x_val,y_val,x_test,y_test=x_y_split(zillow_train,zillow_validate,zillow_test)

In [None]:
models=models(y_train)

In [None]:
modeling_train(LinearRegression(normalize=True), 
                  x_train, 
                  y_train, 
                  x_validate, 
                  y_validate, 
                  scores=models)
