In [1]:
import pandas as pd
import numpy as np
from env import user, password, host
import acquire        
from acquire import url, query
import prepare
from scipy import stats
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Run the SQL held in `query` against the connection `url` (both imported from
# the local `acquire` module) and load the result set into a DataFrame.
df = pd.read_sql(query, url)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,parcelid,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,assessmentyear,regionidcounty,regionidzip,fips,transactiondate
0,11721753,3.0,2.0,1316.0,205123.0,2627.48,2016.0,3101.0,95997.0,6037.0,2017-07-21
1,11289917,3.0,2.0,1458.0,136104.0,2319.90,2016.0,3101.0,97318.0,6037.0,2017-06-23
2,11705026,2.0,1.0,1421.0,35606.0,543.69,2016.0,3101.0,96018.0,6037.0,2017-06-30
3,14269464,4.0,3.0,2541.0,880456.0,9819.72,2016.0,1286.0,96958.0,6059.0,2017-06-01
4,11446756,3.0,2.0,1491.0,107110.0,1399.27,2016.0,3101.0,96162.0,6037.0,2017-08-23
...,...,...,...,...,...,...,...,...,...,...,...
28119,11991766,6.0,2.0,2634.0,878190.0,12766.88,2016.0,3101.0,95985.0,6037.0,2017-08-31
28120,14011468,4.0,1.5,1187.0,465999.0,5552.68,2016.0,1286.0,96180.0,6059.0,2017-08-31
28121,14453399,5.0,3.0,3015.0,852813.0,11662.88,2016.0,1286.0,96983.0,6059.0,2017-08-31
28122,11128688,5.0,3.0,2992.0,1120000.0,13663.03,2016.0,3101.0,96356.0,6037.0,2017-08-31


In [3]:
def prep_zillow_data(df):
    '''
    Clean a raw Zillow property DataFrame and return the result as a new frame.

    Steps, in order:
      1. drop every row that contains a null,
      2. cast the identifier columns (parcelid, regionidcounty, regionidzip, fips)
         to object and taxvaluedollarcnt / assessmentyear to int64,
      3. rename the columns to friendlier names,
      4. drop rows with fewer than 1 bedroom, fewer than 1 bathroom, or an area
         below 200,
      5. drop every row whose parcelid occurs more than once (no copy is kept),
      6. add a tax_rate column equal to tax_amount / tax_value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding at least parcelid, bedroomcnt, bathroomcnt,
        calculatedfinishedsquarefeet, taxvaluedollarcnt, taxamount,
        assessmentyear, regionidcounty, regionidzip, fips and transactiondate.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the renamed columns plus tax_rate. The frame
        passed in is left unmodified.
    '''
    # Compared to the row count we have more than enough to drop these
    df = df.dropna()
    # Next we can correct data types that are wrong: ids are labels rather than
    # quantities, while the assessed value and the year are whole numbers.
    df = df.astype({
        'parcelid': 'object',
        'regionidcounty': 'object',
        'regionidzip': 'object',
        'fips': 'object',
        'taxvaluedollarcnt': 'int64',
        'assessmentyear': 'int64',
    })
    # Next I will rename the columns to be more recognizable
    # NOTE(review): `fips` becomes "zipcode" while `regionidzip` keeps its name;
    # values such as 6037 / 6059 look like county FIPS codes -- confirm the
    # intended naming before relying on the "zipcode" column.
    df = df.rename(columns={
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "area",
        "taxamount": "tax_amount",
        "taxvaluedollarcnt": "tax_value",
        "fips": "zipcode",
        "regionidcounty": "county_id",
        "assessmentyear": "assessment_year",
        "transactiondate": "transaction_date",
    })
    # Here I check for erroneous entries and drop them. A boolean row mask is
    # used instead of dropping by index label, so a valid row is never removed
    # just because it shares an index label with an erroneous one.
    plausible = (df['bedrooms'] >= 1) & (df['bathrooms'] >= 1) & (df['area'] >= 200)
    df = df[plausible]
    # time to check for duplicates and remove them
    # dropping ALL duplicate values (keep=False keeps none of the copies)
    df = df.drop_duplicates(subset="parcelid", keep=False)
    # calculate the tax rate and make a new column/feature (done once, on the
    # final set of rows)
    df = df.assign(tax_rate=df['tax_amount'] / df['tax_value'])
    return df

In [4]:
# (rows, columns) of the raw frame, as a baseline before cleaning.
df.shape

(28124, 11)

In [5]:
# NOTE(review): rebinding `df` makes this cell non-idempotent. prep_zillow_data
# reads the raw column `regionidcounty`, which its output renames to
# `county_id`, so running this cell a second time without reloading fails.
df = prep_zillow_data(df)

In [11]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so in-place changes made through either name are visible through both.
df2 =df

In [12]:
# (rows, columns) after prep_zillow_data: rows were filtered out and the
# tax_rate column was added.
df.shape

(27981, 12)

In [13]:
def train_validate_test(df2, target, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a dataframe into train, validate and test samples, then separate each
    sample into its independent variables (X) and its target (y).

    With the default sizes the samples are:
      test     - 20% of the entire dataframe,
      validate - 30% of the remaining 80% = 24% of the entire dataframe,
      train    - 70% of the remaining 80% = 56% of the entire dataframe.

    Parameters
    ----------
    df2 : pandas.DataFrame
        The dataframe to split.
    target : str
        Name of the column holding the dependent (target) variable.
    test_size : float or int, default .2
        Passed as `test_size` to the first train_test_split call, which
        separates the test sample from the rest.
    validate_size : float or int, default .3
        Passed as `test_size` to the second train_test_split call, which
        separates validate from train within the non-test rows.
    random_state : int, default 123
        Seed handed to both train_test_split calls so the split is reproducible.

    Returns
    -------
    tuple
        (train, validate, test,
         X_train, y_train, X_validate, y_validate, X_test, y_test)
        train / validate / test are the full samples, each X_* is the matching
        sample without the target column (dataframe) and each y_* is the target
        column alone (series).
    '''
    # split df2 into test and train_validate (20% / 80% by default)
    train_validate, test = train_test_split(
        df2, test_size=test_size, random_state=random_state
    )

    # split train_validate into train and validate (56% / 24% of the whole by default)
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )

    # split each sample into X (dataframe, drop target) & y (series, keep target only)
    X_train, y_train = train.drop(columns=[target]), train[target]
    X_validate, y_validate = validate.drop(columns=[target]), validate[target]
    X_test, y_test = test.drop(columns=[target]), test[target]

    return train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test

In [21]:
# Split the prepped frame with tax_amount as the target.
# NOTE(review): the X_* frames still contain tax_value and tax_rate
# (tax_rate = tax_amount / tax_value), so the target can be reconstructed
# exactly from the features -- confirm this is intended before modeling.
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test= train_validate_test(df2, 'tax_amount')

In [22]:
# Show the first rows of the train sample (target column still included).
train.head()

Unnamed: 0,parcelid,bedrooms,bathrooms,area,tax_value,tax_amount,assessment_year,county_id,regionidzip,zipcode,transaction_date,tax_rate
28026,11071535,6.0,6.0,10605.0,3272288,39621.85,2016,3101.0,96346.0,6037.0,2017-08-31,0.012108
14422,11711488,1.0,1.0,925.0,91000,1572.93,2016,3101.0,95997.0,6037.0,2017-06-30,0.017285
10888,14456914,3.0,2.5,1460.0,568000,8944.94,2016,1286.0,96998.0,6059.0,2017-06-16,0.015748
5471,12400999,3.0,3.0,2681.0,521873,6384.72,2016,3101.0,96100.0,6037.0,2017-05-24,0.012234
5070,11494686,4.0,4.0,2208.0,1062777,11498.41,2016,3101.0,96109.0,6037.0,2017-05-23,0.010819


In [16]:
# NOTE(review): the saved NameError for this cell appears to be stale. Its
# execution count (16) is lower than that of the cell that defines `validate`
# (21). Restart the kernel and run all cells in order to refresh the output.
validate.head()

NameError: name 'validate' is not defined

In [17]:
# NOTE(review): the saved NameError for this cell appears to be stale. Its
# execution count (17) is lower than that of the cell that defines `X_train`
# (21). Restart the kernel and run all cells in order to refresh the output.
X_train

NameError: name 'X_train' is not defined