In [None]:
import pandas as pd
import numpy as np
import os
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from env import host, user, password
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt
import wrangle

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = wrangle.zillow17()
df

In [None]:
df = df.set_index("parcelid")

In [None]:
df.to_csv("zillow.csv")

In [None]:
df = pd.read_csv('zillow.csv')
df

In [None]:
df = wrangle.wrangle_zillow()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Scatter of home age against logerror, one colour per county.
plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(data=df, x='home_age', y='logerror', hue='county')
plt.title("How does a homes' age compare to logerror\nwithin each county?")
plt.show()

In [None]:
df['structure_dollar_per_sqft'] = df.structure_value / df.sqft

In [None]:
df['land_dollar_per_sqft'] = df.land_value / df.lot_sqft

In [None]:
df['bed_bath_ratio'] = df.bedrooms / df.bathrooms

In [None]:
df.head()

In [None]:
df.structure_dollar_per_sqft.describe()

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='structure_dollar_per_sqft', y='logerror',
               data=df, hue='county')
plt.title("How does structure value per sqft compare to logerror\nwithin each county?")
plt.show()

In [None]:
df.land_dollar_per_sqft.describe()

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='land_dollar_per_sqft', y='logerror',
               data=df, hue='county')
plt.title("How does a land cost per sqft compare to logerror\nwithin each county?")
plt.show()

In [None]:
### Takeaways

In [None]:
df.bed_bath_ratio.describe()

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
sns.scatterplot(x='bed_bath_ratio', y='logerror',
               data=df, hue='county')
plt.title("How does the ratio between bedrooms and bathrooms compare to logerror\nwithin each county?")
plt.show()

In [None]:
sns.relplot(x="land_dollar_per_sqft", y="logerror", col="county", data=df)

In [None]:
df[df['logerror']<-1]

In [None]:
df[df['logerror'] < -1].fips.value_counts()

In [None]:
df.logerror_quartiles.value_counts()

### Lower sqft and logerror

In [None]:
df.sqft.describe()

In [None]:
df[df['sqft'] < 1500].logerror.value_counts()

In [None]:
df['sqft_binned'] = pd.qcut(df.sqft, q=3, labels=['sm_sqft', 'med_sqft', 'lg_sqft'])

df.head()

In [None]:
df.sqft_binned.unique()

In [None]:
# logerror (x) against home age (y), coloured by sqft bin. The title now
# describes the variables and grouping that are actually plotted; it previously
# repeated the bedroom/bathroom-by-county title from an earlier cell.
plt.rcParams['figure.figsize'] = (8.0, 15.0)
sns.scatterplot(x='logerror', y='home_age',
               data=df, hue='sqft_binned')
plt.title("How does logerror vary with a home's age\nwithin each sqft bin?")
plt.show()

In [None]:
# sqft_binned holds categorical labels (sm/med/lg), so it colours the points
# via hue instead of being plotted as one of the pair-grid axes.
sns.pairplot(df, vars=["logerror", "sqft", "zip_code"], hue="sqft_binned")

In [None]:
def correlation_exploration(train, x_string, y_string):
    '''
    This function takes in a df, a string for an x-axis variable in the df,
    and a string for a y-axis variable in the df. It displays a scatter plot of
    the two columns and prints the Pearson correlation coefficient (r) and its
    p-value, to explore the linear correlation between the x and y variables.

    Nothing is returned; the results are printed and plotted.
    '''
    r, p = stats.pearsonr(train[x_string], train[y_string])
    # plot the dataframe that was passed in, not the notebook-level `df`
    train.plot.scatter(x_string, y_string)
    plt.title(f"{x_string}'s Relationship with {y_string}")
    # p is a probability between 0 and 1, so scale it before printing it as a percent
    print(f'The p-value is: {p}. There is {round(p * 100, 3)}% chance that we see these results by chance.')
    print(f'r = {round(r, 2)}')
    plt.show()

In [None]:
correlation_exploration(df, 'sqft', 'logerror')

In [None]:
df[df.sqft_binned == 'sm_sqft'].logerror.median()

In [None]:
df[df.sqft_binned == 'med_sqft'].logerror.median()

In [None]:
df[df.sqft_binned == 'lg_sqft'].logerror.median()

In [None]:
df.assessmentyear.value_counts()

In [None]:
df.describe().T

### Split and scale

In [None]:
def split(df, target_var):
    '''
    Partition a dataframe into train (56%), validate (24%) and test (20%) sets
    and separate the target column from the remaining columns in each one.

    Takes in the dataframe and the name of the target variable. Returns a list
    holding, in this order: train (features and target together, for
    exploration), X_train, X_validate, X_test, y_train, y_validate, y_test.
    Each y is a one-column dataframe containing only the target.
    '''
    # carve off the 20% test set first ...
    train_validate, test = train_test_split(df, test_size=.20, random_state=123)
    # ... then split the remaining 80% 70/30: train is 56% and validate 24% of the whole
    train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

    subsets = (train, validate, test)
    # X: every column except the target
    feature_frames = [subset.drop(columns=[target_var]) for subset in subsets]
    # y: only the target, kept as a dataframe via the double brackets
    target_frames = [subset[[target_var]] for subset in subsets]

    return [train, *feature_frames, *target_frames]

In [None]:
partitions = split(df, target_var='logerror')

In [None]:
# the variables that still need scaling
scaled_vars = ['sm_sqft', 'lg_sqft', 'home_age', 'structure_dollar_per_sqft']

# create new column names for the scaled variables by adding 'scaled_' to the beginning of each variable name 
scaled_column_names = ['scaled_' + i for i in scaled_vars]

# select the X partitions: [X_train, X_validate, X_test]
X = partitions[1:4]

# fit the standardscaler to X_train
X_train = X[0]
scaler = StandardScaler(copy=True).fit(X_train[scaled_vars])


def scale_and_concat(df):
    '''
    Takes in a dataframe, transforms its scaled_vars columns with the
    notebook-level scaler, and returns the dataframe with the transformed
    values appended as new columns named by scaled_column_names. The new
    columns are built on the input's index values.
    '''
    scaled_df = pd.DataFrame(
        scaler.transform(df[scaled_vars]),
        index=df.index.values,
        columns=scaled_column_names,
    )
    return pd.concat([df, scaled_df], axis=1)

# Build the scaled partitions from the untouched frames in `partitions`, not
# from X itself, so re-running this cell does not append the scaled columns twice.
X = [scale_and_concat(part) for part in partitions[1:4]]
# Rebind the named partitions so X_train no longer points at the unscaled frame.
X_train, X_validate, X_test = X