# Zillow Exploration
After successfully splitting the data using our functions, we will explore the data to determine the best features to use in our modeling. Per our guidance, the MVP will use the square footage of the home, the number of bedrooms, and the number of bathrooms to attempt to predict the price of the home (measured as taxvaluedollarcnt).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import wrangle
import prepare
import acquire
# Sanity check: this line is only reached if every import above succeeded.
print('Success')

Success


# Testing Functions
The lines of code below are used to ensure that the acquire and prepare files are working correctly and to load the Zillow data into DataFrames for processing. We will then split the DataFrames into train, validate, and test data sets.

In [2]:
# Load the Zillow data through the project's acquire module and preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the saved output suggests a cached CSV's
# index is being read back as data — confirm in acquire.load_zillow_data.
zillow = acquire.load_zillow_data()
zillow.head()

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
0,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,60372350000000.0,117,0.086137,2017-06-08
1,11289917,2061546,1.0,,,2.0,3.0,,6.0,2.0,...,136104.0,2016.0,27214.0,2319.9,Y,15.0,60379010000000.0,1248,-0.362001,2017-06-23
2,11705026,1834372,,,,1.0,2.0,,6.0,1.0,...,35606.0,2016.0,23624.0,543.69,,,60372320000000.0,1772,-0.146056,2017-06-30
3,14269464,1923117,,,,3.0,4.0,,,3.0,...,880456.0,2016.0,445569.0,9819.72,,,60590640000000.0,2028,0.021085,2017-06-01
4,11389003,2121349,,,,2.0,3.0,,6.0,2.0,...,614000.0,2016.0,449000.0,7673.19,,,60377030000000.0,3273,-0.325393,2017-06-01


In [3]:
# Build the prepared frame through the project's prepare module and list the columns it keeps.
df = prepare.prepare_zillow()
df.columns

Index(['parcelid', 'bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet',
       'fips', 'fullbathcnt', 'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'propertylandusetypeid',
       'rawcensustractandblock', 'regionidcity', 'regionidcounty',
       'regionidzip', 'roomcnt', 'yearbuilt', 'structuretaxvaluedollarcnt',
       'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt',
       'taxamount', 'censustractandblock'],
      dtype='object')

In [4]:
# Build the reduced MVP frame through prepare.prepare_zillow_mvp() and preview it.
# NOTE(review): the saved output for this cell is
#   KeyError: "['transactiondate'] not found in axis"
# so on that run `dfmvp` was never created and the cells below that depend on it
# could not execute. The cause is not visible in this notebook — presumably a drop
# of 'transactiondate' inside prepare; confirm and fix it there.
dfmvp = prepare.prepare_zillow_mvp()
dfmvp.head()

KeyError: "['transactiondate'] not found in axis"

In [None]:
# Print the wrangle module's help text to see which functions it exposes.
# NOTE(review): this is a development aid; consider removing it from the final notebook.
help(wrangle)

In [None]:
# Split dfmvp into train/validate/test features and targets, with 'taxvaluedollarcnt' as the target.
# NOTE(review): later cells read '<feature>_scaled' columns from this split, so
# modeling=True presumably adds scaled features — confirm in wrangle.wrangle_data.
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.wrangle_data(dfmvp, 'taxvaluedollarcnt', modeling=True)


In [None]:
# Put the training features and the target side by side so that their
# pairwise correlations can be computed on a single frame.
train = pd.concat((X_train, y_train), axis='columns')
train.corr()

In [None]:
# Annotated heatmap of the training-split correlation matrix.
corr_matrix = train.corr()
sns.heatmap(data=corr_matrix, annot=True)

We'll run a Pearson correlation test on the features with the highest correlations to the target (calculated square footage and bedroom count). For each tested feature:

* $H_0$ - There is no linear relationship between taxvaluedollarcnt and the feature
* $H_a$ - There is a linear relationship between taxvaluedollarcnt and the feature

In [None]:
from math import sqrt
from scipy import stats

# Significance level for the hypothesis tests; the later test cell reuses `a`.
a = .05

# Pearson correlation between scaled finished square footage and the target,
# computed on the training split only.
corr, p = stats.pearsonr(train['calculatedfinishedsquarefeet_scaled'], train['taxvaluedollarcnt'])
print(corr, p)
if p > a:
    print('We Fail to Reject H0')
else:
    # The hypothesis being rejected is the null, H0 (same wording as the bedroom-count test).
    print('We Reject H0')

In [None]:
# Scatter of scaled square footage against the target with a fitted regression line.
sns.regplot(
    x='calculatedfinishedsquarefeet_scaled',
    y='taxvaluedollarcnt',
    data=train,
)

In [None]:
# Same Pearson test, this time for scaled bedroom count against the target.
# Relies on `stats` and the significance level `a` defined in the earlier test cell.
corr, p = stats.pearsonr(train['bedroomcnt_scaled'], train['taxvaluedollarcnt'])
print(corr, p)
verdict = 'We Fail to Reject H0' if p > a else 'We Reject H0'
print(verdict)

In [None]:
# Regression plot for the feature tested in the preceding cell (scaled bedroom count),
# mirroring the test-then-plot pairing used for square footage.
sns.regplot(data=train, x='bedroomcnt_scaled', y='taxvaluedollarcnt')

# MVP v2
## Outlier Detection
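To properly create a model, we will need to determine whether there are any outliers in the data and then address them. We will do that below using the interquartile range (IQR) rule: a value is flagged as an outlier when it falls below Q1 - 1.5 × IQR or above Q3 + 1.5 × IQR.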

In [None]:
# Summary statistics for the columns screened for outliers.
# Note: these are taken from the `zillow` frame loaded earlier, not from the prepared `df`.
outlier_candidates = [
    'bathroomcnt',
    'bedroomcnt',
    'calculatedfinishedsquarefeet',
    'roomcnt',
    'yearbuilt',
    'landtaxvaluedollarcnt',
]
des = zillow.loc[:, outlier_candidates].describe()
des

In [None]:
# Build a per-column table of quartiles and 1.5 * IQR fences from the describe() summary.
columns = list(des.columns)

# Look the quartiles up by their describe() row labels instead of by row position,
# so the lookup does not silently break if the summary rows ever change.
q1 = des.loc['25%', columns]
q3 = des.loc['75%', columns]
iqr = q3 - q1

# 'little_outliers' is the lower fence and 'big_outliers' the upper fence.
df_outliers = pd.DataFrame(
    {
        'IQR': iqr,
        'Q3': q3,
        'Q1': q1,
        'little_outliers': q1 - (1.5 * iqr),
        'big_outliers': q3 + (1.5 * iqr),
    },
    index=columns,
)

# Plain-array copy of the same table, kept under the name this cell has always defined.
outliers = df_outliers.to_numpy()
df_outliers

# One-row view of the upper fences, with one column per screened feature.
big_outliers = df_outliers.loc[:, ['big_outliers']].T
big_outliers

In [None]:
# One-row view of the lower fences, with one column per screened feature.
little_outliers = df_outliers.loc[:, ['little_outliers']].T
little_outliers

In [None]:
# Add one boolean flag column per screened feature, True where the value falls
# outside its fence. The thresholds are hard-coded here.
# NOTE(review): they presumably were copied from the df_outliers table (which is
# computed on `zillow`, not on `df`) — confirm they still match after any data refresh.
# The lower bathroom fence is 0.5, on the same half-step grid as every other threshold here.
df['bathroomcnt_outlier'] = (df['bathroomcnt'] > 4.5) | (df['bathroomcnt'] < 0.5)
df['bedroomcnt_outlier'] = (df['bedroomcnt'] > 5.5) | (df['bedroomcnt'] < 1.5)
# Only an upper fence is applied to the next two columns and to land tax value.
df['calculatedfinishedsquarefeet_outlier'] = (df['calculatedfinishedsquarefeet'] > 3633.0)
df['roomcnt_outlier'] = (df['roomcnt'] > 12.5)
df['yearbuilt_outlier'] = (df['yearbuilt'] > 2016.5) | (df['yearbuilt'] < 1908.5)
df['landtaxvaluedollarcnt_outlier'] = (df['landtaxvaluedollarcnt'] > 743335.5)


In [None]:
# Report, for each flag column, the percentage of rows in df that are flagged.
for flag in (
    'bathroomcnt_outlier',
    'bedroomcnt_outlier',
    'calculatedfinishedsquarefeet_outlier',
    'roomcnt_outlier',
    'yearbuilt_outlier',
    'landtaxvaluedollarcnt_outlier',
):
    print(f"{flag} - {df[flag].sum()/df.shape[0] * 100}")

In [None]:
# Shape before the flagged rows are removed, for comparison with the shape shown after the drop.
df.shape

In [None]:
# Remove flagged rows from df in place, one flag at a time, then show the resulting shape.
# NOTE(review): 'landtaxvaluedollarcnt_outlier' is computed and reported above but is
# not among the flags dropped here — confirm whether that omission is intentional.
for flag in (
    'bathroomcnt_outlier',
    'bedroomcnt_outlier',
    'calculatedfinishedsquarefeet_outlier',
    'roomcnt_outlier',
    'yearbuilt_outlier',
):
    flagged_index = df[df[flag]].index
    df.drop(flagged_index, inplace=True)
df.shape


# print(f"Percentage of rows removed is {(1 - 1978229/2139825) * 100}")

In [None]:
# The helper flag columns have served their purpose; remove them from df in place.
outlier_flag_columns = [
    'bathroomcnt_outlier',
    'bedroomcnt_outlier',
    'calculatedfinishedsquarefeet_outlier',
    'roomcnt_outlier',
    'yearbuilt_outlier',
    'landtaxvaluedollarcnt_outlier',
]
df.drop(columns=outlier_flag_columns, inplace=True)

In [None]:
# Drop all the nan rows that are present (less than 1% of the total data)
# NOTE(review): the "<1%" figure is not computed anywhere in this notebook; compare
# df.shape before and after this cell to confirm it still holds.
df.dropna(inplace=True)

In [None]:
# Re-split, this time from the cleaned full-feature df, with 'taxvaluedollarcnt' as the target.
# This rebinds X_train / y_train / ... that the MVP section created earlier.
# NOTE(review): modeling=False presumably skips the scaling step — confirm in wrangle.wrangle_data.
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.wrangle_data(df, 'taxvaluedollarcnt', modeling=False)
X_train.head()

In [None]:
# Shape of df after outlier rows, flag columns, and NaN rows have been removed.
df.shape

In [None]:
# List the feature columns in the training split before narrowing them for feature selection.
X_train.columns

In [None]:
# Drop columns we can't use for K best due to values.
# The list is defined once and applied to all three splits so they cannot drift apart.
K_BEST_EXCLUDED_COLUMNS = [
    'parcelid',
    'fips',
    'latitude',
    'longitude',
    'propertylandusetypeid',
    'rawcensustractandblock',
    'regionidcity',
    'regionidcounty',
    'regionidzip',
    'censustractandblock',
    'structuretaxvaluedollarcnt',
    'assessmentyear',
    'landtaxvaluedollarcnt',
    'taxamount',
]

k_X_train = X_train.drop(columns=K_BEST_EXCLUDED_COLUMNS)
k_X_validate = X_validate.drop(columns=K_BEST_EXCLUDED_COLUMNS)
k_X_test = X_test.drop(columns=K_BEST_EXCLUDED_COLUMNS)

In [None]:
# Rows and remaining feature count in the frame that will be passed to SelectKBest.
k_X_train.shape

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Selector that scores each feature against the target with f_regression and keeps the top 10.
# NOTE(review): k=10 should not exceed the number of columns in k_X_train — check
# the shape printed by the previous cell.
f_selector = SelectKBest(score_func=f_regression, k=10)

# Fit on the training split and reduce it to the selected features in one step.
X_reduced = f_selector.fit_transform(k_X_train, y_train)

# Boolean mask over k_X_train's columns: True where the feature was kept.
f_support = f_selector.get_support()
f_feature = k_X_train.columns[f_support].tolist()

print(f"{len(f_feature)} selected features")
print(f_feature)

In [None]:
# Run the same selection through the project's helper, for comparison with the manual
# SelectKBest cell above.
# NOTE(review): presumably wrangle.select_kbest wraps the same SelectKBest/f_regression steps — confirm.
wrangle.select_kbest(k_X_train, y_train, k_features=10)