In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import env
import csv

In [2]:
df = pd.read_csv('zillow.csv')

In [3]:
pd.set_option('display.max_columns', None) 


In [4]:
def remove_dup_col(df):
    df = df.loc[:,~df.columns.duplicated()]
    return df

In [5]:
df = remove_dup_col(df)

In [6]:
def handle_missing_values(df, prop_required_column = .9, prop_required_row = .75):
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df


def data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75):
    df.drop(columns = cols_to_remove)
    df = handle_missing_values(df, prop_required_column, prop_required_row)
    return df

In [7]:
df = data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75)

In [10]:
def drop_col(df):
        df = df.drop(columns = ['calculatedbathnbr', 'finishedsquarefeet12', 'fullbathcnt', 'propertycountylandusecode',
                            'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidzip', 
                            'censustractandblock', 'transactiondate', 'assessmentyear',
                            'roomcnt', 'regionidcounty'])
        return df

In [11]:
df = drop_col(df)
df.head()

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertyzoningdesc,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,logerror
0,12177905,2288172,3,4,8.0,2376.0,6037,2.0,34245180,-118240722,13038.0,LCR110000*,1,1970.0,108918.0,145143,36225,1777.51,-0.10341
1,10887214,1970746,3,3,8.0,1312.0,6037,2.0,34185120,-118414640,278581.0,LAR3,1,1964.0,73681.0,119407,45726,1533.89,0.00694
2,12095076,781532,3,4,9.0,2962.0,6037,2.0,34145202,-118179824,63000.0,PSR2,1,1950.0,276684.0,773303,496619,9516.26,-0.001011
3,12069064,870991,1,2,5.0,738.0,6037,,34149214,-118239357,4214.0,GLR4YY,1,1922.0,18890.0,218552,199662,2366.08,0.101723
4,12790562,1246926,3,4,9.0,3039.0,6037,2.0,33960230,-118006914,20028.0,WHRE20000*,1,1970.0,177527.0,220583,43056,3104.19,-0.040966


In [12]:
x = df['landtaxvaluedollarcnt']
y = df['lotsizesquarefeet']
ols_model = ols('lotsizesquarefeet ~ landtaxvaluedollarcnt', data=df).fit()

df['yhat'] = ols_model.predict(df[['landtaxvaluedollarcnt']])

In [13]:
df.lotsizesquarefeet = np.where(df.lotsizesquarefeet.isna(), df.yhat, df.lotsizesquarefeet)

In [14]:
def impute_values(df):
    sqfeet = df.calculatedfinishedsquarefeet.median()
    df.calculatedfinishedsquarefeet = df.calculatedfinishedsquarefeet.fillna(sqfeet)
    
    structuretaxvalue = df.structuretaxvaluedollarcnt.median()
    df.structuretaxvaluedollarcnt = df.structuretaxvaluedollarcnt.fillna(structuretaxvalue)
    
    taxvalue = df.taxvaluedollarcnt.median()
    df.taxvaluedollarcnt = df.taxvaluedollarcnt.fillna(taxvalue)
    
    landtaxvalue = df.landtaxvaluedollarcnt.median()
    df.landtaxvaluedollarcnt = df.landtaxvaluedollarcnt.fillna(landtaxvalue)
    
    return df

In [15]:
x = df['taxvaluedollarcnt']
y = df['taxamount']
ols_model = ols('lotsizesquarefeet ~ taxvaluedollarcnt', data=df).fit()

df['yhat'] = ols_model.predict(df[['taxvaluedollarcnt']])

In [16]:
# use y when we have y, if y is NaN, use yhat
df.taxamount = np.where(df.taxamount.isna(), df.yhat, df.taxamount)

In [17]:
df.fillna(value=pd.np.nan, inplace=True)
df = df.dropna()
def drop_col2(df): #Drop additional columns that are no longer of use. 
        df = df.drop(columns = ['taxamount', 'yhat'])
        return df

df = drop_col2(df)

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45631 entries, 0 to 46941
Data columns (total 18 columns):
parcelid                        45631 non-null int64
id                              45631 non-null int64
bathroomcnt                     45631 non-null int64
bedroomcnt                      45631 non-null int64
buildingqualitytypeid           45631 non-null float64
calculatedfinishedsquarefeet    45631 non-null int64
fips                            45631 non-null int64
heatingorsystemtypeid           45631 non-null float64
latitude                        45631 non-null int64
longitude                       45631 non-null int64
lotsizesquarefeet               45631 non-null int64
propertyzoningdesc              45631 non-null object
unitcnt                         45631 non-null int64
yearbuilt                       45631 non-null int64
structuretaxvaluedollarcnt      45631 non-null int64
taxvaluedollarcnt               45631 non-null int64
landtaxvaluedollarcnt           45631 

In [109]:
df[['bathroomcnt', 'calculatedfinishedsquarefeet', 'bedroomcnt','fips', 'latitude', 'longitude', 'lotsizesquarefeet', 'yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt']] =df[['bathroomcnt', 'calculatedfinishedsquarefeet', 'bedroomcnt','fips', 'latitude', 'longitude', 'lotsizesquarefeet', 'yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt']].astype('int64')

In [115]:
def get_outliers(df):
    arr = df[col]
    elements = numpy.array(arr)

    mean = numpy.mean(elements, axis=0)
    sd = numpy.std(elements, axis=0)
    result = [x for x in arr if (x > mean - 3 * sd)]
    result = [x for x in result if (x < mean + 3 * sd)]
    return result
def add_outlier_columns(df):
    outlier_cols = {col + '_outliers': get_outliers(df[col])
                   for col in df.select_dtypes('number')}
    return df.assign(**outlier_cols)
    for col in df.select_dtypes('number'):
        df[col + '_outliers'] = get_outliers(df[col])
    return df


In [116]:
new_df = add_outlier_columns(df)

KeyError: 'logerror_outliers'

In [112]:
outlier_cols = [col for col in new_df if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = new_df[col][new_df[col] > 0]
    print(data.describe())

~~~
parcelid_outliers
count    1.100000e+01
mean     1.497426e+08
std      5.724943e+03
min      1.497401e+08
25%      1.497404e+08
50%      1.497408e+08
75%      1.497413e+08
max      1.497597e+08
Name: parcelid_outliers, dtype: float64
~~~
id_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: id_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bedroomcnt_outliers, dtype: float64
~~~
buildingqualitytypeid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: buildingqualitytypeid_outliers, dtype: float64
~~~
calculatedfinishedsquarefeet_outliers
count    0.0
mean     NaN
std   

In [97]:
new_df = new_df[(new_df.logerror_outliers ==0) & (new_df.lotsizesquarefeet_outliers == 0)]

In [98]:
outlier_cols = [col for col in new_df if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = new_df[col][new_df[col] > 0]
    print(data.describe())

~~~
parcelid_outliers
count    1.100000e+01
mean     1.497426e+08
std      5.724943e+03
min      1.497401e+08
25%      1.497404e+08
50%      1.497408e+08
75%      1.497413e+08
max      1.497597e+08
Name: parcelid_outliers, dtype: float64
~~~
id_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: id_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bedroomcnt_outliers, dtype: float64
~~~
buildingqualitytypeid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: buildingqualitytypeid_outliers, dtype: float64
~~~
calculatedfinishedsquarefeet_outliers
count    0.0
mean     NaN
std   

In [99]:
new_df = new_df[(new_df.bathroomcnt_outliers == 0) & (new_df.calculatedfinishedsquarefeet_outliers == 0) & (new_df.calculatedfinishedsquarefeet_outliers == 0)]
        
        
        
    

In [100]:
outlier_cols = [col for col in new_df if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = new_df[col][new_df[col] > 0]
    print(data.describe())

~~~
parcelid_outliers
count    1.100000e+01
mean     1.497426e+08
std      5.724943e+03
min      1.497401e+08
25%      1.497404e+08
50%      1.497408e+08
75%      1.497413e+08
max      1.497597e+08
Name: parcelid_outliers, dtype: float64
~~~
id_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: id_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bedroomcnt_outliers, dtype: float64
~~~
buildingqualitytypeid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: buildingqualitytypeid_outliers, dtype: float64
~~~
calculatedfinishedsquarefeet_outliers
count    0.0
mean     NaN
std   

In [101]:
new_df = new_df[(new_df.structuretaxvaluedollarcnt_outliers == 0) & (new_df.taxvaluedollarcnt_outliers == 0) & (new_df.landtaxvaluedollarcnt_outliers == 0)]
        
        
        
    

In [102]:
new_df.describe()

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedfinishedsquarefeet,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,logerror,parcelid_outliers,id_outliers,bathroomcnt_outliers,bedroomcnt_outliers,buildingqualitytypeid_outliers,calculatedfinishedsquarefeet_outliers,fips_outliers,heatingorsystemtypeid_outliers,latitude_outliers,longitude_outliers,lotsizesquarefeet_outliers,unitcnt_outliers,yearbuilt_outliers,structuretaxvaluedollarcnt_outliers,taxvaluedollarcnt_outliers,landtaxvaluedollarcnt_outliers,logerror_outliers
count,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0,38500.0
mean,11911020.0,1500122.0,2.169506,3.010156,6.477377,1654.542104,6037.0,3.642779,34109510.0,-118250100.0,12209.277766,1.0,1962.998286,158681.735558,399643.0,240961.3,2e-05,42783.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,2731404.0,860660.5,0.867487,0.93105,1.632956,702.27359,0.0,2.467115,223580.6,218033.7,12579.538707,0.0,23.814059,114140.896475,307721.2,232118.3,0.104401,2530782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,10711860.0,1406.0,1.0,1.0,1.0,152.0,6037.0,2.0,33339530.0,-118889200.0,736.0,1.0,1878.0,181.0,3254.0,161.0,-4.65542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11189220.0,764701.2,2.0,2.0,6.0,1165.0,6037.0,2.0,33954030.0,-118413200.0,5867.0,1.0,1949.0,80939.0,180374.0,62521.0,-0.028546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11886330.0,1501595.0,2.0,3.0,6.0,1491.0,6037.0,2.0,34091300.0,-118259200.0,7286.0,1.0,1958.0,130177.0,328919.0,179991.0,0.00466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,12522700.0,2246601.0,3.0,4.0,8.0,1964.0,6037.0,7.0,34193760.0,-118103700.0,11388.5,1.0,1981.0,203470.25,524884.2,333893.8,0.036078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,167656000.0,2982188.0,7.0,9.0,12.0,5271.0,6037.0,20.0,34755080.0,-117653900.0,72208.0,1.0,2016.0,717211.0,1912075.0,1425000.0,0.305291,149759700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
