In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats
from scipy.stats import shapiro
from scipy.stats import anderson
from statsmodels.formula.api import ols
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, StandardScaler
from statsmodels.graphics.gofplots import qqplot
from math import sqrt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


In [2]:
import env
df = pd.read_csv("zillow.csv")

In [3]:
pd.set_option('display.max_columns', None) 

In [4]:
def remove_dup_col(df):
    df = df.loc[:,~df.columns.duplicated()]
    return df
df = remove_dup_col(df)

In [5]:
def df2(df):
    num_rows_missing = df.isna().sum()
    pct_rows_missing = num_rows_missing/len(df)*100
    df_sum = pd.DataFrame()
    df_sum['num_rows_missing'] = num_rows_missing
    df_sum['pct_rows_missing'] = pct_rows_missing
    return df_sum

In [6]:
def handle_missing_values(df, prop_required_column = .9, prop_required_row = .75):
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df


def data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75):
    df.drop(columns = cols_to_remove)
    df = handle_missing_values(df, prop_required_column, prop_required_row)
    return df
df = data_prep(df, cols_to_remove=[], prop_required_column=.9, prop_required_row=.75)

In [7]:
df.columns
df = df.drop(columns= ['calculatedbathnbr','finishedsquarefeet12', 'fips',
       'fullbathcnt', 'propertycountylandusecode', 'propertyzoningdesc', 'regionidcity', 'roomcnt', 'unitcnt', 'censustractandblock','transactiondate'] )

In [8]:
df['taxrate'] = df.taxamount/df.taxvaluedollarcnt
df['age'] = 2017 - df.yearbuilt
# pd.options.display.float_format = '{:.5f}'.format
# df.latitude = df.latitude.astype(float)/1000000
# # df.latitude.astype(float)/1000000
# df.longitude = df.longitude.astype(float)/1000000
# df.longitude
# df.head()

In [9]:
df['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt/df.calculatedfinishedsquarefeet
df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet

In [10]:
# create a reference table of the box coordinates

def box_coordinate_reference():
    data = {'box': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 
            'lat_max': [34.20554422, 34.04802211, 33.99908439, 33.80387687, 33.78696072, 33.69952829, 33.59712547, 
                        33.49972314, 34.38229689, 34.00526394, 34.06277594, 33.90355421, 33.80675031, 33.72661788, 
                        33.63681764], 
            'lat_min': [34.00187156, 33.97400195, 33.6995076, 33.69879119, 33.60475644, 33.60475644, 33.50224095, 
                        33.40473173, 34.08712558, 33.80359565, 33.89700744, 33.74602844, 33.64904627, 33.56876661, 
                        33.47880174], 
            'lon_max': [-119.1012469, -118.4853365, -118.3643017, -118.010889, -117.995256, -117.8587745, -117.7050516,
                        -117.595489, -117.1573461, -118.0072968, -116.7958861, -116.765738, -116.5865933, -116.4174318,
                        -116.2908127], 
            'lon_min': [-119.2668432, -118.5999006, -118.4982449, -118.5063201, -118.1009925, -118.1009925, -117.9472695, 
                        -117.837707, -118.9924736, -118.3552993, -118.0734651, -118.0401843, -117.8610396, -117.7330499, 
                        -117.6064309]}
    ref_df = pd.DataFrame(data)
    return ref_df

def find_box_id(df, ref_df):
#     box_coordinate_reference()
    df2 = df[['latitude', 'longitude', 'logerror', 'parcelid']]
    
    # divide lat and lon by 1,000,000
    df2['latitude'] = df2.latitude / 1e6
    df2['longitude'] = df2.longitude / 1e6
    
    # set lat and lon to be dual indices
    df2 = df2.sort_values(['latitude', 'longitude']).set_index(['latitude', 'longitude'])

    # create an empty dataframe that all the box_ids will be appended to as we loop through the box coordinates 
    # to identify each parcel
    box_df = pd.DataFrame(columns = ['latitude', 'longitude', 'logerror', 'parcelid', 'box_id'])

    for i, row in ref_df.iterrows():
        box = df2.query('@row.lat_min <= latitude <= @row.lat_max and @row.lon_min <= longitude <= @row.lon_max')
        box['box_id'] = i + 1
        box = box.reset_index()
        box_df = box_df.append(box)
        
    # set index to be parcel id for joining the 2 tables. \
    df = df.set_index('parcelid')
    box_df = box_df.set_index('parcelid')
    box_df = box_df.drop(columns=['latitude', 'longitude', 'logerror'])

    # join the box id's to the original dataframe
    return df.join(box_df).reset_index()



In [11]:
box_coordinate_reference()
df = find_box_id(df, box_coordinate_reference())
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48285 entries, 0 to 48284
Data columns (total 26 columns):
parcelid                        48285 non-null int64
id                              48285 non-null int64
bathroomcnt                     48285 non-null int64
bedroomcnt                      48285 non-null int64
buildingqualitytypeid           47854 non-null float64
calculatedfinishedsquarefeet    48282 non-null float64
heatingorsystemtypeid           47644 non-null float64
latitude                        48285 non-null int64
longitude                       48285 non-null int64
lotsizesquarefeet               47043 non-null float64
propertylandusetypeid           48285 non-null int64
rawcensustractandblock          48285 non-null float64
regionidcounty                  48285 non-null int64
regionidzip                     48265 non-null float64
yearbuilt                       48276 non-null float64
structuretaxvaluedollarcnt      48222 non-null float64
taxvaluedollarcnt          

In [12]:
df.box_id

0          9
1          9
2          9
3          9
4          9
5          9
6          9
7          9
8          9
9          9
10         9
11         9
12         9
13         9
14         9
15         9
16         9
17         9
18         9
19         9
20         9
21         9
22         9
23         9
24         9
25         9
26         9
27         9
28         9
29         9
        ... 
48255     11
48256     11
48257     11
48258     11
48259      5
48260      6
48261     12
48262     12
48263     12
48264      7
48265      9
48266      9
48267    NaN
48268    NaN
48269    NaN
48270    NaN
48271    NaN
48272    NaN
48273    NaN
48274      3
48275      3
48276      3
48277      3
48278      9
48279      9
48280      3
48281      9
48282     11
48283      3
48284      9
Name: box_id, Length: 48285, dtype: object