In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')

from sklearn.linear_model import LinearRegression



import acquire_zillow
import prepare_zillow

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = prepare_zillow.prep_zill()

In [3]:
df.shape

(91822, 61)

In [4]:
df.head()

Unnamed: 0,id,parcelid,logerror,transactiondate,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,threequarterbathnbr,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,2931667,11016594,0.0276,2016-01-01,,2.0,3.0,4,2.0,,,1684.0,1684.0,,,,,6037,,2.0,,,,34280990.0,-118488536.0,7528.0,,,,,,0100,LARS,60371100.0,12447,3101,31817.0,96370,0.0,,1.0,,,1959.0,,,122754.0,360170.0,2015,237416.0,6735.88,,,60371100000000.0,Central,,,Central,Single Family Residential,,
2,2444575,12098116,-0.004,2016-01-01,,3.0,2.0,4,3.0,,,2217.0,2217.0,,,,,6037,,3.0,,,,34136312.0,-118175032.0,11423.0,,,,,,0100,PSR6,60374600.0,47019,3101,275411.0,96293,0.0,,1.0,,,1940.0,,,61994.0,119906.0,2015,57912.0,11484.48,,,60374600000000.0,Central,,,Central,Single Family Residential,,
3,1584730,12643413,0.0218,2016-01-02,,2.0,2.0,4,2.0,,,839.0,839.0,,,,,6037,,2.0,,,,33755800.0,-118309000.0,70859.0,,,,,,010C,LAR3,60373000.0,12447,3101,54300.0,96222,0.0,,1.0,,,1987.0,,,171518.0,244880.0,2015,73362.0,3048.74,,,60373000000000.0,Central,,,Central,Condominium,,
5,2843942,11509835,-0.2705,2016-01-02,,4.0,4.0,1,4.0,,,3067.0,3067.0,,,,,6037,,4.0,,,,33870089.0,-118402768.0,2708.0,,,,,,0100,HBR3YY,60376200.0,29712,3101,,96109,0.0,,1.0,,,1982.0,,,880650.0,2447951.0,2015,1567301.0,27126.57,,,60376200000000.0,Central,,,Central,Single Family Residential,,
6,1628001,12286022,0.044,2016-01-02,,1.0,2.0,7,1.0,,,1297.0,1297.0,,,,,6037,,1.0,,,,33899475.0,-118212720.0,6677.0,,,,,,0100,CORH*,60375400.0,24174,3101,,96091,0.0,,1.0,,,1939.0,,,64549.0,111521.0,2015,46972.0,2304.97,,,60375400000000.0,,,,Floor/Wall,Single Family Residential,,


In [5]:
# NOTE(review): the df.describe() output recorded later in this notebook shows a
# negative minimum for lotsizesquarefeet (-9386.551), which is not a valid lot
# size. Presumably it is produced by this imputation step -- TODO confirm, and
# clamp or drop such rows if so.
df = prepare_zillow.impute_landsqft(df)

In [6]:
# Columns whose missing values are handed to fill_na_with_0
# (presumably a missing poolcnt means the property has no pool -- TODO confirm).
list_to_fill = ['poolcnt']

df = prepare_zillow.fill_na_with_0(df, list_to_fill)

In [7]:
df = prepare_zillow.drop_null_percent_column(df)

In [None]:
prepare_zillow.column_missing(df)

In [8]:
df.isnull().sum()

id                                 0
parcelid                           0
logerror                           0
transactiondate                    0
bathroomcnt                        0
bedroomcnt                         0
buildingqualitytypeid            763
calculatedbathnbr                  0
calculatedfinishedsquarefeet       0
finishedsquarefeet12               3
fips                               0
fullbathcnt                        0
latitude                           0
longitude                          0
lotsizesquarefeet                  0
poolcnt                            0
propertycountylandusecode          0
propertyzoningdesc               345
rawcensustractandblock             0
regionidcity                    1972
regionidcounty                     0
regionidzip                       17
roomcnt                            0
unitcnt                            0
yearbuilt                         18
structuretaxvaluedollarcnt       105
taxvaluedollarcnt                  0
a

In [9]:
# Drop columns that still have missing values (e.g. buildingqualitytypeid,
# propertyzoningdesc, regionidcity) together with identifier, date and code
# columns that were judged not useful for this analysis.

df = df.drop(
    [
        'id',
        'transactiondate',
        'buildingqualitytypeid',
        'propertyzoningdesc',
        'regionidcity',
        'censustractandblock',
        'heatingorsystemdesc',
        'propertycountylandusecode',
        'rawcensustractandblock',
        'assessmentyear',
    ],
    axis='columns',
)


In [10]:
# Only a few NaN values remain, so drop the rows that contain them.
# Reassign instead of mutating in place: this matches the other cleaning cells
# (df = df.drop(...)) and avoids the inplace=True pattern pandas discourages.

df = df.dropna()

In [11]:
# Per the author's check, calculatedbathnbr and fullbathcnt repeat bathroomcnt,
# and finishedsquarefeet12 repeats calculatedfinishedsquarefeet, so only
# bathroomcnt and calculatedfinishedsquarefeet are kept.

df = df.drop(['calculatedbathnbr', 'fullbathcnt', 'finishedsquarefeet12'], axis=1)


In [12]:
df.shape

(91678, 20)

In [13]:
df.dtypes

parcelid                         object
logerror                        float64
bathroomcnt                     float64
bedroomcnt                      float64
calculatedfinishedsquarefeet    float64
fips                             object
latitude                        float64
longitude                       float64
lotsizesquarefeet               float64
poolcnt                         float64
regionidcounty                   object
regionidzip                      object
roomcnt                         float64
unitcnt                         float64
yearbuilt                       float64
structuretaxvaluedollarcnt      float64
taxvaluedollarcnt               float64
landtaxvaluedollarcnt           float64
taxamount                       float64
propertylandusedesc              object
dtype: object

In [14]:
# NOTE(review), based on the output recorded below this cell:
#   - unitcnt has std 0.0 with min == max == 1.0, i.e. it is constant in this frame.
#   - roomcnt is 0.0 up to the 75th percentile (mean ~0.0005), so it is almost always zero.
#   - lotsizesquarefeet has a negative minimum (-9386.551), which is not a valid lot size.
df.describe()

Unnamed: 0,logerror,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,poolcnt,roomcnt,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount
count,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0,91678.0
mean,0.012706,2.267191,3.028862,1730.548747,34111780.0,-118259600.0,38005.44,0.242665,0.000491,1.0,1964.117673,183540.6,463944.8,280404.2,6077.63819
std,0.159005,0.967215,0.944017,887.940442,220932.8,217914.3,146472.8,0.428696,0.058242,0.0,23.941918,233125.1,649116.8,468834.7,7889.95637
min,-4.65542,1.0,1.0,800.0,33339530.0,-118908200.0,-9386.551,0.0,0.0,1.0,1878.0,100.0,10958.0,161.0,120.84
25%,-0.0284,2.0,2.0,1176.0,33965130.0,-118425000.0,6105.0,0.0,0.0,1.0,1950.0,84600.0,183868.2,62500.0,2732.73
50%,0.005904,2.0,3.0,1493.0,34092090.0,-118284400.0,7757.0,0.0,0.0,1.0,1961.0,133431.0,325350.0,175288.0,4414.52
75%,0.0397,3.0,4.0,1988.0,34196040.0,-118110100.0,18531.0,0.0,0.0,1.0,1983.0,207663.2,529987.0,336481.0,6821.8325
max,5.262999,15.0,16.0,21929.0,34818770.0,-117652100.0,6971010.0,1.0,9.0,1.0,2016.0,9948100.0,27750000.0,24500000.0,321936.09


In [None]:
def iqr_outliers(s, k):
    """Count values of ``s`` that fall outside the IQR fences.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    k : float
        Multiplier applied to the interquartile range to place each fence.

    Returns
    -------
    tuple
        ``(low_count, high_count)``: the number of values below
        ``Q1 - k * IQR`` and the number above ``Q3 + k * IQR``.
    """
    lower_quartile = s.quantile(.25)
    upper_quartile = s.quantile(.75)
    spread = upper_quartile - lower_quartile

    n_below = (s < (lower_quartile - k * spread)).sum()
    n_above = (s > (upper_quartile + k * spread)).sum()
    return n_below, n_above


In [None]:
iqr_outliers(df.logerror, 1.5)

In [None]:
def stdev_outliers(s, k):
    """Count values lying more than ``k`` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    k : float
        Number of standard deviations that defines the cutoff on each side.

    Returns
    -------
    tuple
        ``(low_count, high_count)``: the number of values below
        ``mean - k * std`` and the number above ``mean + k * std``.
    """
    # Request the statistics by name instead of by position in describe():
    # integer lookups on a label-indexed Series are deprecated in pandas, and
    # describe() also computes quartiles that are never used here.
    # Series.std() uses the same sample standard deviation that describe() reports.
    mean = s.mean()
    std = s.std()
    high = s > (mean + (std * k))
    low = s < (mean - (std * k))
    return low.sum(), high.sum()


In [None]:
stdev_outliers(df.logerror, 2)

In [None]:
def percent_outliers(s, k):
    """Count values in the bottom and top ``k`` percent tails of ``s``.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    k : float
        Tail size as a percentage (``1`` means the 1st and 99th percentiles).

    Returns
    -------
    tuple
        ``(low_count, high_count)``: the number of values strictly below the
        ``k``-th percentile and strictly above the ``(100 - k)``-th percentile.
    """
    tail_fraction = k / 100
    lower_cutoff = s.quantile(tail_fraction)
    upper_cutoff = s.quantile(1 - tail_fraction)
    return (s < lower_cutoff).sum(), (s > upper_cutoff).sum()


In [None]:
percent_outliers(df.logerror, 1)

In [None]:
def find_outliers(s, f, k):
    """Count outliers in ``s`` using the detection method named by ``f``.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    f : str
        Detection method: ``'iqr'``, ``'stdev'`` or ``'percent'``.
    k : float
        Method parameter, passed through unchanged: the IQR multiplier for
        ``'iqr'``, the number of standard deviations for ``'stdev'``, or the
        tail percentage for ``'percent'``.

    Returns
    -------
    tuple
        ``(low_count, high_count)`` as returned by the selected method.

    Raises
    ------
    ValueError
        If ``f`` is not one of the supported method names.
    """
    if f == 'iqr':
        return iqr_outliers(s, k)
    elif f == 'stdev':
        return stdev_outliers(s, k)
    elif f == 'percent':
        return percent_outliers(s, k)
    # An unrecognised method name used to fall through and return None, which
    # renders as an empty cell output; fail loudly so typos are noticed.
    raise ValueError(
        "Unknown outlier method {!r}; expected 'iqr', 'stdev' or 'percent'".format(f)
    )
    

In [None]:
find_outliers(df.logerror, 'iqr', 2)

In [None]:
df.dtypes

In [16]:
df.shape

(91678, 20)