In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'classic' style sheet to plots drawn after this point.
plt.style.use('classic')

from sklearn.linear_model import LinearRegression



import acquire_zillow
import prepare_zillow

# Raise pandas' display limits: show up to 100 rows and 500 columns, and
# allow 1000 characters per line before the text repr wraps.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# Build the working DataFrame with the project helper prepare_zillow.prep_zill.
# NOTE(review): the helper lives outside this notebook; presumably it acquires
# and pre-cleans the Zillow data — confirm in prepare_zillow.py.
df = prepare_zillow.prep_zill()

In [None]:
# (rows, columns) of the frame as returned by prep_zill().
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# NOTE(review): by its name this helper imputes missing land/lot square-footage
# values; the imputation strategy is defined in prepare_zillow — confirm there.
df = prepare_zillow.impute_landsqft(df)

In [None]:
# Columns handed to prepare_zillow.fill_na_with_0 (by its name: NaN -> 0).
# NOTE(review): presumably a missing poolcnt means "no pool" — confirm against
# the data dictionary before relying on this.
list_to_fill = ['poolcnt']

df = prepare_zillow.fill_na_with_0(df, list_to_fill)

In [None]:
# NOTE(review): judging by the name, this drops columns whose share of nulls
# exceeds a cutoff chosen inside prepare_zillow — confirm the threshold there.
df = prepare_zillow.drop_null_percent_column(df)

In [None]:
# NOTE(review): presumably a per-column missing-value report from the project
# helper — confirm what it returns in prepare_zillow.
prepare_zillow.column_missing(df)

In [None]:
# Number of null values remaining in each column.
df.isnull().sum()

In [None]:
# The information in these columns is still largely missing and seems useless,
# so drop them.

df = df.drop(columns=['id', 
                      'transactiondate',
                      'buildingqualitytypeid', 
                      'propertyzoningdesc', 
                      'regionidcity', 
                      'censustractandblock', 
                      'heatingorsystemdesc',
                      'propertycountylandusecode',
                      'rawcensustractandblock',
                      'assessmentyear'])


In [None]:
# Only a few NaN values are left, so drop every row that still contains one.
# Reassign instead of mutating in place, like the other cleaning cells do.

df = df.dropna()

In [None]:
# bathroomcnt, calculatedbathnbr and fullbathcnt all hold the same numbers, and
# calculatedfinishedsquarefeet and finishedsquarefeet12 are the same as well,
# so keep only bathroomcnt and calculatedfinishedsquarefeet.

df = df.drop(columns=['calculatedbathnbr', 'fullbathcnt', 'finishedsquarefeet12'])


In [None]:
# (rows, columns) after dropping the sparse and duplicate columns and NaN rows.
df.shape

In [None]:
# dtype of each remaining column.
df.dtypes

In [None]:
# Summary statistics for the remaining columns.
df.describe()

In [None]:
def iqr_outliers(s, k):
    """Count the values of ``s`` that fall outside the IQR fences.

    The fences sit ``k`` interquartile ranges below the first quartile and
    ``k`` interquartile ranges above the third quartile.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to inspect.
    k : number
        Multiplier applied to the interquartile range.

    Returns
    -------
    tuple
        ``(n_low, n_high)``: how many values lie strictly below the lower
        fence and strictly above the upper fence.
    """
    lower_quartile, upper_quartile = s.quantile([.25, .75])
    spread = upper_quartile - lower_quartile

    lower_fence = lower_quartile - k * spread
    upper_fence = upper_quartile + k * spread

    n_low = (s < lower_fence).sum()
    n_high = (s > upper_fence).sum()
    return n_low, n_high


In [None]:
# (low, high) counts of logerror values more than 1.5 * IQR beyond the quartiles.
iqr_outliers(df.logerror, 1.5)

In [None]:
def stdev_outliers(s, k):
    """Count the values of ``s`` more than ``k`` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to inspect.
    k : number
        How many standard deviations away from the mean a value must be
        before it is counted.

    Returns
    -------
    tuple
        ``(n_low, n_high)``: how many values lie strictly below
        ``mean - k * std`` and strictly above ``mean + k * std``.
    """
    # Ask for the statistics directly instead of indexing describe() by
    # position: integer keys on a label-indexed Series are deprecated in
    # pandas, and describe() changes its layout for non-numeric data.
    mean = s.mean()
    std = s.std()  # sample standard deviation, the same value describe() reports
    high = s > (mean + (std * k))
    low = s < (mean - (std * k))
    return low.sum(), high.sum()


In [None]:
# (low, high) counts of logerror values more than 2 standard deviations from the mean.
stdev_outliers(df.logerror, 2)

In [None]:
def percent_outliers(s, k):
    """Count the values of ``s`` in the bottom and top ``k`` percent tails.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to inspect.
    k : number
        Tail size as a percentage (``1`` means the 1st and 99th percentiles).

    Returns
    -------
    tuple
        ``(n_low, n_high)``: how many values lie strictly below the ``k``-th
        percentile and strictly above the ``(100 - k)``-th percentile.
    """
    tail = k / 100  # percentage -> quantile fraction

    lower_cut = s.quantile(tail)
    upper_cut = s.quantile(1 - tail)

    n_low = (s < lower_cut).sum()
    n_high = (s > upper_cut).sum()
    return n_low, n_high


In [None]:
# (low, high) counts of logerror values below the 1st / above the 99th percentile.
percent_outliers(df.logerror, 1)

In [None]:
def find_outliers(s, f, k):
    """Count outliers in ``s`` using the detection method named by ``f``.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect; passed through to the selected counter.
    f : str
        Method name: ``'iqr'`` (iqr_outliers), ``'stdev'`` (stdev_outliers)
        or ``'percent'`` (percent_outliers).
    k : number
        Method-specific threshold, forwarded unchanged.

    Returns
    -------
    tuple
        ``(n_low, n_high)`` as returned by the selected counter.

    Raises
    ------
    ValueError
        If ``f`` is not one of the supported method names. This used to fall
        through and return ``None``, which hid typos in the method name.
    """
    if f == 'iqr':
        return iqr_outliers(s, k)
    if f == 'stdev':
        return stdev_outliers(s, k)
    if f == 'percent':
        return percent_outliers(s, k)
    raise ValueError(
        "unknown outlier method {!r}; expected 'iqr', 'stdev' or 'percent'".format(f)
    )
    

In [None]:
# Same IQR check via the dispatcher, this time with a multiplier of 2.
find_outliers(df.logerror, 'iqr', 2)

In [None]:
# Re-check the column dtypes of the cleaned frame.
df.dtypes

In [None]:
# Final (rows, columns) of the cleaned frame.
df.shape