# Acquire

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from scipy import stats
import sklearn.preprocessing


from env import user, password, host
import wrangle

In [2]:
def get_db_url(database):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

In [3]:
url = get_db_url('zillow')

In [4]:
df = pd.read_sql('''
SELECT  propertylandusetypeid
        bedroomcnt,
        bathroomcnt,
        calculatedfinishedsquarefeet,
        taxvaluedollarcnt,
        yearbuilt,
        taxamount,
        fips
FROM properties_2017 
JOIN propertylandusetype USING (propertylandusetypeid)
JOIN predictions_2017 ON properties_2017.id = predictions_2017.id
WHERE propertylandusetype.propertylandusedesc = "Single Family Residential" 
AND predictions_2017.transactiondate LIKE "2017%%"
''', url)

In [5]:
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,261.0,3.0,2883.0,537944.0,1957.0,6689.31,6037.0
1,261.0,2.0,1922.0,405551.0,1960.0,5007.01,6037.0
2,261.0,2.0,1696.0,294146.0,1960.0,3661.14,6037.0
3,261.0,2.0,1696.0,252549.0,1960.0,3161.39,6037.0
4,261.0,2.0,1470.0,512000.0,1957.0,6242.87,6037.0


# Clean/Prep

In [6]:
df.shape

(56079, 7)

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedroomcnt,56079.0,261.0,0.0,261.0,261.0,261.0,261.0,261.0
bathroomcnt,56079.0,2.239457,1.007012,0.0,2.0,2.0,3.0,20.0
calculatedfinishedsquarefeet,55848.0,1868.285704,966.973076,20.0,1260.0,1630.0,2217.0,26345.0
taxvaluedollarcnt,56071.0,457643.446951,663558.094993,9.0,187026.0,329525.0,533484.5,67506489.0
yearbuilt,55830.0,1961.186262,21.995171,1862.0,1949.0,1958.0,1976.0,2016.0
taxamount,55949.0,5589.36349,8541.810318,8.3,2523.76,4130.48,6410.56,1195295.62
fips,56079.0,6048.841581,21.23761,6037.0,6037.0,6037.0,6059.0,6111.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56079 entries, 0 to 56078
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bedroomcnt                    56079 non-null  float64
 1   bathroomcnt                   56079 non-null  float64
 2   calculatedfinishedsquarefeet  55848 non-null  float64
 3   taxvaluedollarcnt             56071 non-null  float64
 4   yearbuilt                     55830 non-null  float64
 5   taxamount                     55949 non-null  float64
 6   fips                          56079 non-null  float64
dtypes: float64(7)
memory usage: 3.0 MB


In [9]:
df.isnull().sum()

bedroomcnt                        0
bathroomcnt                       0
calculatedfinishedsquarefeet    231
taxvaluedollarcnt                 8
yearbuilt                       249
taxamount                       130
fips                              0
dtype: int64

In [10]:
# What's the percentage of nulls?
df.isna().mean()

bedroomcnt                      0.000000
bathroomcnt                     0.000000
calculatedfinishedsquarefeet    0.004119
taxvaluedollarcnt               0.000143
yearbuilt                       0.004440
taxamount                       0.002318
fips                            0.000000
dtype: float64

In [11]:
# If we drop all rows containing any null, how much data do we have left?
round(df.dropna().shape[0] / df.shape[0], 4)

0.9939

In [12]:
for column in df.columns:
    print(column)
    print(df[column].value_counts())
    print("--------------------")

bedroomcnt
261.0    56079
Name: bedroomcnt, dtype: int64
--------------------
bathroomcnt
2.0     24535
3.0     11152
1.0     10630
2.5      3723
4.0      2147
1.5       793
5.0       780
3.5       736
4.5       506
0.0       366
6.0       303
5.5       165
7.0       116
8.0        37
6.5        30
9.0        26
10.0       11
7.5        11
8.5         5
11.0        3
13.0        1
9.5         1
20.0        1
19.5        1
Name: bathroomcnt, dtype: int64
--------------------
calculatedfinishedsquarefeet
1200.0    136
1080.0    113
1120.0    105
1400.0     99
1040.0     99
         ... 
7439.0      1
4365.0      1
5836.0      1
5685.0      1
3351.0      1
Name: calculatedfinishedsquarefeet, Length: 4684, dtype: int64
--------------------
taxvaluedollarcnt
460000.0     24
440000.0     22
450000.0     22
425000.0     22
570000.0     21
             ..
699275.0      1
2500000.0     1
878607.0      1
144510.0      1
374696.0      1
Name: taxvaluedollarcnt, Length: 43893, dtype: int64
-------

**Takeaways So Far**
- Bedrooms can be an integer
- Bathrooms can stay a float to keep the .5 bathrooms
- Year built, FIPS, and taxvaluedollarcnt can be converted to integers w/o data loss

In [13]:
# Dropping nulls due to relative insignificance to overall dataset
df = df.dropna()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55735 entries, 0 to 56078
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bedroomcnt                    55735 non-null  float64
 1   bathroomcnt                   55735 non-null  float64
 2   calculatedfinishedsquarefeet  55735 non-null  float64
 3   taxvaluedollarcnt             55735 non-null  float64
 4   yearbuilt                     55735 non-null  float64
 5   taxamount                     55735 non-null  float64
 6   fips                          55735 non-null  float64
dtypes: float64(7)
memory usage: 3.4 MB


In [15]:
# 100% of calculatedfinishedsquarefeet can be converted to int w/o data loss
(df.calculatedfinishedsquarefeet == df.calculatedfinishedsquarefeet.astype(int)).mean()

1.0

In [16]:
# 100% of calculatedfinishedsquarefeet can lose the decimal and be OK
(df.taxvaluedollarcnt == df.taxvaluedollarcnt.astype(int)).mean()

1.0

In [17]:
(df.bathroomcnt == df.bathroomcnt.astype(int)).mean()

0.8930653987619988

In [18]:
# fips, yearbuilt, and bedrooms can be integers
df["fips"] = df["fips"].astype(int)
df["yearbuilt"] = df["yearbuilt"].astype(int)
df["bedroomcnt"] = df["bedroomcnt"].astype(int)
df["taxvaluedollarcnt"] = df["taxvaluedollarcnt"].astype(int)
df["calculatedfinishedsquarefeet"] = df["calculatedfinishedsquarefeet"].astype(int)

In [19]:
df.dtypes

bedroomcnt                        int64
bathroomcnt                     float64
calculatedfinishedsquarefeet      int64
taxvaluedollarcnt                 int64
yearbuilt                         int64
taxamount                       float64
fips                              int64
dtype: object

In [20]:
df.drop(columns = 'taxamount')

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,fips
0,261,3.0,2883,537944,1957,6037
1,261,2.0,1922,405551,1960,6037
2,261,2.0,1696,294146,1960,6037
3,261,2.0,1696,252549,1960,6037
4,261,2.0,1470,512000,1957,6037
...,...,...,...,...,...,...
56074,261,2.5,2212,233515,1980,6111
56075,261,3.0,2279,479342,1980,6111
56076,261,2.5,1891,379680,1984,6111
56077,261,2.0,1815,365491,1981,6111


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55735 entries, 0 to 56078
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bedroomcnt                    55735 non-null  int64  
 1   bathroomcnt                   55735 non-null  float64
 2   calculatedfinishedsquarefeet  55735 non-null  int64  
 3   taxvaluedollarcnt             55735 non-null  int64  
 4   yearbuilt                     55735 non-null  int64  
 5   taxamount                     55735 non-null  float64
 6   fips                          55735 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 3.4 MB


In [22]:
# How many different categories are in fips?
df['fips'].nunique()

3

In [23]:
df.fips.value_counts()

6037    37099
6059    13901
6111     4735
Name: fips, dtype: int64

fips is akin to county code:
- 6037 = LA county
- 6059 = Orange county
- 6111 = Ventura county

In [24]:
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,261,3.0,2883,537944,1957,6689.31,6037
1,261,2.0,1922,405551,1960,5007.01,6037
2,261,2.0,1696,294146,1960,3661.14,6037
3,261,2.0,1696,252549,1960,3161.39,6037
4,261,2.0,1470,512000,1957,6242.87,6037


In [25]:
df = df.replace(to_replace={'6037':'LA County', '6059':'Orange County', '6111':'Ventura County'}, value=none)

NameError: name 'none' is not defined

In [None]:
df.rename(columns = {'bedroomcnt':'bedrooms', 'bathroomcnt':'bathrooms', 'calculatedfinishedsquarefeet':'sq_ft', 'taxvaluedollarcnt':'property_value', 'yearbuilt':'year_built', 'fips':'location'})

In [None]:
def wrangle_zillow():
    
    import env
    '''
    Read student_grades csv file into a pandas DataFrame,
    drop student_id column, replace whitespaces with NaN values,
    drop any rows with Null values, convert all columns to int64,
    return cleaned student grades DataFrame.
    '''
    # Acquire data from SQL.
    def get_db_url(dbname, username=env.user, hostname=env.host, passw=env.password):
        url = f'mysql+pymysql://{username}:{passw}@{hostname}/zillow'
        return url

    url = get_db_url('zillow', env.user, env.host, env.password)
    
    df = pd.read_sql('''
        SELECT  propertylandusetypeid
                bedroomcnt,
                bathroomcnt,
                calculatedfinishedsquarefeet,
                taxvaluedollarcnt,
                yearbuilt,
                taxamount,
                fips
        FROM properties_2017 as p17
        WHERE propertylandusetypeid = '261'
        ''', url)

    ## Clean data, dropping rows and converting dtypes ##
    
    # Drop all rows with NaN values.
    df = df.dropna()

    # Converting fips, yearbuilt, and bedrooms, taxvaluedollarcnt, and calculatedfinishedsquarefeet into integers
    df["fips"] = df["fips"].astype(int)
    df["yearbuilt"] = df["yearbuilt"].astype(int)
    df["bedroomcnt"] = df["bedroomcnt"].astype(int)
    df["taxvaluedollarcnt"] = df["taxvaluedollarcnt"].astype(int)
    df["calculatedfinishedsquarefeet"] = df["calculatedfinishedsquarefeet"].astype(int)
    
    # Manually handle outliers that do not represent properties likely for 99% of buyers and zillow visitors 
    df = df[df.bathroomcnt <= 6]
    
    df = df[df.bedroomcnt <= 6]

    df = df[df.taxvaluedollarcnt < 2_000_000]
    
    return df

In [None]:
df = wrangle_zillow()

In [None]:
df.info()  

In [None]:
df.head()

In [None]:
df.shape

# Exploration / Exploratory Analysis / Statistical Testing

# Modeling

# Modeling Evaluation