In [1]:
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import acquire_PJ
import wrangle_PJ
import acquire

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
zillow = acquire.get_zillow_data()

In [3]:
# Get a peak of the dataframe
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [4]:
# Create a function that will remove rows and columns that have missing values past a certain threshold.
def handle_missing_values(df, p_row = 0.84, p_col = 0.84):
    ''' function which takes in a dataframe, required notnull proportions of non-null rows and columns.
    drop the columns and rows columns based on theshold:'''
    
    #drop columns with nulls
    threshold = int(p_col * len(df.index)) # Require that many non-NA values.
    df.dropna(axis = 1, thresh = threshold, inplace = True)
    
    #drop rows with nulls
    threshold = int(p_row * len(df.columns)) # Require that many non-NA values.
    df.dropna(axis = 0, thresh = threshold, inplace = True)
    
    
    return df

In [5]:
b = handle_missing_values(zillow)

In [6]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77474 entries, 0 to 77573
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77474 non-null  int64  
 1   parcelid                      77474 non-null  int64  
 2   bathroomcnt                   77474 non-null  float64
 3   bedroomcnt                    77474 non-null  float64
 4   calculatedbathnbr             76959 non-null  float64
 5   calculatedfinishedsquarefeet  77373 non-null  float64
 6   finishedsquarefeet12          73919 non-null  float64
 7   fips                          77474 non-null  float64
 8   fullbathcnt                   76959 non-null  float64
 9   latitude                      77474 non-null  float64
 10  longitude                     77474 non-null  float64
 11  lotsizesquarefeet             69300 non-null  float64
 12  propertycountylandusecode     77474 non-null  object 
 13  p

In [7]:
# Take a look at the amount of missing values.
b.isnull().sum()

id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                515
calculatedfinishedsquarefeet     101
finishedsquarefeet12            3555
fips                               0
fullbathcnt                      515
latitude                           0
longitude                          0
lotsizesquarefeet               8174
propertycountylandusecode          0
propertylandusetypeid              0
rawcensustractandblock             0
regionidcity                    1460
regionidcounty                     0
regionidzip                       45
roomcnt                            0
yearbuilt                        169
structuretaxvaluedollarcnt        99
taxvaluedollarcnt                  1
assessmentyear                     0
landtaxvaluedollarcnt              1
taxamount                          5
censustractandblock              226
logerror                           0
t

In [8]:
# Create a list of columns to drop.
columns_to_drop = ['calculatedbathnbr','calculatedfinishedsquarefeet','finishedsquarefeet12','fullbathcnt','propertycountylandusecode','propertylandusetypeid','rawcensustractandblock','regionidcity','regionidcounty','regionidzip','structuretaxvaluedollarcnt','censustractandblock','propertylandusedesc']

In [9]:
def drop_columns(df, drop_col):
    df = df.drop(columns=drop_col)
    return df

In [10]:
b = drop_columns(b, columns_to_drop)

In [11]:
b.isna().sum()

id                          0
parcelid                    0
bathroomcnt                 0
bedroomcnt                  0
fips                        0
latitude                    0
longitude                   0
lotsizesquarefeet        8174
roomcnt                     0
yearbuilt                 169
taxvaluedollarcnt           1
assessmentyear              0
landtaxvaluedollarcnt       1
taxamount                   5
logerror                    0
transactiondate             0
dtype: int64

In [12]:
for col in b.columns:
    if b[col].isna().sum() > 0:
        b[col] = b[col].fillna(value = b[col].mean())
        print(b[col])

0          4506.0
1         12647.0
2          8432.0
3         13038.0
4        278581.0
           ...   
77569     59487.0
77570     47405.0
77571     12105.0
77572      5074.0
77573      6347.0
Name: lotsizesquarefeet, Length: 77474, dtype: float64
0        1998.0
1        1967.0
2        1962.0
3        1970.0
4        1964.0
          ...  
77569    1980.0
77570    1940.0
77571    1964.0
77572    1954.0
77573    1955.0
Name: yearbuilt, Length: 77474, dtype: float64
0        1023282.0
1         464000.0
2         564778.0
3         145143.0
4         119407.0
           ...    
77569     379000.0
77570     354621.0
77571      67205.0
77572      49546.0
77573     522000.0
Name: taxvaluedollarcnt, Length: 77474, dtype: float64
0        537569.0
1        376000.0
2        479489.0
3         36225.0
4         45726.0
           ...   
77569    114000.0
77570    283704.0
77571     16522.0
77572     16749.0
77573    382000.0
Name: landtaxvaluedollarcnt, Length: 77474, dtype: float64
0  

In [13]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77474 entries, 0 to 77573
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     77474 non-null  int64  
 1   parcelid               77474 non-null  int64  
 2   bathroomcnt            77474 non-null  float64
 3   bedroomcnt             77474 non-null  float64
 4   fips                   77474 non-null  float64
 5   latitude               77474 non-null  float64
 6   longitude              77474 non-null  float64
 7   lotsizesquarefeet      77474 non-null  float64
 8   roomcnt                77474 non-null  float64
 9   yearbuilt              77474 non-null  float64
 10  taxvaluedollarcnt      77474 non-null  float64
 11  assessmentyear         77474 non-null  float64
 12  landtaxvaluedollarcnt  77474 non-null  float64
 13  taxamount              77474 non-null  float64
 14  logerror               77474 non-null  float64
 15  tr

In [15]:
# Drop two more columns.
b = b.drop(columns=['id','parcelid'])

In [16]:
# Convert the fips column into dummy variables that hold the names of the three different counties.
def get_counties(df):
    # create dummy vars of fips id
    county_df = pd.get_dummies(df.fips)
    # rename columns by actual county name
    county_df.columns = ['LA', 'Orange', 'Ventura']
    # concatenate the dataframe with the 3 county columns to the original dataframe
    df_dummies = pd.concat([df, county_df], axis = 1)
    # drop regionidcounty and fips columns
    df = df_dummies.drop(columns = ['fips'])
    return df

In [19]:
b = get_counties(b)

In [40]:
# Remove observations where bedroom and bathroom counts are zero.
b = b[b.bedroomcnt != 0]
b = b[b.bathroomcnt !=0]

In [44]:
b

Unnamed: 0,bathroomcnt,bedroomcnt,latitude,longitude,lotsizesquarefeet,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,logerror,transactiondate,LA,Orange,Ventura
0,3.5,4.0,33634931.0,-117869207.0,4506.0,0.0,1998.0,1023282.0,2016.0,537569.0,11013.72,0.025595,2017-01-01,0,1,0
1,1.0,2.0,34449266.0,-119281531.0,12647.0,5.0,1967.0,464000.0,2016.0,376000.0,5672.48,0.055619,2017-01-01,0,0,1
2,2.0,3.0,33886168.0,-117823170.0,8432.0,6.0,1962.0,564778.0,2016.0,479489.0,6488.30,0.005383,2017-01-01,0,1,0
3,3.0,4.0,34245180.0,-118240722.0,13038.0,0.0,1970.0,145143.0,2016.0,36225.0,1777.51,-0.103410,2017-01-01,1,0,0
4,3.0,3.0,34185120.0,-118414640.0,278581.0,0.0,1964.0,119407.0,2016.0,45726.0,1533.89,0.006940,2017-01-01,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77569,3.0,3.0,34202400.0,-118502000.0,59487.0,0.0,1980.0,379000.0,2016.0,114000.0,4685.34,-0.002245,2017-09-20,1,0,0
77570,2.0,2.0,34245368.0,-118282383.0,47405.0,0.0,1940.0,354621.0,2016.0,283704.0,4478.43,0.020615,2017-09-20,1,0,0
77571,2.0,4.0,34300140.0,-118706327.0,12105.0,7.0,1964.0,67205.0,2016.0,16522.0,1107.48,0.013209,2017-09-21,0,0,1
77572,1.0,3.0,34040895.0,-118038169.0,5074.0,0.0,1954.0,49546.0,2016.0,16749.0,876.43,0.037129,2017-09-21,1,0,0
