In [1]:
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import acquire_PJ
import wrangle_PJ
import acquire

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Zillow data through the project-local `acquire` module.
# The .info() output in the next cell shows the result is a pandas DataFrame.
b = acquire.get_zillow_data()

In [3]:
# Get a peek at the dataframe: column names, non-null counts and dtypes.
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [4]:
# Create a function that will remove rows and columns that have missing values past a certain threshold.
def handle_missing_values(df, p_row=0.84, p_col=0.84):
    '''Drop sparsely populated columns, then sparsely populated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is NOT modified; a new frame is returned.
    p_row : float, default 0.84
        Proportion of non-null values a row must have, measured against the
        columns that survive the column filter, to be kept.
    p_col : float, default 0.84
        Proportion of non-null values a column must have, measured against
        the row count of `df`, to be kept.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns and rows that met the thresholds.
    '''
    # Columns go first: the row threshold below is derived from the number of
    # columns that remain, so the order of the two filters matters.
    # int() truncates, so the required count is rounded down.
    min_non_null_per_col = int(p_col * len(df.index))
    kept_columns = df.dropna(axis=1, thresh=min_non_null_per_col)

    min_non_null_per_row = int(p_row * len(kept_columns.columns))
    # No inplace=True anywhere: the caller's frame stays as it was, which
    # keeps the calling cell safe to re-run from the raw data.
    return kept_columns.dropna(axis=0, thresh=min_non_null_per_row)

In [5]:
# Apply the missing-value filter with its default thresholds (0.84 for both
# rows and columns) and keep working with the filtered frame.
b = handle_missing_values(b)

In [6]:
# Re-inspect the frame after the missing-value filter.
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77474 entries, 0 to 77573
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77474 non-null  int64  
 1   parcelid                      77474 non-null  int64  
 2   bathroomcnt                   77474 non-null  float64
 3   bedroomcnt                    77474 non-null  float64
 4   calculatedbathnbr             76959 non-null  float64
 5   calculatedfinishedsquarefeet  77373 non-null  float64
 6   finishedsquarefeet12          73919 non-null  float64
 7   fips                          77474 non-null  float64
 8   fullbathcnt                   76959 non-null  float64
 9   latitude                      77474 non-null  float64
 10  longitude                     77474 non-null  float64
 11  lotsizesquarefeet             69300 non-null  float64
 12  propertycountylandusecode     77474 non-null  object 
 13  p

In [7]:
# Count the missing values that remain in each column.
b.isnull().sum()

id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                515
calculatedfinishedsquarefeet     101
finishedsquarefeet12            3555
fips                               0
fullbathcnt                      515
latitude                           0
longitude                          0
lotsizesquarefeet               8174
propertycountylandusecode          0
propertylandusetypeid              0
rawcensustractandblock             0
regionidcity                    1460
regionidcounty                     0
regionidzip                       45
roomcnt                            0
yearbuilt                        169
structuretaxvaluedollarcnt        99
taxvaluedollarcnt                  1
assessmentyear                     0
landtaxvaluedollarcnt              1
taxamount                          5
censustractandblock              226
logerror                           0
t

In [8]:
# Column labels to remove from the frame, one per line for easier review.
columns_to_drop = [
    'id',
    'parcelid',
    'calculatedbathnbr',
    'finishedsquarefeet12',
    'fullbathcnt',
    'propertycountylandusecode',
    'propertylandusetypeid',
    'rawcensustractandblock',
    'regionidcity',
    'regionidcounty',
    'regionidzip',
    'structuretaxvaluedollarcnt',
    'censustractandblock',
    'propertylandusedesc',
]

In [9]:
def drop_columns(df, drop_col):
    '''Return a new frame equal to `df` minus the columns named in `drop_col`.

    `drop_col` is passed straight to `DataFrame.drop(columns=...)`, so it may
    be a single label or a list of labels. The input frame is not modified.
    '''
    return df.drop(columns=drop_col)

In [10]:
# Remove the columns listed in `columns_to_drop`.
# NOTE(review): the saved outputs below still list `id` and `parcelid`, both
# of which are in `columns_to_drop`. The outputs look stale or were produced
# out of order -- restart the kernel and run all cells to confirm.
b = drop_columns(b, columns_to_drop)

In [11]:
# Per-column count of nulls still present after the column drop.
b.isna().sum()

id                          0
parcelid                    0
bathroomcnt                 0
bedroomcnt                  0
fips                        0
latitude                    0
longitude                   0
lotsizesquarefeet        8174
roomcnt                     0
yearbuilt                 169
taxvaluedollarcnt           1
assessmentyear              0
landtaxvaluedollarcnt       1
taxamount                   5
logerror                    0
transactiondate             0
dtype: int64

In [None]:
# Fill the remaining gaps in numeric columns with each column's mean.
# Restricting the means to numeric columns keeps a text column that still has
# nulls from raising inside .mean(); such columns are left untouched here.
# NOTE(review): the means are computed on the full frame. If a train/test
# split follows, they should come from the training split only -- TODO confirm.
column_means = b.mean(numeric_only=True)
b = b.fillna(column_means)

# Compact per-column null count instead of printing every filled Series.
b.isna().sum()

In [12]:
# Column dtypes and non-null counts for the frame as it stands at this point.
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77474 entries, 0 to 77573
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     77474 non-null  int64  
 1   parcelid               77474 non-null  int64  
 2   bathroomcnt            77474 non-null  float64
 3   bedroomcnt             77474 non-null  float64
 4   fips                   77474 non-null  float64
 5   latitude               77474 non-null  float64
 6   longitude              77474 non-null  float64
 7   lotsizesquarefeet      69300 non-null  float64
 8   roomcnt                77474 non-null  float64
 9   yearbuilt              77305 non-null  float64
 10  taxvaluedollarcnt      77473 non-null  float64
 11  assessmentyear         77474 non-null  float64
 12  landtaxvaluedollarcnt  77473 non-null  float64
 13  taxamount              77469 non-null  float64
 14  logerror               77474 non-null  float64
 15  tr