# Acquire

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from math import sqrt
from scipy import stats

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import os

from env import host, user, password

In [2]:
def get_connection(db, user=user, host=host, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

From https://help.rentingwell.com/article/multi-unit-vs-single-unit/:

> A **multi-unit property** is a rental property that has been divided into multiple units that are rented independently to different tenants. A duplex, a triplex, or an apartment building would all be multi-unit properties.

> A **single-unit property** is a rental property that is rented as a single entity. A condo, a townhouse, or a vacation rental would typically be single-unit properties.

We only want to predict on single-unit properties, so I will select only single-unit properties from the Zillow data in SQL:
 * **261 -** Single Family Residential
 * **262 -** Rural Residence
 * **263 -** Mobile Home
 * **264 -** Townhouse
 * **268 -** Row House
 * **273 -** Bungalow
 * **274 -** Zero Lot Line
 * **275 -** Manufactured, Modular, Prefabricated Homes
 * **276 -** Patio Home
 * **279 -** Inferred Single Family Residential

In [8]:
# De-bug: Why is this returning blank?

sql_query = '''
            SELECT parcelid, id, bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fips, latitude, longitude, regionidcounty, roomcnt, yearbuilt, taxvaluedollarcnt, assessmentyear, transactiondate, propertycountylandusecode, propertylandusetypeid
            FROM properties_2017
            JOIN predictions_2017 USING (parcelid, id)
            WHERE transactiondate BETWEEN '2017-05-01' AND '2017-06-30'
            AND propertylandusetypeid IN ('261', '262', '263', '264', '268', '273', '274', '275', '276', '279');
            '''
df = pd.read_sql(sql_query, get_connection('zillow'))
#df.to_csv('zillow_df.csv')
df.head(3)

Unnamed: 0,parcelid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,roomcnt,yearbuilt,taxvaluedollarcnt,assessmentyear,transactiondate,propertycountylandusecode,propertylandusetypeid


In [4]:
df.shape

(0, 17)

In [5]:
df.isnull().sum()

# parcelid, id, bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fips, latitude, longitude, regionidcounty, roomcnt, yearbuilt, taxvaluedollarcnt, assessmentyear, transactiondate

parcelid                        0
id                              0
bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
fips                            0
latitude                        0
longitude                       0
regionidcounty                  0
roomcnt                         0
yearbuilt                       0
taxvaluedollarcnt               0
assessmentyear                  0
transactiondate                 0
propertycountylandusecode       0
propertylandusetypeid           0
dtype: int64

I'm going to go back to my SQL query and edit it to select only the columns that I want. I won't select columns that contain mostly null values.

calculatedbathnbr has 55 null values, so I am going to recreate that variable on the next interation of my MVP with a combination of bathroomcnt and bedroomcnt. For now, I will leave it and impute the mean.

If I have time in a future iteration I will use longitude and latitude with Geopy.

Although regionidzip has only 10 missing values, I'm choosing to leave it out because I don't think impute a median will give useful or near-accurate values.

I'm also not selecting columns that have similar or duplicate information.