# Acquire

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from math import sqrt
from scipy import stats

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import os

from env import host, user, password

In [None]:
def get_connection(db, user=user, host=host, password=password):
    """Build the connection URL string for a MySQL database.

    Parameters
    ----------
    db : str
        Name of the database to connect to.
    user, host, password : str
        Connection credentials. Each defaults to the value imported from
        the local ``env`` module.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://<user>:<password>@<host>/<db>``.

    NOTE(review): the values are interpolated as-is. Credentials containing
    characters such as ``@`` or ``/`` would presumably need URL-escaping
    before being placed in the URL -- confirm against the values in ``env``.
    """
    connection_url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return connection_url

From https://help.rentingwell.com/article/multi-unit-vs-single-unit/:

> A **multi-unit property** is a rental property that has been divided into multiple units that are rented independently to different tenants. A duplex, a triplex, or an apartment building would all be multi-unit properties.

> A **single-unit property** is a rental property that is rented as a single entity. A condo, a townhouse, or a vacation rental would typically be single-unit properties.

We only want to predict on single-unit properties, so I will select only single-unit properties from the Zillow data in SQL:
 * **261 -** Single Family Residential
 * **262 -** Rural Residence
 * **263 -** Mobile Home
 * **264 -** Townhouse
 * **268 -** Row House
 * **273 -** Bungalow
 * **274 -** Zero Lot Line
 * **275 -** Manufactured, Modular, Prefabricated Homes
 * **276 -** Patio Home
 * **279 -** Inferred Single Family Residential

> **Debug:** Why is my SQL query returning blank?

In [None]:
# Acquire May-June 2017 transactions for single-unit properties only.
#
# Two problems in the earlier version of this query are fixed here:
#   * `propertylandusetypeid = '261' OR '262' OR ...` compared the column with
#     the first value only. Every other term was a bare literal that MySQL
#     evaluates as true, and because AND binds tighter than OR the date
#     filter was bypassed as well. `IN (...)` is the correct membership test.
#   * `USING (parcelid, parcelid)` listed the join column twice; once is enough.
#
# NOTE(review): if both tables expose an `id` column, the bare `id` in the
# SELECT list is ambiguous and must be qualified with its table name --
# confirm against the schema.

sql_query = '''
            SELECT parcelid, id, bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fips, latitude, longitude, regionidcounty, roomcnt, yearbuilt, taxvaluedollarcnt, assessmentyear, transactiondate, propertycountylandusecode, propertylandusetypeid
            FROM properties_2017
            JOIN predictions_2017 USING (parcelid)
            WHERE transactiondate BETWEEN '2017-05-01' AND '2017-06-30'
            AND propertylandusetypeid IN (261, 262, 263, 264, 268, 273, 274, 275, 276, 279);
            '''
df = pd.read_sql(sql_query, get_connection('zillow'))
#df.to_csv('zillow_df.csv')
df.head(3)

In [None]:
# Reference SQL only: it queries the titles / dept_emp / departments tables,
# not the Zillow tables used in this notebook. Bare SQL is a SyntaxError in a
# Python cell, so it is stored as a string to keep the notebook runnable from
# a fresh kernel. The query is not executed here.
reference_titles_query = '''
SELECT t.title AS Title, 
       COUNT(t.title) AS Count
FROM titles AS t
JOIN dept_emp AS de ON de.emp_no = t.emp_no 
    AND de.to_date > CURDATE() 
    AND t.to_date > CURDATE()
JOIN departments AS d USING(dept_no)
GROUP BY t.title;
'''

In [None]:
# Show the (rows, columns) dimensions of the frame returned by the SQL query.
df.shape

In [None]:
# Count the missing values in each column; the totals drive the decision
# about which columns to keep in the SQL SELECT.
df.isna().sum()

# Columns of interest:
# parcelid, id, bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fips, latitude, longitude, regionidcounty, roomcnt, yearbuilt, taxvaluedollarcnt, assessmentyear, transactiondate

I'm going to go back to my SQL query and edit it to select only the columns that I want. I won't select columns that contain mostly null values.

calculatedbathnbr has 55 null values, so I am going to recreate that variable in the next iteration of my MVP from a combination of bathroomcnt and bedroomcnt. For now, I will leave it as is and impute the mean.

If I have time in a future iteration I might use longitude and latitude with Geopy.

Although regionidzip has only 10 missing values, I'm choosing to leave it out for now because I don't think imputing a median will give useful or near-accurate values, and for location I can use county, as that's what we've been asked to compare anyway.

I'm also not selecting columns that have similar or duplicate information.

In [None]:
#From Austin's code:

SELECT p17.*, pt.propertylandusetypeid, pred17.logerror, pred17.transactiondate
                FROM properties_2017 AS p17
                JOIN propertylandusetype as pt ON p17.propertylandusetypeid = pt.propertylandusetypeid
                JOIN predictions_2017 AS pred17 ON p17.parcelid = pred17.parcelid
                WHERE pred17.transactiondate BETWEEN '2017-05-01' AND '2017-06-30'
                AND pt.propertylandusetypeid <> 31 AND pt.propertylandusetypeid <> 46
                AND pt.propertylandusetypeid <> 47 AND pt.propertylandusetypeid <> 267
                AND pt.propertylandusetypeid <> 269 AND pt.propertylandusetypeid <> 270
                AND pt.propertylandusetypeid <> 271 AND pt.propertylandusetypeid <> 274
                AND pt.propertylandusetypeid <> 273 AND pt.propertylandusetypeid <> 290
                AND pt.propertylandusetypeid <> 291