In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import scipy.stats as stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.impute
import os

from env import host, user, password
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, dbscan

In [None]:
# Building the connection URL string (mysql+pymysql) used to reach the MySQL database

def get_connection(db, user=user, host=host, password=password):
    '''
    get_connection uses login info from env.py file to access Codeup db.
    It takes in a string name of a database as an argument and returns the
    mysql+pymysql connection URL as a string.

    user and password are expected as raw (not already percent-encoded)
    values; they are percent-encoded here so that reserved characters such
    as '@', '/' or ':' in a credential cannot corrupt the URL.
    '''
    # Local import keeps this cell runnable without touching the imports cell.
    from urllib.parse import quote

    # safe='' also encodes '/', which quote() would otherwise leave untouched.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [None]:
def get_zillow_data():
    '''
    get_zillow_data() queries the zillow database (properties_2017 joined to
    predictions_2017 on parcelid) and returns the result as a DF.

    Only rows with a non-null latitude and longitude and a transactiondate
    within 2017 are returned. Nothing is written to disk here; caching to csv
    is handled by get_local_zillow.
    '''
    # Only properties_2017 and predictions_2017 supply selected or filtered
    # columns. The lookup-table LEFT JOINs that used to follow added no columns,
    # and under SELECT DISTINCT they could not change the result set, so they
    # are omitted. Row order is unspecified (no ORDER BY), as before.
    sql_query = '''
                SELECT DISTINCT
      properties_2017.parcelid,
      bathroomcnt,
      bedroomcnt,
      calculatedfinishedsquarefeet,
      fips,
      latitude,
      longitude,
      lotsizesquarefeet,
      yearbuilt,
      structuretaxvaluedollarcnt,
      taxvaluedollarcnt,
      landtaxvaluedollarcnt,
      taxamount,
      predictions_2017.logerror,
      predictions_2017.transactiondate
   FROM properties_2017
   JOIN predictions_2017 USING(parcelid)
   WHERE
      latitude IS NOT NULL
      AND longitude IS NOT NULL
      AND transactiondate BETWEEN '2017-01-01' AND '2017-12-31';
                '''

    # Reading in the DataFrame from Codeup db.
    df = pd.read_sql(sql_query, get_connection('zillow'))
    return df

def get_local_zillow(cache_file='houses.csv'):
    '''
    get_local_zillow returns the zillow data as a DF, using a local csv cache.

    If cache_file exists it is read and returned. Otherwise the data is pulled
    fresh via get_zillow_data(), written to cache_file, and returned.

    cache_file: path of the csv cache; defaults to 'houses.csv'.
    '''
    if os.path.isfile(cache_file):
        # index_col=0 restores the index column that to_csv writes below.
        return pd.read_csv(cache_file, index_col=0)

    # No cache yet: read fresh data from the db and cache it for next time.
    df = get_zillow_data()
    df.to_csv(cache_file)
    return df

In [None]:
# Load through the csv cache helper so re-running this cell does not
# re-query the database once houses.csv exists.
houses = get_local_zillow()
houses.head()

In [None]:
# Row and column counts of the houses frame.
houses.shape

In [5]:
# NOTE(review): exploratory call that prints seaborn's package help text;
# consider removing it (and its output) once it is no longer needed.
help(sb)

Help on package seaborn:

NAME
    seaborn - # Import seaborn objects

PACKAGE CONTENTS
    _core
    _decorators
    _docstrings
    _statistics
    _testing
    algorithms
    axisgrid
    categorical
    cm
    colors (package)
    conftest
    distributions
    external (package)
    matrix
    miscplot
    palettes
    rcmod
    regression
    relational
    tests (package)
    utils
    widgets

DATA
    crayons = {'Almond': '#EFDECD', 'Antique Brass': '#CD9575', 'Apricot':...
    xkcd_rgb = {'acid green': '#8ffe09', 'adobe': '#bd6c48', 'algae': '#54...

VERSION
    0.11.2

FILE
    /opt/homebrew/anaconda3/lib/python3.9/site-packages/seaborn/__init__.py




In [10]:
# Load seaborn's 'tips' example dataset and preview the first rows.
df = sb.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [34]:
def summarize(df):
    '''
    summarize prints a quick overview of a DF: its shape, the output of
    df.info(), and the transposed df.describe(include='all') table.
    Sections are separated by a row of '#' characters. Returns nothing.
    '''
    divider = '\n\n##########\n\n'

    # Shape and divider go out in one print call, so they are space-separated.
    print(f'Shape: {df.shape}', divider)

    print('Info')
    df.info()
    print(divider)

    print('Statistical description\n')
    description = df.describe(include='all').T
    print(description)

In [35]:
# Print the shape, info, and descriptive statistics of df.
summarize(df)

Shape: (244, 7) 

##########


Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


##########


Statistical description

            count unique     top freq       mean       std   min      25%  \
total_bill  244.0    NaN     NaN  NaN  19.785943  8.902412  3.07  13.3475   
tip         244.0    NaN     NaN  NaN   2.998279  1.383638   1.0      2.0   
sex           244      2    Male  157        NaN       NaN   NaN      NaN   
smoker        244      2      No  151        NaN       NaN   NaN      N