In [32]:
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import env
import os
pd.set_option('display.max_columns', None)

In [33]:
def get_connection(db, user=env.username, host=env.host, password=env.password):
    """Build the SQLAlchemy connection URL for a MySQL database.

    Parameters
    ----------
    db : str
        Name of the database (schema) to connect to.
    user, host, password : str, optional
        Connection credentials. They default to the values stored in the
        local ``env`` module.

    Returns
    -------
    str
        A URL of the form ``mysql+pymysql://<user>:<password>@<host>/<db>``
        that can be handed to ``pd.read_sql``.
    """
    # Use the arguments (not env.* directly) so caller overrides take effect.
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

In [34]:
def new_zillow_data():
    """Query the zillow database for 2017 single-family-residential sales.

    Joins ``properties_2017`` to ``predictions_2017`` and to the lookup
    tables that describe the type-id columns, keeping rows whose land use is
    "Single Family Residential", whose transaction date is before 2018 and
    whose latitude/longitude are present.

    Returns
    -------
    pandas.DataFrame
        One row per ``parcelid``; when a parcel has several transactions the
        one with the latest ``transactiondate`` is kept.
    """
    sql_query = '''
            select prop.*,
            pred.logerror,
            pred.transactiondate,
            air.airconditioningdesc,
            arch.architecturalstyledesc,
            build.buildingclassdesc,
            heat.heatingorsystemdesc,
            land.propertylandusedesc,
            story.storydesc,
            type.typeconstructiondesc
            from properties_2017 as prop
            Join predictions_2017 as pred using(parcelid)
            LEFT JOIN airconditioningtype as air USING(airconditioningtypeid)
            LEFT JOIN architecturalstyletype as arch USING(architecturalstyletypeid)
            LEFT JOIN buildingclasstype as build USING(buildingclasstypeid)
            LEFT JOIN heatingorsystemtype as heat USING(heatingorsystemtypeid)
            LEFT JOIN propertylandusetype as land USING(propertylandusetypeid)
            LEFT JOIN storytype as story USING(storytypeid)
            LEFT JOIN typeconstructiontype as type USING(typeconstructiontypeid)
            WHERE propertylandusedesc = "Single Family Residential"
            AND transactiondate < '2018'
            AND prop.longitude IS NOT NULL
            AND prop.latitude IS NOT NULL

            '''

    df = pd.read_sql(sql_query, get_connection('zillow'))
    # The query has no ORDER BY, so the row order coming back is not
    # guaranteed. Sort by transaction date (mergesort is stable, so ties keep
    # their query order) so that keep='last' really retains each parcel's
    # most recent transaction instead of an arbitrary one.
    df = df.sort_values('transactiondate', kind='mergesort')
    df = df.drop_duplicates(subset=['parcelid'], keep='last')
    return df

    

In [35]:
def zillow_data():
    '''Return the zillow data, caching it as a CSV file in the working directory.

    If ``zillow.csv`` already exists it is read from disk; otherwise the data
    is pulled from the database with ``new_zillow_data()`` and written to
    ``zillow.csv`` before being returned.

    Returns
    -------
    pandas.DataFrame
        The zillow data, with the same columns and index labels whether it
        came from the database or from the cache file.
    '''
    filename = "zillow.csv"
    if os.path.isfile(filename):
        # to_csv below stores the index as the first (unnamed) column. Read it
        # back as the index so a cached load matches a fresh pull instead of
        # gaining a spurious "Unnamed: 0" data column.
        return pd.read_csv(filename, index_col=0)

    # read data from database into a dataframe
    df = new_zillow_data()
    # cache data for the next run
    df.to_csv(filename)
    return df

In [36]:
# load the zillow data (from zillow.csv when it exists, otherwise from the database)
# and preview the first rows
df= zillow_data()
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,,,3100.0,3100.0,,,,,6059.0,,3.0,2.0,633.0,,,33634931.0,-117869207.0,4506.0,,,,,,122,261.0,,60590630.0,53571.0,1286.0,,96978.0,0.0,,1.0,,,,,1998.0,,,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,,1465.0,1465.0,1465.0,,,1465.0,,6111.0,1.0,1.0,1.0,0.0,,,34449266.0,-119281531.0,12647.0,,,,,,1110,261.0,,61110010.0,13091.0,2061.0,,97099.0,5.0,,,,,,,1967.0,1.0,,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,,,1243.0,1243.0,,,,,6059.0,,2.0,2.0,440.0,,,33886168.0,-117823170.0,8432.0,1.0,,,,1.0,122,261.0,,60590220.0,21412.0,1286.0,,97078.0,6.0,,,,,,,1962.0,1.0,,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,,,2376.0,2376.0,,,,,6037.0,,3.0,,,,2.0,34245180.0,-118240722.0,13038.0,1.0,,,,1.0,101,261.0,LCR110000*,60373000.0,396551.0,3101.0,,96330.0,0.0,,,,1.0,,,1970.0,,,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,781532,12095076,1.0,,,3.0,4.0,,9.0,3.0,,,2962.0,2962.0,,,,,6037.0,,3.0,,,,2.0,34145202.0,-118179824.0,63000.0,1.0,,,,1.0,101,261.0,PSR2,60374610.0,47019.0,3101.0,274684.0,96293.0,0.0,,,,1.0,,,1950.0,,,276684.0,773303.0,2016.0,496619.0,9516.26,,,60374610000000.0,-0.001011,2017-01-01,Central,,,Central,Single Family Residential,,


In [37]:
# shape of the zillow dataframe as (number of rows, number of columns)
df.shape

(52320, 68)

# Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)



In [38]:
# column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52320 entries, 0 to 52440
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            52320 non-null  int64  
 1   parcelid                      52320 non-null  int64  
 2   airconditioningtypeid         13615 non-null  float64
 3   architecturalstyletypeid      70 non-null     float64
 4   basementsqft                  47 non-null     float64
 5   bathroomcnt                   52320 non-null  float64
 6   bedroomcnt                    52320 non-null  float64
 7   buildingclasstypeid           0 non-null      object 
 8   buildingqualitytypeid         33655 non-null  float64
 9   calculatedbathnbr             52185 non-null  float64
 10  decktypeid                    389 non-null    float64
 11  finishedfloor1squarefeet      4371 non-null   float64
 12  calculatedfinishedsquarefeet  52239 non-null  float64
 13  f

In [40]:
# summary stats, transposed so that each described column becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,52320.0,1496906.0,859433.0,349.0,757613.5,1500134.0,2241333.0,2982270.0
parcelid,52320.0,12996820.0,3350919.0,10711860.0,11510180.0,12578290.0,14130360.0,167687800.0
airconditioningtypeid,13615.0,2.439589,3.847925,1.0,1.0,1.0,1.0,13.0
architecturalstyletypeid,70.0,7.1,2.66567,2.0,7.0,7.0,7.0,21.0
basementsqft,47.0,678.9787,711.8252,38.0,263.5,512.0,809.5,3560.0
bathroomcnt,52320.0,2.30001,1.022807,0.0,2.0,2.0,3.0,18.0
bedroomcnt,52320.0,3.300765,0.9475551,0.0,3.0,3.0,4.0,14.0
buildingqualitytypeid,33655.0,6.264894,1.716346,1.0,5.0,6.0,8.0,12.0
calculatedbathnbr,52185.0,2.30527,1.018067,1.0,2.0,2.0,3.0,18.0
decktypeid,389.0,66.0,0.0,66.0,66.0,66.0,66.0,66.0


In [41]:
# value counts for each column in the dataframe
# (prints the column name followed by how often each distinct value occurs)
for col in df.columns:
    print(col)
    print(df[col].value_counts())
         
        

id
1727539    1
2127263    1
590202     1
1019191    1
2163005    1
          ..
2923683    1
876441     1
1183906    1
2159363    1
1187175    1
Name: id, Length: 52320, dtype: int64
parcelid
14297519    1
12895331    1
12680821    1
11839030    1
17148095    1
           ..
17294679    1
14138000    1
12026029    1
14212835    1
12826780    1
Name: parcelid, Length: 52320, dtype: int64
airconditioningtypeid
1.0     11873
13.0     1567
5.0       159
11.0       16
Name: airconditioningtypeid, dtype: int64
architecturalstyletypeid
7.0     62
3.0      3
2.0      2
21.0     2
8.0      1
Name: architecturalstyletypeid, dtype: int64
basementsqft
900.0     2
640.0     2
100.0     2
515.0     2
273.0     2
912.0     2
314.0     1
819.0     1
1809.0    1
604.0     1
126.0     1
588.0     1
786.0     1
669.0     1
800.0     1
396.0     1
112.0     1
645.0     1
384.0     1
1969.0    1
252.0     1
600.0     1
1218.0    1
280.0     1
300.0     1
224.0     1
512.0     1
380.0     1
204.0     1
200

Name: airconditioningdesc, dtype: int64
architecturalstyledesc
Contemporary     62
Cape Cod          3
Bungalow          2
Ranch/Rambler     2
Conventional      1
Name: architecturalstyledesc, dtype: int64
buildingclassdesc
Series([], Name: buildingclassdesc, dtype: int64)
heatingorsystemdesc
Central       20689
Floor/Wall    12527
Forced air      517
Solar            85
None             16
Baseboard         7
Radiant           6
Gravity           2
Yes               1
Name: heatingorsystemdesc, dtype: int64
propertylandusedesc
Single Family Residential    52320
Name: propertylandusedesc, dtype: int64
storydesc
Basement    47
Name: storydesc, dtype: int64
typeconstructiondesc
Frame       75
Concrete     1
Name: typeconstructiondesc, dtype: int64


In [42]:
# boolean null mask (True = value is missing) for the first five rows
df.isnull().head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,False,False,True,True,True,False,False,True,True,False,True,True,False,False,True,True,True,True,False,True,False,False,False,True,True,False,False,False,True,True,True,True,True,False,False,True,False,False,False,True,False,False,True,False,True,True,True,True,False,True,True,False,False,False,False,False,True,True,False,False,False,True,True,True,True,False,True,True
1,False,False,True,True,True,False,False,True,True,False,True,False,False,False,True,True,False,True,False,False,False,False,False,True,True,False,False,False,True,True,True,True,True,False,False,True,False,False,False,True,False,False,True,True,True,True,True,True,False,False,True,False,False,False,False,False,True,True,False,False,False,True,True,True,True,False,True,True
2,False,False,True,True,True,False,False,True,True,False,True,True,False,False,True,True,True,True,False,True,False,False,False,True,True,False,False,False,False,True,True,True,False,False,False,True,False,False,False,True,False,False,True,True,True,True,True,True,False,False,True,False,False,False,False,False,True,True,False,False,False,True,True,True,True,False,True,True
3,False,False,True,True,True,False,False,True,False,False,True,True,False,False,True,True,True,True,False,True,False,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,True,False,False,True,True,True,False,True,True,False,True,True,False,False,False,False,False,True,True,False,False,False,True,True,True,False,False,True,True
4,False,False,False,True,True,False,False,True,False,False,True,True,False,False,True,True,True,True,False,True,False,True,True,True,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,False,False,False,True,True,True,False,True,True,False,True,True,False,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,True


In [43]:
# per-column missing-value summary: number of missing rows and the fraction
# (0-1, not multiplied by 100) of all rows that are missing
null_col = pd.DataFrame({'num_rows_missing': df.isnull().sum(),
                        'pct_rows_missing': (df.isnull().sum()/ df.shape[0])})
null_col.head()

Unnamed: 0,num_rows_missing,pct_rows_missing
id,0,0.0
parcelid,0,0.0
airconditioningtypeid,38705,0.739774
architecturalstyletypeid,52250,0.998662
basementsqft,52273,0.999102


# Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [44]:
def null_counter(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of observations (rows) and attributes (columns).

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with ``num_rows_missing`` (count of null
        rows) and ``pct_rows_missing`` (that count divided by the total
        number of rows, i.e. a 0-1 fraction), sorted from most to fewest
        missing values.
    """
    total_rows = df.shape[0]
    missing_per_column = df.isnull().sum()
    summary = pd.DataFrame(
        {
            'num_rows_missing': missing_per_column,
            'pct_rows_missing': missing_per_column / total_rows,
        }
    )
    return summary.sort_values(by='num_rows_missing', ascending=False)

In [47]:
# missing-value summary for every column, most-missing columns first
null_counter(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
buildingclassdesc,52320,1.000000
buildingclasstypeid,52320,1.000000
finishedsquarefeet15,52320,1.000000
finishedsquarefeet13,52320,1.000000
storytypeid,52273,0.999102
...,...,...
latitude,0,0.000000
fips,0,0.000000
bedroomcnt,0,0.000000
bathroomcnt,0,0.000000


In [48]:
def null_annihilaton(df, prop_required_column, prop_required_row):
    """Drop columns, then rows, that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim. It is NOT modified; a new frame is returned.
    prop_required_column : float
        Proportion (0-1) of rows that must be non-null for a column to be
        kept. A column is dropped when its null fraction exceeds
        ``1 - prop_required_column``.
    prop_required_row : float
        Proportion (0-1) of the remaining columns that must be non-null for
        a row to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the sufficiently populated columns and
        rows.
    """
    prop_null_column = 1 - prop_required_column

    # Fraction of missing values in every column, computed in one pass.
    null_pct = df.isna().sum() / df.shape[0]
    too_sparse = null_pct > prop_null_column

    # Select the surviving columns instead of dropping in place, so the
    # caller's dataframe is left untouched. The positional boolean mask
    # avoids any label alignment.
    trimmed = df.loc[:, (~too_sparse).to_numpy()]

    # Minimum number of non-null values a row needs, based on the columns
    # that survived the step above.
    row_threshold = int(prop_required_row * trimmed.shape[1])

    return trimmed.dropna(axis=0, thresh=row_threshold)
    

In [49]:
# keep only columns that are at least 80% populated, then only rows that are
# at least 80% populated across the remaining columns
df_dropped = null_annihilaton(df, .80,.80)
df_dropped.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,propertylandusedesc
0,1727539,14297519,3.5,4.0,3.5,3100.0,3100.0,6059.0,3.0,33634931.0,-117869207.0,4506.0,122,261.0,60590630.0,53571.0,1286.0,96978.0,0.0,1998.0,485713.0,1023282.0,2016.0,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,Single Family Residential
1,1387261,17052889,1.0,2.0,1.0,1465.0,1465.0,6111.0,1.0,34449266.0,-119281531.0,12647.0,1110,261.0,61110010.0,13091.0,2061.0,97099.0,5.0,1967.0,88000.0,464000.0,2016.0,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,Single Family Residential
2,11677,14186244,2.0,3.0,2.0,1243.0,1243.0,6059.0,2.0,33886168.0,-117823170.0,8432.0,122,261.0,60590220.0,21412.0,1286.0,97078.0,6.0,1962.0,85289.0,564778.0,2016.0,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,Single Family Residential
3,2288172,12177905,3.0,4.0,3.0,2376.0,2376.0,6037.0,3.0,34245180.0,-118240722.0,13038.0,101,261.0,60373000.0,396551.0,3101.0,96330.0,0.0,1970.0,108918.0,145143.0,2016.0,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,Single Family Residential
4,781532,12095076,3.0,4.0,3.0,2962.0,2962.0,6037.0,3.0,34145202.0,-118179824.0,63000.0,101,261.0,60374610.0,47019.0,3101.0,96293.0,0.0,1950.0,276684.0,773303.0,2016.0,496619.0,9516.26,60374610000000.0,-0.001011,2017-01-01,Single Family Residential


In [51]:
# (rows, columns) remaining after the null-based trimming
df_dropped.shape

(52312, 29)

# Mall  customers

In [None]:
# ACQUIRE mall data from server
# query selecting every column and row of the customers table
sql= '''Select * from customers'''


In [None]:
# run the query against the mall_customers database; note that this rebinds
# `df`, which held the zillow data in the cells above
df= pd.read_sql(sql,get_connection('mall_customers'))
df.head()

In [None]:
# summarize: descriptive statistics of the mall customers data
df.describe()

In [None]:
# check nulls: number of missing values in each column
df.isna().sum()

### Outliers using IQR

In [None]:
# age quantiles: first (25%) and third (75%) quartiles of the age column
q1, q3 = df['age'].quantile([.25,.75])
q1, q3

In [None]:
# calculating IQR: the interquartile range of age is Q3 - Q1
age_iqr = q3-q1
age_iqr

In [None]:
# upper bound and lower bound with k = 1.5:
# ages more than 1.5 * IQR above Q3 or below Q1 are treated as outliers
upper = q3 +(age_iqr * 1.5)
lower = q1 - (age_iqr * 1.5)

# displayed as (upper, lower)
upper, lower

In [None]:
# list the customers whose age lies outside the IQR bounds on either side;
# an empty result means the age column has no outliers to handle
df[(df.age > upper) | (df.age < lower)]

In [None]:
# splitting the data into train, test and validate:
# 70% train, and the remaining 30% split in half into test and validate.
# a fixed seed keeps the split reproducible.
seed = 123

train, test_val = train_test_split(df, train_size=0.7,
                                   random_state=seed)

test, val = train_test_split(test_val, train_size=0.5,
                                       random_state=seed)

train.shape, val.shape, test.shape

In [None]:
# encoding categorical columns using pd.get_dummies; drop_first=True drops the
# first level of each encoded column.
# NOTE(review): only `train` is encoded in the cells visible here; val and test
# will need the same encoding before they are used with train.
train =pd.get_dummies(train,drop_first= True)
train.head()

In [None]:
# column names, non-null counts and dtypes of the encoded training set
train.info()

In [None]:
# scale the data: age and annual_income are rescaled with MinMaxScaler.
# the scaler is fit on train only and the scaled values overwrite the
# original columns in train.
scaler= MinMaxScaler()
train[['age','annual_income']]= scaler.fit_transform(train[['age','annual_income']])
train.head()
