### 1) Acquire data from mySQL using the python module to connect and query. You will want to end with a single dataframe. Make sure to include: the logerror, all fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.

- Only include properties with a transaction in 2017, and include only the last transaction for each properity (so no duplicate property ID's), along with zestimate error and date of transaction.

- Only include properties that include a latitude and longitude value.

In [68]:
# imports
from env import host, user, password

import pandas as pd
import numpy as np
import os
import sklearn

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
# creates sql string for connection to data science database
def get_connection(db, user=user, host=host, password=password):
    """
    Function creates a URL that can be used to connect to the data science database.
    """
    # return string to access database
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

# retrieves data for exercises
def all_2017_zillow_data():
    '''
    This function retrieves data from the zillow codeup data science database and returns it as a dataframe.
    '''
    # create sql string to join tables in sql and only keep rows where both latitude and longitude are not null
    sql_string = '''
                select * from properties_2017
                join predictions_2017 using (parcelid)
                left join airconditioningtype using (airconditioningtypeid)
                left join architecturalstyletype using (architecturalstyletypeid)
                left join buildingclasstype using (buildingclasstypeid)
                left join heatingorsystemtype using (heatingorsystemtypeid)
                left join propertylandusetype using (propertylandusetypeid)
                left join storytype using (storytypeid)
                left join typeconstructiontype using (typeconstructiontypeid)
                left join unique_properties using (parcelid)
                where latitude is not null and longitude is not null;
                '''
    # convert data into data frame
    df = pd.read_sql(sql_string, get_connection('zillow'))
    # return data frame
    return df

In [25]:
# using function to create DF
df = all_2017_zillow_data()

# previewing data
df

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,1,0.055619,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,261.0,,,,,11677,,...,2,0.005383,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,261.0,2.0,,,,2288172,,...,3,-0.103410,2017-01-01,,,,Central,Single Family Residential,,
4,10887214,,,266.0,2.0,,,1.0,1970746,,...,4,0.006940,2017-01-01,Central,,,Central,Condominium,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77575,11000655,,,261.0,2.0,,,,673515,,...,77609,0.020615,2017-09-20,,,,Central,Single Family Residential,,
77576,17239384,,,261.0,,,,,2968375,,...,77610,0.013209,2017-09-21,,,,,Single Family Residential,,
77577,12773139,,,261.0,2.0,,,1.0,1843709,,...,77611,0.037129,2017-09-21,Central,,,Central,Single Family Residential,,
77578,12826780,,,261.0,2.0,,,,1187175,,...,77612,0.007204,2017-09-25,,,,Central,Single Family Residential,,


### 2) Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [4]:
# summary stats
df.describe()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,id.1,logerror
count,77580.0,223.0,50.0,77580.0,49572.0,15.0,207.0,25007.0,77580.0,50.0,...,172.0,77465.0,77579.0,77580.0,77578.0,77575.0,2900.0,77333.0,77580.0,77580.0
mean,13008280.0,6.040359,7.0,261.824465,3.921811,3.933333,7.386473,1.812013,1495404.0,679.72,...,1.0,189279.6,490147.6,2016.0,301150.0,5995.927626,14.088276,60496670000000.0,38806.723795,0.0168
std,3519376.0,0.556035,0.0,5.141564,3.59477,0.258199,2.72803,2.965768,860970.0,689.703546,...,0.0,230409.5,653794.2,0.0,492721.9,7628.81649,2.181281,1533329000000.0,22403.756329,0.170739
min,10711860.0,4.0,7.0,31.0,1.0,3.0,2.0,1.0,349.0,38.0,...,1.0,44.0,1000.0,2016.0,161.0,19.92,3.0,60371010000000.0,0.0,-4.65542
25%,11538200.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,752143.0,273.0,...,1.0,84171.0,206899.0,2016.0,85293.25,2712.65,14.0,60373110000000.0,19404.75,-0.02431
50%,12530560.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,1498256.0,515.0,...,1.0,136402.0,358878.0,2016.0,203181.0,4448.23,15.0,60376030000000.0,38804.5,0.006675
75%,14211350.0,6.0,7.0,266.0,7.0,4.0,7.0,1.0,2240950.0,796.5,...,1.0,218734.0,569000.0,2016.0,366739.8,6926.885,15.0,60590420000000.0,58208.25,0.039291
max,167689300.0,13.0,7.0,275.0,24.0,4.0,21.0,13.0,2982274.0,3560.0,...,1.0,11421790.0,49061240.0,2016.0,48952200.0,586639.3,99.0,483030100000000.0,77613.0,5.262999


In [5]:
# non-null value counts, good way to see roughly how many rows have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77580 entries, 0 to 77579
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77580 non-null  int64  
 1   typeconstructiontypeid        223 non-null    float64
 2   storytypeid                   50 non-null     float64
 3   propertylandusetypeid         77580 non-null  float64
 4   heatingorsystemtypeid         49572 non-null  float64
 5   buildingclasstypeid           15 non-null     float64
 6   architecturalstyletypeid      207 non-null    float64
 7   airconditioningtypeid         25007 non-null  float64
 8   id                            77580 non-null  int64  
 9   basementsqft                  50 non-null     float64
 10  bathroomcnt                   77580 non-null  float64
 11  bedroomcnt                    77580 non-null  float64
 12  buildingqualitytypeid         49810 non-null  float64
 13  c

In [7]:
# examining data types
df.dtypes

parcelid                    int64
typeconstructiontypeid    float64
storytypeid               float64
propertylandusetypeid     float64
heatingorsystemtypeid     float64
                           ...   
buildingclassdesc          object
heatingorsystemdesc        object
propertylandusedesc        object
storydesc                  object
typeconstructiondesc       object
Length: 69, dtype: object

In [9]:
# number of rows and columns of our df
df.shape

(77580, 69)

In [17]:
# shows how many unique values are in each column
df.nunique()

parcelid                  77381
typeconstructiontypeid        4
storytypeid                   1
propertylandusetypeid        13
heatingorsystemtypeid        10
                          ...  
buildingclassdesc             2
heatingorsystemdesc          10
propertylandusedesc          13
storydesc                     1
typeconstructiondesc          4
Length: 69, dtype: int64

### 3) Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [30]:
# function creates df which holds each column of original df as a row
# the new df holds the number of missing rows and percent of missing rows in each variable as columns
def missing_rows(df):
    # taking sum of missing rows for each variable, multiplying by 100 then dividing by total 
    # number of rows in original df to find % of missing rows 
    missing_row_percent = df.isnull().sum() * 100 / len(df)
    # count number of missing values for each variable and sum for each
    missing_row_raw = df.isnull().sum()
    # creating df using series' created by 2 previous variables
    missing_df = pd.DataFrame({'num_rows_missing' : missing_row_raw, 'pct_rows_missing': missing_row_percent})
    # return df
    return missing_df

# passing original df to new function
missing_rows(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
parcelid,0,0.000000
typeconstructiontypeid,77357,99.712555
storytypeid,77530,99.935550
propertylandusetypeid,0,0.000000
heatingorsystemtypeid,28008,36.102088
...,...,...
buildingclassdesc,77565,99.980665
heatingorsystemdesc,28008,36.102088
propertylandusedesc,0,0.000000
storydesc,77530,99.935550


Missing values will be handled on a case by case basis.
- if whole column is missing, drop the column
- if small % missing < 5%, drop
- if large percent missing =>5%, fill with mean (we're going to remove or transform outliers so we aren't worried about the average being offset too heavily by outliers)

### 4) Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [73]:
# creating function that will create df to store num of missing cols and percent of missing cols in a passed DF
def missing_cols(df):
    # df.isna() displays original df frame with true or false for each value as to whether the value is null
    # .any() creates a series and for each column, shows true if there are any nulls within it (since its being used on the .isna DF)
    # df.loc[ : ] ~~~~~~~~ .count() means we're looking at every row to count the number of null values in each row 
    # using the rest of the code explained above
    num_cols_missing = df.loc[:, df.isna().any()].count()
    # dividing the counts above by the length of the index of the dataframe (ie. the number of rows)
    # to get the percent of missing rows and then rounding to 3 decimal places
    pct_cols_missing = round(num_cols_missing / len(df.index),3)
    # creating dataframe using series' from above
    missing_cols_df = pd.DataFrame({'num_cols_missing': num_cols_missing, 'pct_cols_missing': pct_cols_missing})
    # returning DF
    return missing_cols_df

# previewing df
missing_cols(df).head()

Unnamed: 0,num_cols_missing,pct_cols_missing
typeconstructiontypeid,223,0.003
storytypeid,50,0.001
heatingorsystemtypeid,49572,0.639
buildingclasstypeid,15,0.0
architecturalstyletypeid,207,0.003


## PREPARE

### 5) Remove any properties that are likely to be something other than single unit properties. (e.g. no duplexes, no land/lot, ...). There are multiple ways to estimate that a property is a single unit, and there is not a single "right" answer.  But for this exercise, do not purely filter by unitcnt as we did previously. Add some new logic that will reduce the number of properties that are falsely removed. You might want to use # bedrooms, square feet, unit type or the like to then identify those with unitcnt not defined.

We can use the property land use codes to filter for only single unit properties.
The list below contains all of the land use codes that could be considered single unit properties.

- 261 Single Family Residential
- 263 Mobile Home
- 264 Townhouse
- 266 Condominium
- 273 Bungalow
- 275 Manufactured, Modular, Prefabricated Homes
- 276 Patio Home
- 279 Inferred Single Family Residential

In [17]:
# using | (or) to filter properties based on land use type id
# filtering in any properties that have a single unit land type code
def get_single_units(df):
    singles = df[(df.propertylandusetypeid == 261) | (df.propertylandusetypeid == 263) | (df.propertylandusetypeid == 264) \
                 | (df.propertylandusetypeid == 266) | (df.propertylandusetypeid == 273) | (df.propertylandusetypeid == 275) \
                 | (df.propertylandusetypeid == 276) | (df.propertylandusetypeid == 279)]
    return singles

In [18]:
# previewing df
get_single_units(df).head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,1,0.055619,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,261.0,,,,,11677,,...,2,0.005383,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,261.0,2.0,,,,2288172,,...,3,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,10887214,,,266.0,2.0,,,1.0,1970746,,...,4,0.00694,2017-01-01,Central,,,Central,Condominium,,


## 6) Create a function that will drop rows or columns based on the percent of values that are missing: handle_missing_values(df, prop_required_column, prop_required_row).

- The input:
    - A dataframe
    - A number between 0 and 1 that represents the proportion, for each column, of rows with non-missing values required to keep the column. i.e. if prop_required_column = .6, then you are requiring a column to have at least 60% of values not-NA (no more than 40% missing).
    - A number between 0 and 1 that represents the proportion, for each row, of columns/variables with non-missing values required to keep the row. For example, if prop_required_row = .75, then you are requiring a row to have at least 75% of variables with a non-missing value (no more that 25% missing).

- The output:
    - The dataframe with the columns and rows dropped as indicated. Be sure to drop the columns prior to the rows in your function.
    - hint:
    - Look up the dropna documentation.
    - You will want to compute a threshold from your input values (prop_required) and total number of rows or columns.
    - Make use of inplace, i.e. inplace=True/False.

- Decide how to handle the remaining missing values:
    - Fill with constant value.
    - Impute with mean, median, mode.
    - Drop row/column

In [26]:
# creating function that takes 3 arguments: 
# dataframe, % of a column that must be non-null for it be accepted, % of each row that must be non-null for it to be accepted
def handle_missing_values(df, prop_column, prop_row):
    threshold = int(round(prop_column*len(df.index),0))
    df.dropna(axis=1, thresh = threshold, inplace=True)
    threshold = int(round(prop_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df

In [28]:
# running function
handle_missing_values(df, .65, .70)

Unnamed: 0,parcelid,propertylandusetypeid,id,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,id.1,logerror,transactiondate,propertylandusedesc
0,14297519,261.0,1727539,3.5,4.0,3.5,3100.0,3100.0,6059.0,3.0,...,485713.0,1023282.0,2016.0,537569.0,11013.72,6.059063e+13,0,0.025595,2017-01-01,Single Family Residential
1,17052889,261.0,1387261,1.0,2.0,1.0,1465.0,1465.0,6111.0,1.0,...,88000.0,464000.0,2016.0,376000.0,5672.48,6.111001e+13,1,0.055619,2017-01-01,Single Family Residential
2,14186244,261.0,11677,2.0,3.0,2.0,1243.0,1243.0,6059.0,2.0,...,85289.0,564778.0,2016.0,479489.0,6488.30,6.059022e+13,2,0.005383,2017-01-01,Single Family Residential
3,12177905,261.0,2288172,3.0,4.0,3.0,2376.0,2376.0,6037.0,3.0,...,108918.0,145143.0,2016.0,36225.0,1777.51,6.037300e+13,3,-0.103410,2017-01-01,Single Family Residential
4,10887214,266.0,1970746,3.0,3.0,3.0,1312.0,1312.0,6037.0,3.0,...,73681.0,119407.0,2016.0,45726.0,1533.89,6.037124e+13,4,0.006940,2017-01-01,Condominium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77575,11000655,261.0,673515,2.0,2.0,2.0,1286.0,1286.0,6037.0,2.0,...,70917.0,354621.0,2016.0,283704.0,4478.43,6.037101e+13,77609,0.020615,2017-09-20,Single Family Residential
77576,17239384,261.0,2968375,2.0,4.0,2.0,1612.0,1612.0,6111.0,2.0,...,50683.0,67205.0,2016.0,16522.0,1107.48,6.111008e+13,77610,0.013209,2017-09-21,Single Family Residential
77577,12773139,261.0,1843709,1.0,3.0,1.0,1032.0,1032.0,6037.0,1.0,...,32797.0,49546.0,2016.0,16749.0,876.43,6.037434e+13,77611,0.037129,2017-09-21,Single Family Residential
77578,12826780,261.0,1187175,2.0,3.0,2.0,1762.0,1762.0,6037.0,2.0,...,140000.0,522000.0,2016.0,382000.0,6317.15,6.037503e+13,77612,0.007204,2017-09-25,Single Family Residential


### 6) wrangle_zillow.py

- Functions of the work above needed to acquire and prepare a new sample of data.

In [29]:
print('all functions added to wrangle_zillow.py file')

all functions added to wrangle_zillow.py file


# Mall Customers

## notebook

### Acquire data from mall_customers.customers in mysql database.

In [80]:
# creating function to acquire mall data
def new_mall_data():
    '''This function reads the mall customer data from the Codeup db into a df, writes it to a csv file and returns the df'''
    sql_query = 'SELECT * FROM customers'
    df = pd.read_sql(sql_query, get_connection('mall_customers'))
    df.to_csv('mall_customers_df.csv')
    return df

# storing mall data as variable
df = new_mall_data()

# previewing data
df.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


### Summarize data (include distributions and descriptive statistics).

In [8]:
# using describe to get descriptive stats
df.describe()

Unnamed: 0,customer_id,age,annual_income,spending_score
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [9]:
# using dtypes to check data type of each column
df.dtypes

customer_id        int64
gender            object
age                int64
annual_income      int64
spending_score     int64
dtype: object

In [10]:
# checking for null values, all non-null counts match total rows so there are no nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     200 non-null    int64 
 1   gender          200 non-null    object
 2   age             200 non-null    int64 
 3   annual_income   200 non-null    int64 
 4   spending_score  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [12]:
# checking number of unique values in each column
df.nunique()

customer_id       200
gender              2
age                51
annual_income      64
spending_score     84
dtype: int64

In [13]:
# checking number of rows and columns
df.shape

(200, 5)

### Detect outliers using IQR.

In [5]:
def get_upper_outliers(s, k):
    '''
    Function takes in a series (s) and cuttoff value (k). 
    If a value in the series is an outlier, it returns a number that represents how far above the value is from the upper bound
    or 0 if the number is not an outlier.
    '''
    # creating 2 variables that represent the 1st and 3rd quantile of the given series
    q1, q3 = s.quantile([.25, .75])
    # calculating IQR
    iqr = q3 - q1
    # calculating upper bound
    upper_bound = q3 + k * iqr
    # returning series described in doc string
    return s.apply(lambda x: max([x - upper_bound, 0]))

In [None]:
# storing columns to be checked for upper outliers
age = df.age
annual_income = df.annual_income
spending_score = df.spending_score

In [24]:
# checking for outliers in age column
(get_upper_outliers(age, 1.5)).sum()

0

In [25]:
# checking for outliers in annual income column
(get_upper_outliers(annual_income, 1.5)).sum()

8.5

In [27]:
# # checking for spending_score in age column
(get_upper_outliers(spending_score, 1.5)).sum()

0

In [28]:
print(f'Using our outliers function, we can confirm that only the annual_income column has outliers.')

Using our outliers function, we can confirm that only the annual_income column has outliers.


### Split data (train, validate, and test split).

In [56]:
def prep_mall_data(df, target):
    """
    This function accepts a dataframe and returns it split into 3 appropriately proportioned DFs for training, validating, and testing purposes.
    """
    # splitting data
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=123)
    
    # specifying which columns to keep in outputted dataframe
    # x = features | y = target variable
    X_train = train.drop(columns=[target])
    y_train = train[[target]]
    
    X_validate = validate.drop(columns=[target])
    y_validate = validate[[target]]
    
    X_test = test.drop(columns=[target])
    y_test = test[[target]]
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [59]:
# using function to split data
X_train, y_train, X_validate, y_validate, X_test, y_test = prep_mall_data(df, 'spending_score')

# previewing data
X_train.head()

Unnamed: 0,customer_id,gender,age,annual_income,male
123,124,Male,39,69,1
76,77,Female,45,54,0
171,172,Male,28,87,1
10,11,Male,67,19,1
35,36,Female,21,33,0


### Encode categorical columns using a one hot encoder.

In [48]:
def encode_gender(df):
    """
    Function accepts a DF with a column named "gender", and returns the dataframe with a new column that is an encoded version of the gender column.
    1 = Male | 0 = Female
    """
    # creating label encoder object
    label_encoder = LabelEncoder()
    # fitting object to gender column
    gender_encoded = label_encoder.fit_transform(df.gender)
    # adding column "male" to passed df and storing encoded gender values in it
    df['male'] = gender_encoded
    # returning df
    return df

In [49]:
# testing gender encoding function on df
encode_gender(df)

Unnamed: 0,customer_id,gender,age,annual_income,spending_score,male
0,1,Male,19,15,39,1
1,2,Male,21,15,81,1
2,3,Female,20,16,6,0
3,4,Female,23,16,77,0
4,5,Female,31,17,40,0
...,...,...,...,...,...,...
195,196,Female,35,120,79,0
196,197,Female,45,126,28,0
197,198,Male,32,126,74,1
198,199,Male,32,137,18,1


### Handles missing values.

In [79]:
def handle_missing_values(df, prop_required_column = .4, prop_required_row = .6):
    '''
    Function accepts 3 values: dataframe, and two separate values from 0 - 1
    The passed dataframe will have rows and columns removed based on the amount of null values in each
    The first numeric value passed specifies what % of each columns values must be non-null to avoid being dropped
    The second numeric value passed specifies what % of each row values must be non-null to avoid being dropped
    '''
    # dropping columns based on % of missing values
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    # dropping rows based on % of missing values
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    # returning updated df
    return df

In [52]:
# testing missing value handler function
handle_missing_values(df)

Unnamed: 0,customer_id,gender,age,annual_income,spending_score,male
0,1,Male,19,15,39,1
1,2,Male,21,15,81,1
2,3,Female,20,16,6,0
3,4,Female,23,16,77,0
4,5,Female,31,17,40,0
...,...,...,...,...,...,...
195,196,Female,35,120,79,0
196,197,Female,45,126,28,0
197,198,Male,32,126,74,1
198,199,Male,32,137,18,1


### Scaling

In [87]:
# creating function to scale mall data
def mall_scaler(X_train, X_validate, X_test):
    """
    Function accepts 3 mall dataframes output by prep_mall_data function and returns them with the age and annual income columns scaled.
    Spending score is not scaled because it is our target variable.
    """
    # creating scaler object
    scaler = sklearn.preprocessing.MinMaxScaler()
    
    # fitting scaler to x train 
    scaler.fit(X_train[['age', 'annual_income']])

    # scaling data and saving to new dataframes
    X_train_scaled = pd.DataFrame(scaler.transform(X_train[['age', 'annual_income']]))
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate[['age', 'annual_income']]))
    X_test_scaled = pd.DataFrame(scaler.transform(X_test[['age', 'annual_income']]))
    
    # renaming columns in new dataframes as they were given integers as names
    X_train_scaled.rename(columns = {0: 'age', 1: 'annual_income'}, inplace=True)
    X_validate_scaled.rename(columns = {0: 'age', 1: 'annual_income'}, inplace=True)
    X_test_scaled.rename(columns = {0: 'age', 1: 'annual_income'}, inplace=True)
    
    # returning new scaled dataframes
    return X_train_scaled, X_validate_scaled, X_test_scaled

In [88]:
# using function to created scaled dataframes
X_train_scaled, X_validate_scaled, X_test_scaled = mall_scaler(X_train, X_validate, X_test)

In [90]:
# previewing data
X_train_scaled.head()

Unnamed: 0,age,annual_income
0,0.403846,0.442623
1,0.519231,0.319672
2,0.192308,0.590164
3,0.942308,0.032787
4,0.057692,0.147541


## wrangle_mall.py

- Acquire data from mall_customers.customers in mysql database.
- Split the data into train, validate, and split
- One-hot-encoding
- Missing values
- Scaling

In [50]:
print(f'All of the prep processes listed above have been turned into functions and stored in a wrangle_mall file.')

All of the prep processes listed above have been turned into functions and stored in a wrangle_mall file.
