Using the Titanic dataset:

In [17]:
#Use the function defined in acquire.py to load the Titanic data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MinMaxScaler


# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import acquire
import env
import prepare

directory = os.getcwd()

titanic_query = 'select * from passengers'
titanic = acquire.get_titanic_data(titanic_query, directory, filename = 'titanic.csv')
titanic.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [11]:
#Use the function defined in prepare.py to prepare the titanic data.
train, val, test = prepare.split_data(prepare.wrangle_data('titanic'), 'titanic')
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
474,0,3,female,22.0,0,0,9.8375,Southampton,1
370,1,1,male,25.0,1,0,55.4417,Cherbourg,0
573,1,3,female,30.0,0,0,7.75,Queenstown,1
110,0,1,male,47.0,0,0,52.0,Southampton,1
167,0,3,female,45.0,1,4,27.9,Southampton,0


In [12]:
#Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. 
#Remove the columns they are replacing. Repeat on validate and test.

train.loc[:, 'is_female'] = (train.sex == 'female').astype(int) #### This is to convert 'sex' to a digit format in the column 'is_female'.
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_female
474,0,3,female,22.0,0,0,9.8375,Southampton,1,1
370,1,1,male,25.0,1,0,55.4417,Cherbourg,0,0
573,1,3,female,30.0,0,0,7.75,Queenstown,1,1
110,0,1,male,47.0,0,0,52.0,Southampton,1,0
167,0,3,female,45.0,1,4,27.9,Southampton,0,1


In [14]:
train[['is_queenstown', 'is_southampton']] = pd.get_dummies(train.embark_town,
               drop_first=True).astype(int).values

#### To convert 'embark_town' to a digit format, I use get_dummies to break each option to its own nominal column.
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_female,is_queenstown,is_southampton
474,0,3,female,22.0,0,0,9.8375,Southampton,1,1,0,1
370,1,1,male,25.0,1,0,55.4417,Cherbourg,0,0,0,0
573,1,3,female,30.0,0,0,7.75,Queenstown,1,1,1,0
110,0,1,male,47.0,0,0,52.0,Southampton,1,0,0,1
167,0,3,female,45.0,1,4,27.9,Southampton,0,1,0,1


In [15]:
continuous_features = ['age', 'fare'] # The values in these columns need to be scaled

In [None]:
# sklearn process:
# make an object
# fit an object
# use an object

In [18]:
# make an object:
scaler = MinMaxScaler()

In [19]:
# fit the thing
# if you are using an sklearn object, make sure you only call fit on train!
scaler.fit(train[continuous_features])

In [20]:
# Use the thing
train[['age_scaled', 'fare_scaled']] = scaler.transform(train[continuous_features])

In [21]:
train.head() ##### The values have been scaled.

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_female,is_queenstown,is_southampton,age_scaled,fare_scaled
474,0,3,female,22.0,0,0,9.8375,Southampton,1,1,0,1,0.303285,0.019202
370,1,1,male,25.0,1,0,55.4417,Cherbourg,0,0,0,0,0.345941,0.108215
573,1,3,female,30.0,0,0,7.75,Queenstown,1,1,1,0,0.417034,0.015127
110,0,1,male,47.0,0,0,52.0,Southampton,1,0,0,1,0.658752,0.101497
167,0,3,female,45.0,1,4,27.9,Southampton,0,1,0,1,0.630314,0.054457


In [23]:
#### Now to drop the columns that are not needed for the Machine Learning algorithms
preprocessed_train = train.drop(columns=['sex', 'age', 'fare', 'embark_town'])
preprocessed_train.head()

Unnamed: 0,survived,pclass,sibsp,parch,alone,is_female,is_queenstown,is_southampton,age_scaled,fare_scaled
474,0,3,0,0,1,1,0,1,0.303285,0.019202
370,1,1,1,0,0,0,0,0,0.345941,0.108215
573,1,3,0,0,1,1,1,0,0.417034,0.015127
110,0,1,0,0,1,0,0,1,0.658752,0.101497
167,0,3,1,4,0,1,0,1,0.630314,0.054457


In [25]:
for df in [train, val, test]:
    df[['is_queenstown', 'is_southampton']] = pd.get_dummies(df.embark_town,
               drop_first=True).astype(int).values
scaler = MinMaxScaler()
scaler.fit(train[continuous_features])
for df in [train, val, test]:
    df[[continuous_features[0]+'_scaled', continuous_features[1]+'_scaled']] = \
    scaler.transform(df[continuous_features])

In [27]:
#Create a function named preprocess_titanic that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

def preprocess_titanic(train, val, test, continuous_features=['age', 'fare']):
    '''
    preprocess titanic will encode any categorical features
    and proceed forward with using a minmax scaler to transform 
    continuous variables.
    
    return: three dataframes, train, validate, and test, preprocessed for modeling
    '''
    # iterate through our three dataframes
    for df in [train, val, test]:
        # assign out the encoded categoricals for embark_town
        # in the same way on all three datasets
        df[['is_queenstown', 'is_southampton']] = pd.get_dummies(df.embark_town,
               drop_first=True).astype(int).values
    # create a single scaler object
    scaler = MinMaxScaler()
    # fit the single scaler just once to train
    scaler.fit(train[continuous_features])
    # apply that transformation to all three data sets
    # using the same syntax, which conconates
    # '_scaled' to each feature name that was fed in
    for df in [train, val, test]:
        df[[continuous_features[0]+'_scaled', continuous_features[1]+'_scaled']] = \
        scaler.transform(df[continuous_features])
    for df in [train, val, test]:
        df['is_female'] = np.where(df['sex'] == 'female', 1, 0)
    preprocessed_dfs = []
    for df in [train, val, test]:
        preprocessed_dfs.append(df.drop(columns=['sex', 'age', 'fare', 'embark_town']))
    return preprocessed_dfs

In [28]:
train, val, test = prepare.split_data(prepare.wrangle_data('titanic'), 'titanic')

In [32]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_queenstown,is_southampton,age_scaled,fare_scaled,is_female
474,0,3,female,22.0,0,0,9.8375,Southampton,1,0,1,0.303285,0.019202,1
370,1,1,male,25.0,1,0,55.4417,Cherbourg,0,0,0,0.345941,0.108215,0
573,1,3,female,30.0,0,0,7.75,Queenstown,1,1,0,0.417034,0.015127,1
110,0,1,male,47.0,0,0,52.0,Southampton,1,0,1,0.658752,0.101497,0
167,0,3,female,45.0,1,4,27.9,Southampton,0,0,1,0.630314,0.054457,1


In [31]:
train_processed, val_processed, test_processed = preprocess_titanic(train, val, test)
train_processed.head()

Unnamed: 0,survived,pclass,sibsp,parch,alone,is_queenstown,is_southampton,age_scaled,fare_scaled,is_female
474,0,3,0,0,1,0,1,0.303285,0.019202,1
370,1,1,1,0,0,0,0,0.345941,0.108215,0
573,1,3,0,0,1,1,0,0.417034,0.015127,1
110,0,1,0,0,1,0,1,0.658752,0.101497,0
167,0,3,1,4,0,0,1,0.630314,0.054457,1


Using the Telco dataset

In [34]:
#Use the function defined in acquire.py to load the Telco data.
import acquire
from acquire import telco_query, directory

telco = acquire.get_telco_data(telco_query, directory)
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [35]:
#Use the function defined in prepare.py to prepare the Telco data.

train, val, test = prepare.split_data(prepare.wrangle_data('telco'), 'telco')
train.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,No,No,Yes,Yes,No,99.0,5969.3,No,One year,Fiber optic,Electronic check
0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,No,Yes,Yes,No,No,70.75,2921.75,No,One year,DSL,Credit card (automatic)
0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,No,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)
4581-LNWUM,Female,0,No,No,13,No,No phone service,No,No,Yes,No,Yes,Yes,No,49.15,649.4,No,Month-to-month,DSL,Electronic check
6297-NOOPG,Female,0,Yes,No,70,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,110.5,7752.05,No,Two year,Fiber optic,Electronic check


In [36]:
#Encode the categorical columns on train.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3943 entries, 9705-IOVQQ to 3318-NMQXL
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 3943 non-null   object 
 1   senior_citizen         3943 non-null   int64  
 2   partner                3943 non-null   object 
 3   dependents             3943 non-null   object 
 4   tenure                 3943 non-null   int64  
 5   phone_service          3943 non-null   object 
 6   multiple_lines         3943 non-null   object 
 7   online_security        3943 non-null   object 
 8   online_backup          3943 non-null   object 
 9   device_protection      3943 non-null   object 
 10  tech_support           3943 non-null   object 
 11  streaming_tv           3943 non-null   object 
 12  streaming_movies       3943 non-null   object 
 13  paperless_billing      3943 non-null   object 
 14  monthly_charges        3943 non-null   float64

In [58]:
cat_cols, num_cols = [], []
for col in train.columns:
    if train[col].dtype == 'O':
        cat_cols.append(col)
    else:
        if train[col].nunique() < 10:
            cat_cols.append(col)
        else:
            num_cols.append(col)

In [63]:
train.head(3)

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,is_female
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,No,...,Yes,Yes,No,99.0,5969.3,No,One year,Fiber optic,Electronic check,0
0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,No,...,Yes,No,No,70.75,2921.75,No,One year,DSL,Credit card (automatic),0
0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,No,...,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic),0


In [59]:
cat_cols

['gender',
 'senior_citizen',
 'partner',
 'dependents',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'churn',
 'contract_type',
 'internet_service_type',
 'payment_type']

In [70]:
train.loc[:, 'is_female'] = (train.gender == 'female').astype(int) #### This is to convert 'sex' to a digit format in the column 'is_female'.
train.loc[:, 'is_partner'] = (train.partner == 'Yes').astype(int)
train.loc[:, 'has_dependents'] = (train.dependents == 'Yes').astype(int)
train.loc[:, 'has_phone_service'] = (train.phone_service == 'Yes').astype(int)
train.loc[:, 'has_multiple_lines'] = (train.multiple_lines == 'Yes').astype(int)
train.loc[:, 'has_online_security'] = (train.online_security == 'Yes').astype(int)
train.loc[:, 'has_online_backup'] = (train.online_backup == 'Yes').astype(int)
train.loc[:, 'has_device_protection'] = (train.device_protection == 'Yes').astype(int)
train.loc[:, 'has_tech_support'] = (train.tech_support == 'Yes').astype(int)
train.loc[:, 'has_streaming_tv'] = (train.streaming_tv == 'Yes').astype(int)
train.loc[:, 'has_streaming_movies'] = (train.streaming_movies == 'Yes').astype(int)
train.loc[:, 'has_paperless_billing'] = (train.paperless_billing == 'Yes').astype(int) 
train.loc[:, 'has_churn'] = (train.churn == 'Yes').astype(int) 
train.loc[:, 'contract_types'] = (train.contract_type == 'One year').astype(int) 
train.loc[:, 'internet_service_types'] = (train.internet_service_type == 'DSL').astype(int) 
train.loc[:, 'payment_types'] = (train.payment_type == 'Electronic check').astype(int)

train.head(3)

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,has_online_backup,has_device_protection,has_tech_support,has_streaming_tv,has_streaming_movies,has_paperless_billing,has_churn,contract_types,internet_service_types,payment_types
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,No,...,1,0,0,1,1,0,0,1,0,1
0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,No,...,1,0,1,1,0,0,0,1,1,0
0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,No,...,0,0,0,0,0,0,1,0,1,0


In [69]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3943 entries, 9705-IOVQQ to 3318-NMQXL
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  3943 non-null   object 
 1   senior_citizen          3943 non-null   int64  
 2   partner                 3943 non-null   object 
 3   dependents              3943 non-null   object 
 4   tenure                  3943 non-null   int64  
 5   phone_service           3943 non-null   object 
 6   multiple_lines          3943 non-null   object 
 7   online_security         3943 non-null   object 
 8   online_backup           3943 non-null   object 
 9   device_protection       3943 non-null   object 
 10  tech_support            3943 non-null   object 
 11  streaming_tv            3943 non-null   object 
 12  streaming_movies        3943 non-null   object 
 13  paperless_billing       3943 non-null   object 
 14  monthly_charges         3943 n

Unnamed: 0_level_0,senior_citizen,tenure,monthly_charges,total_charges,is_female,is_partner,has_dependents,has_phone_service,has_multiple_lines,has_online_security,has_online_backup,has_device_protection,has_tech_support,has_streaming_tv,has_streaming_movies,has_paperless_billing,has_churn,contract_types,internet_service_types,payment_types
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9705-IOVQQ,1,61,99.0,5969.3,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1
0635-WKOLD,0,40,70.75,2921.75,0,1,0,1,1,0,1,0,1,1,0,0,0,1,1,0
0032-PGELS,0,1,30.5,30.5,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0
4581-LNWUM,0,13,49.15,649.4,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,1
6297-NOOPG,0,70,110.5,7752.05,0,1,0,1,1,0,1,1,1,1,1,1,0,0,0,1


In [None]:
#### tenure, monthly_charges, total_charges ALL need to be scaled

In [84]:
continuous_features = ['tenure', 'monthly_charges', 'total_charges']
#### Make an object
scaler = MinMaxScaler()
#### Fit the object
scaler.fit(train[continuous_features])
#### Use the object
train[['tenure_scaled', 'monthly_charges_scaled', 'total_charges_scaled']] = scaler.transform(train[continuous_features])
train.head()



Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,has_streaming_tv,has_streaming_movies,has_paperless_billing,has_churn,contract_types,internet_service_types,payment_types,tenure_scaled,monthly_charges_scaled,total_charges_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,No,...,1,1,0,0,1,0,1,0.847222,0.803483,0.071021
0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,No,...,1,0,0,0,1,1,0,0.555556,0.522388,0.034762
0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,No,...,0,0,0,1,0,1,0,0.013889,0.121891,0.000363
4581-LNWUM,Female,0,No,No,13,No,No phone service,No,No,Yes,...,1,1,0,0,0,1,1,0.180556,0.307463,0.007726
6297-NOOPG,Female,0,Yes,No,70,Yes,Yes,No,Yes,Yes,...,1,1,1,0,0,0,1,0.972222,0.91791,0.092231


In [85]:
preprocessed_train = train.drop(columns=['tenure', 'monthly_charges', 'total_charges','gender', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn', 'contract_type', 'internet_service_type', 'payment_type'])
preprocessed_train.head()

Unnamed: 0_level_0,senior_citizen,is_female,is_partner,has_dependents,has_phone_service,has_multiple_lines,has_online_security,has_online_backup,has_device_protection,has_tech_support,has_streaming_tv,has_streaming_movies,has_paperless_billing,has_churn,contract_types,internet_service_types,payment_types,tenure_scaled,monthly_charges_scaled,total_charges_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9705-IOVQQ,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0.847222,0.803483,0.071021
0635-WKOLD,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,1,0,0.555556,0.522388,0.034762
0032-PGELS,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0.013889,0.121891,0.000363
4581-LNWUM,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,1,0.180556,0.307463,0.007726
6297-NOOPG,0,0,1,0,1,1,0,1,1,1,1,1,1,0,0,0,1,0.972222,0.91791,0.092231


In [117]:
#Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.

def preprocess_telco(train, val, test, continuous_features=['tenure', 'monthly_charges', 'total_charges']):
    '''
    preprocess telco will encode any categorical features
    and proceed forward with using a minmax scaler to transform 
    continuous variables.
    
    return: three dataframes, train, validate, and test, preprocessed for modeling
    '''
    # iterate through our three dataframes
    for df in [train, val, test]:
        # in the same way on all three datasets
        df['is_female'] = (train.gender == 'female').astype(int) 
        df['is_partner'] = (train.partner == 'Yes').astype(int)
        df['has_dependents'] = (train.dependents == 'Yes').astype(int)
        df['has_phone_service'] = (train.phone_service == 'Yes').astype(int)
        df['has_multiple_lines'] = (train.multiple_lines == 'Yes').astype(int)
        df['has_online_security'] = (train.online_security == 'Yes').astype(int)
        df['has_online_backup'] = (train.online_backup == 'Yes').astype(int)
        df['has_device_protection'] = (train.device_protection == 'Yes').astype(int)
        df['has_tech_support'] = (train.tech_support == 'Yes').astype(int)
        df['has_streaming_tv'] = (train.streaming_tv == 'Yes').astype(int)
        df['has_streaming_movies'] = (train.streaming_movies == 'Yes').astype(int)
        df['has_paperless_billing'] = (train.paperless_billing == 'Yes').astype(int)
        df['has_churn'] = (train.churn == 'Yes').astype(int) 
        df['contract_types'] = (train.contract_type == 'One year').astype(int)
        df['internet_service_types'] = (train.internet_service_type == 'DSL').astype(int) 
        df['payment_types'] = (train.payment_type == 'Electronic check').astype(int)
       
    

    continuous_features = ['tenure', 'monthly_charges', 'total_charges']    
    # create a single scaler object
    scaler = MinMaxScaler()
    # fit the single scaler just once to train
    scaler.fit(train[continuous_features])
    # apply that transformation to all three data sets
    # using the same syntax, which conconates
    # '_scaled' to each feature name that was fed in
    for df in [train, val, test]:
        
        scaler.transform(df[continuous_features]) 
    preprocessed_dfs = []
    for df in [train, val, test]:
        preprocessed_dfs.append(df.drop(columns=['tenure', 'monthly_charges', 'total_charges','gender', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn', 'contract_type', 'internet_service_type', 'payment_type']))
    return preprocessed_dfs

In [120]:
train_processed, val_processed, test_processed = preprocess_telco(train, val, test)
train_processed.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3943 entries, 9705-IOVQQ to 3318-NMQXL
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   senior_citizen          3943 non-null   int64  
 1   is_female               3943 non-null   int64  
 2   is_partner              3943 non-null   int64  
 3   has_dependents          3943 non-null   int64  
 4   has_phone_service       3943 non-null   int64  
 5   has_multiple_lines      3943 non-null   int64  
 6   has_online_security     3943 non-null   int64  
 7   has_online_backup       3943 non-null   int64  
 8   has_device_protection   3943 non-null   int64  
 9   has_tech_support        3943 non-null   int64  
 10  has_streaming_tv        3943 non-null   int64  
 11  has_streaming_movies    3943 non-null   int64  
 12  has_paperless_billing   3943 non-null   int64  
 13  has_churn               3943 non-null   int64  
 14  contract_types          3943 n