In [27]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import scipy as sp
from pydataset import data
from env import get_db_url, user, password, host
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

import acquire as acq
import prepare as pre
import os
directory = os.getcwd()

## Using the Titanic dataset

#### 1. Use the function defined in `acquire.py` to load the Titanic data.

#### 2. Use the function defined in `prepare.py` to prepare the titanic data.

In [2]:
titanic = acq.get_titanic_data()
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
titanic = acq.prep_titanic(acq.get_titanic_data())
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1
3,1,1,female,35.0,1,0,53.1,Southampton,0
4,0,3,male,35.0,0,0,8.05,Southampton,1


In [4]:
train, val, test = acq.split_data(titanic,'titanic')
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
829,1,1,female,62.0,0,0,80.0,Southampton,1
463,0,2,male,48.0,0,0,13.0,Southampton,1
228,0,2,male,18.0,0,0,13.0,Southampton,1
374,0,3,female,3.0,3,1,21.075,Southampton,0
494,0,3,male,21.0,0,0,8.05,Southampton,1


#### 3. Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.

In [5]:
train.loc[:, 'is_female'] = (train.sex == 'female').astype(int)

In [6]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_female
829,1,1,female,62.0,0,0,80.0,Southampton,1,1
463,0,2,male,48.0,0,0,13.0,Southampton,1,0
228,0,2,male,18.0,0,0,13.0,Southampton,1,0
374,0,3,female,3.0,3,1,21.075,Southampton,0,1
494,0,3,male,21.0,0,0,8.05,Southampton,1,0


In [7]:
train[['is_queenstown', 'is_southampton']] = pd.get_dummies(train.embark_town,
               drop_first=True).astype(int).values
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_female,is_queenstown,is_southampton
829,1,1,female,62.0,0,0,80.0,Southampton,1,1,0,1
463,0,2,male,48.0,0,0,13.0,Southampton,1,0,0,1
228,0,2,male,18.0,0,0,13.0,Southampton,1,0,0,1
374,0,3,female,3.0,3,1,21.075,Southampton,0,1,0,1
494,0,3,male,21.0,0,0,8.05,Southampton,1,0,0,1


In [8]:
train = train.drop(columns = 'embark_town')

In [9]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,is_female,is_queenstown,is_southampton
829,1,1,female,62.0,0,0,80.0,1,1,0,1
463,0,2,male,48.0,0,0,13.0,1,0,0,1
228,0,2,male,18.0,0,0,13.0,1,0,0,1
374,0,3,female,3.0,3,1,21.075,0,1,0,1
494,0,3,male,21.0,0,0,8.05,1,0,0,1


In [13]:
continuous_features = ['age', 'fare_x_pass']
scaler = MinMaxScaler()

#### 4. Create a function named `preprocess_titanic` that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

In [15]:
train, val, test = acq.split_data(titanic, 'titanic')
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
829,1,1,female,62.0,0,0,80.0,Southampton,1
463,0,2,male,48.0,0,0,13.0,Southampton,1
228,0,2,male,18.0,0,0,13.0,Southampton,1
374,0,3,female,3.0,3,1,21.075,Southampton,0
494,0,3,male,21.0,0,0,8.05,Southampton,1


In [16]:
def preprocess_titanic(train, val, test, continuous_features=['age', 'fare']):
    '''
    preprocess titanic will encode any categorical features
    and proceed forward with using a minmax scaler to transform 
    continuous variables.
    
    return: three dataframes, train, validate, and test, preprocessed for modeling
    '''
    # iterate through our three dataframes
    for df in [train, val, test]:
        # assign out the encoded categoricals for embark_town
        # in the same way on all three datasets
        df[['is_queenstown', 'is_southampton']] = pd.get_dummies(df.embark_town,
               drop_first=True).astype(int).values
        # added a column that divides fare to the total in tickets bough in that transaction
        titanic['fare_x_pass'] = titanic['fare'] / (titanic['sibsp'] + titanic['parch'] + titanic['alone']).astype(float)
    # create a single scaler object
    scaler = MinMaxScaler()
    # fit the single scaler just once to train
    scaler.fit(train[continuous_features])
    # apply that transformation to all three data sets
    # using the same syntax, which conconates
    # '_scaled' to each feature name that was fed in
    for df in [train, val, test]:
        df[[continuous_features[0]+'_scaled', continuous_features[1]+'_scaled']] = \
        scaler.transform(df[continuous_features])
    for df in [train, val, test]:
        df['is_female'] = np.where(df['sex'] == 'female', 1, 0)
    preprocessed_dfs = []
    for df in [train, val, test]:
        preprocessed_dfs.append(df.drop(columns=['sex', 'age', 'fare', 'embark_town']))
    return preprocessed_dfs

In [17]:
train_processed, val_processed, test_processed = preprocess_titanic(train, val, test)

In [18]:
train_processed.head()

Unnamed: 0,survived,pclass,sibsp,parch,alone,is_queenstown,is_southampton,age_scaled,fare_scaled,is_female
829,1,1,0,0,1,0,1,0.773813,0.15615,1
463,0,2,0,0,1,0,1,0.597889,0.025374,0
228,0,2,0,0,1,0,1,0.22091,0.025374,0
374,0,3,3,1,0,0,1,0.03242,0.041136,1
494,0,3,0,0,1,0,1,0.258608,0.015713,0


## Using the Telco dataset

#### 1. Use the function defined in `acquire.py` to load the Telco data.

In [36]:
telco = acq.get_telco_data()
telco.head()

Unnamed: 0.1,Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,1,2,1,1,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,2,1,2,1,0004-TLHLJ,Male,0,No,No,4,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


#### 2. Use the function defined in `prepare.py` to prepare the Telco data.

In [37]:
telco = pre.prep_telco(telco)
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [38]:
train, val, test = acq.split_data(telco, 'telco')
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
5274,7439-DKZTW,Male,0,No,No,1,Yes,No,No,No,...,No,Yes,No,Yes,80.55,80.55,No,Month-to-month,Fiber optic,Electronic check
374,0557-ASKVU,Female,0,Yes,Yes,18,Yes,No,No,No,...,Yes,No,No,Yes,54.4,957.1,No,One year,DSL,Credit card (automatic)
5162,7255-SSFBC,Male,0,Yes,Yes,72,Yes,Yes,No,Yes,...,Yes,Yes,Yes,Yes,112.25,8041.65,No,Two year,Fiber optic,Bank transfer (automatic)
197,0311-QYWSS,Female,0,No,No,6,Yes,No,Yes,No,...,No,No,No,Yes,49.45,314.6,No,Month-to-month,DSL,Electronic check
1294,1926-QUZNN,Female,0,Yes,No,72,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,No internet service,Yes,25.25,1841.2,No,Two year,,Bank transfer (automatic)


In [39]:
train.loc[:, 'Month-to-month'] = (train.contract_type == 'Month-to-month').astype(int)
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,Month-to-month
5274,7439-DKZTW,Male,0,No,No,1,Yes,No,No,No,...,Yes,No,Yes,80.55,80.55,No,Month-to-month,Fiber optic,Electronic check,1
374,0557-ASKVU,Female,0,Yes,Yes,18,Yes,No,No,No,...,No,No,Yes,54.4,957.1,No,One year,DSL,Credit card (automatic),0
5162,7255-SSFBC,Male,0,Yes,Yes,72,Yes,Yes,No,Yes,...,Yes,Yes,Yes,112.25,8041.65,No,Two year,Fiber optic,Bank transfer (automatic),0
197,0311-QYWSS,Female,0,No,No,6,Yes,No,Yes,No,...,No,No,Yes,49.45,314.6,No,Month-to-month,DSL,Electronic check,1
1294,1926-QUZNN,Female,0,Yes,No,72,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,Yes,25.25,1841.2,No,Two year,,Bank transfer (automatic),0


#### 3. Encode the categorical columns on train.
- Encode at least one column using `.replace`
- Encode at least one column using `.map`
- Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.


#### 4. Repeat the same steps on validate and test.

#### 5. Create a function named `prep_telco` that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.