In [36]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import scipy as sp
from pydataset import data
from env import get_db_url, user, password, host
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

import acquire as acq
import os
directory = os.getcwd()

## Using the Titanic dataset

#### 1. Use the function defined in `acquire.py` to load the Titanic data.

#### 2. Use the function defined in `prepare.py` to prepare the titanic data.

In [44]:
titanic = acq.get_titanic_data()
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [43]:
titanic = acq.prep_titanic(acq.get_titanic_data())
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,fare_x_pass
0,0,3,male,22.0,1,0,7.25,Southampton,0,7.25
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,71.2833
2,1,3,female,26.0,0,0,7.925,Southampton,1,7.925
3,1,1,female,35.0,1,0,53.1,Southampton,0,53.1
4,0,3,male,35.0,0,0,8.05,Southampton,1,8.05


In [20]:
train, val, test = acq.split_data(titanic,'titanic')
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,fare_x_pass
829,1,1,female,62.0,0,0,80.0,Southampton,1,80.0
463,0,2,male,48.0,0,0,13.0,Southampton,1,13.0
228,0,2,male,18.0,0,0,13.0,Southampton,1,13.0
374,0,3,female,3.0,3,1,21.075,Southampton,0,5.26875
494,0,3,male,21.0,0,0,8.05,Southampton,1,8.05


#### 3. Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.

In [None]:
train.loc[:, 'is_female'] = (train.sex == 'female').astype(int)

In [27]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embark_town,alone,fare_x_pass,is_female
829,1,1,62.0,0,0,80.0,Southampton,1,80.0,1
463,0,2,48.0,0,0,13.0,Southampton,1,13.0,0
228,0,2,18.0,0,0,13.0,Southampton,1,13.0,0
374,0,3,3.0,3,1,21.075,Southampton,0,5.26875,1
494,0,3,21.0,0,0,8.05,Southampton,1,8.05,0


In [29]:
train[['is_queenstown', 'is_southampton']] = pd.get_dummies(train.embark_town,
               drop_first=True).astype(int).values
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embark_town,alone,fare_x_pass,is_female,is_queenstown,is_southampton
829,1,1,62.0,0,0,80.0,Southampton,1,80.0,1,0,1
463,0,2,48.0,0,0,13.0,Southampton,1,13.0,0,0,1
228,0,2,18.0,0,0,13.0,Southampton,1,13.0,0,0,1
374,0,3,3.0,3,1,21.075,Southampton,0,5.26875,1,0,1
494,0,3,21.0,0,0,8.05,Southampton,1,8.05,0,0,1


In [None]:
train = train.drop(columns = 'embark_town')

In [33]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,fare_x_pass,is_female,is_queenstown,is_southampton
829,1,1,62.0,0,0,80.0,1,80.0,1,0,1
463,0,2,48.0,0,0,13.0,1,13.0,0,0,1
228,0,2,18.0,0,0,13.0,1,13.0,0,0,1
374,0,3,3.0,3,1,21.075,0,5.26875,1,0,1
494,0,3,21.0,0,0,8.05,1,8.05,0,0,1


In [37]:
continuous_features = ['age', 'fare_x_pass']
scaler = MinMaxScaler()

In [38]:
def preprocess_titanic(train, val, test, continuous_features=['age', 'fare']):
    '''
    preprocess titanic will encode any categorical features
    and proceed forward with using a minmax scaler to transform 
    continuous variables.
    
    return: three dataframes, train, validate, and test, preprocessed for modeling
    '''
    # iterate through our three dataframes
    for df in [train, val, test]:
        # assign out the encoded categoricals for embark_town
        # in the same way on all three datasets
        df[['is_queenstown', 'is_southampton']] = pd.get_dummies(df.embark_town,
               drop_first=True).astype(int).values
        # added a column that divides fare to the total in tickets bough in that transaction
        titanic['fare_x_pass'] = titanic['fare'] / (titanic['sibsp'] + titanic['parch'] + titanic['alone']).astype(float)
    # create a single scaler object
    scaler = MinMaxScaler()
    # fit the single scaler just once to train
    scaler.fit(train[continuous_features])
    # apply that transformation to all three data sets
    # using the same syntax, which conconates
    # '_scaled' to each feature name that was fed in
    for df in [train, val, test]:
        df[[continuous_features[0]+'_scaled', continuous_features[1]+'_scaled']] = \
        scaler.transform(df[continuous_features])
    for df in [train, val, test]:
        df['is_female'] = np.where(df['sex'] == 'female', 1, 0)
    preprocessed_dfs = []
    for df in [train, val, test]:
        preprocessed_dfs.append(df.drop(columns=['sex', 'age', 'fare', 'embark_town']))
    return preprocessed_dfs

In [41]:
train_processed, val_processed, test_processed = preprocess_titanic(train, val, test)

AttributeError: 'DataFrame' object has no attribute 'embark_town'

#### 4. Create a function named `preprocess_titanic` that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

## Using the Telco dataset

#### 1. Use the function defined in `acquire.py` to load the Telco data.

#### 2. Use the function defined in `prepare.py` to prepare the Telco data.

#### 3. Encode the categorical columns on train.
- Encode at least one column using `.replace`
- Encode at least one column using `.map`
- Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.


#### 4. Repeat the same steps on validate and test.

#### 5. Create a function named `prep_telco` that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.