# Data Preparation

### Iris Data

- Use the function defined in acquire.py to load the iris data.

In [5]:
import warnings
warnings.filterwarnings("ignore")

import acquire
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [6]:
# Load the raw (untransformed) iris data through the project's acquire module.
df_iris = acquire.get_iris_data()

In [None]:
# Drop the species_id and measurement_id columns.

# df_iris.drop(['species_id', 'measurement_id'], axis=1, inplace=True)

In [None]:
# Rename the species_name column to just species.

# df_iris.rename(columns={'species_name': 'species'}, inplace=True)

In [None]:
# Encode the species name using a sklearn label encoder. 
# Research the inverse_transform method of the label encoder. 
# How might this be useful? It transforms the encoded values back to the original labels.

# le = preprocessing.LabelEncoder()
# le.fit(df_iris['species'])
# df_iris['species'] = le.transform(df_iris['species'])


In [None]:
# df_iris.head()

In [7]:
# Create a function named prep_iris that accepts the untransformed iris data, 
# and returns the data with the transformations above applied.

def prep_iris(df):
    """Apply the iris preparation steps to ``df`` and return it.

    The frame passed in is modified in place and also returned, so callers
    may either use the return value or keep using their own reference.

    Steps:
      1. Drop the ``species_id`` and ``measurement_id`` columns.
      2. Rename ``species_name`` to ``species``.
      3. Add ``species_encoded``, the label-encoded form of ``species``.

    Parameters
    ----------
    df : the untransformed iris data (must contain the columns named above).

    Returns
    -------
    The same ``df`` object with the transformations applied.
    """
    # Operate on the argument itself rather than on a notebook-level global,
    # so the function works for any frame it is handed.
    df.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df.rename(columns={'species_name': 'species'}, inplace=True)

    # Keep the readable ``species`` column and store the codes alongside it.
    le = preprocessing.LabelEncoder()
    df['species_encoded'] = le.fit_transform(df['species'])
    return df


In [8]:
# prep_iris modifies df_iris in place, so its return value is not captured here.
prep_iris(df_iris)
# Preview the first rows of the prepared data.
df_iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_encoded
0,setosa,5.1,3.5,1.4,0.2,0
1,setosa,4.9,3.0,1.4,0.2,0
2,setosa,4.7,3.2,1.3,0.2,0
3,setosa,4.6,3.1,1.5,0.2,0
4,setosa,5.0,3.6,1.4,0.2,0


### Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

- Write the code to perform the operations below. (Do this yourself, don't copy from the curriculum.)



In [9]:
# Load the raw (untransformed) titanic data through the project's acquire module.
df_titanic = acquire.get_titanic_data()

In [None]:
df_titanic.head()

In [None]:
# Handle the missing values in the embark_town and embarked columns.

# df_titanic.embark_town.fillna('Other', inplace=True)
# df_titanic.embarked.fillna('O', inplace=True)

In [None]:
df_titanic.isnull().sum()

In [None]:
# Remove the deck column.

# df_titanic.drop(columns='deck', inplace=True)

In [None]:
df_titanic.head()

In [None]:
# Use a label encoder to transform the embarked column.

# le_titanic = preprocessing.LabelEncoder()
# le_titanic.fit(df_titanic['embarked'])
# df_titanic['embarked'] = le_titanic.transform(df_titanic['embarked'])

In [None]:
df_titanic.head()

In [None]:
# Split into train and test sets before scaling, so the scaler is fit on the training data only.

# train, test = train_test_split(df_titanic, train_size=.7)

In [None]:
# Scale the age and fare columns using a min max scaler. 
# Why might this be beneficial? 
# When might you not want to do this?

# mm_scaler = preprocessing.MinMaxScaler()
# mm_scaler.fit(train[['age', 'fare']])

# train[['age', 'fare']] = mm_scaler.transform(train[['age', 'fare']])
# test[['age', 'fare']] = mm_scaler.transform(test[['age', 'fare']])


In [10]:
# Create a function named prep_titanic that accepts the untransformed titanic data, 
# and returns the data with the transformations above applied.

def prep_titanic(df):
    """Apply the titanic preparation steps to ``df`` and split it.

    The frame passed in is modified in place (missing-value fill, column
    drop, added encoded column) and also returned. The train and test frames
    are independent copies; only they have ``age`` and ``fare`` scaled.

    Steps:
      1. Fill missing ``embark_town`` with ``'Other'`` and missing
         ``embarked`` with ``'O'``.
      2. Drop the ``deck`` column.
      3. Add ``embarked_encode``, the label-encoded form of ``embarked``.
      4. Split 70/30 into train/test, stratified on ``survived``
         (``random_state=123`` for reproducibility).
      5. Min-max scale ``age`` and ``fare``, fitting the scaler on the
         training split only and applying it to both splits.

    Parameters
    ----------
    df : the untransformed titanic data (must contain the columns named above).

    Returns
    -------
    (df, train, test) : the prepared full frame, and the scaled train and
    test splits.
    """
    # Work on the argument, not on a notebook-level global. Column assignment
    # is used instead of ``df.col.fillna(..., inplace=True)``, which is a
    # chained in-place operation that pandas deprecates and that does not
    # update ``df`` under copy-on-write.
    df['embark_town'] = df['embark_town'].fillna('Other')
    df['embarked'] = df['embarked'].fillna('O')
    df.drop(columns='deck', inplace=True)

    # Keep the readable ``embarked`` column and store the codes alongside it.
    le_titanic = preprocessing.LabelEncoder()
    df['embarked_encode'] = le_titanic.fit_transform(df['embarked'])

    train, test = train_test_split(
        df, test_size=.3, random_state=123, stratify=df[['survived']]
    )
    # Take explicit copies so the scaled values are written to standalone
    # frames rather than to slices derived from ``df``.
    train = train.copy()
    test = test.copy()

    # Fit on train only so no information from the test split leaks into
    # the scaling parameters.
    scaled_cols = ['age', 'fare']
    mm_scaler = preprocessing.MinMaxScaler()
    mm_scaler.fit(train[scaled_cols])
    train[scaled_cols] = mm_scaler.transform(train[scaled_cols])
    test[scaled_cols] = mm_scaler.transform(test[scaled_cols])
    return df, train, test


In [11]:
# prep_titanic returns the prepared full frame plus the train and test splits.
df_titanic, train_df, test_df = prep_titanic(df_titanic)
# Preview the first rows of the prepared (unsplit) data.
df_titanic.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
df_titanic.isnull().sum()

In [None]:
# def handle_missing_values(df):
#     return df.assign(
#         embark_town=df.embark_town.fillna('Other'),
#         embarked=df.embarked.fillna('O'),
#     )

# def remove_columns(df):
#     return df.drop(columns=['deck'])

# def encode_embarked(df):
#     encoder = preprocessing.LabelEncoder()
#     encoder.fit(df.embarked)
#     return df.assign(embarked_encode=encoder.transform(df.embarked))

# def prep_titanic_data(df):
#     return df\
#         .pipe(handle_missing_values)\
#         .pipe(remove_columns)\
#         .pipe(encode_embarked)

In [None]:
# df_titanic = prep_titanic_data(df_titanic)
# df_titanic.head()