# Data Preparation

### Iris Data

- Use the function defined in acquire.py to load the iris data.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import acquire
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
df_iris = acquire.get_iris_data()

In [3]:
# Drop the species_id and measurement_id columns.

# df_iris.drop(['species_id', 'measurement_id'], axis=1, inplace=True)

In [4]:
# Rename the species_name column to just species.

# df_iris.rename(columns={'species_name': 'species'}, inplace=True)

In [5]:
# Encode the species name using a sklearn label encoder. 
# Research the inverse_transform method of the label encoder. 
# How might this be useful? It converts the encoded integers back to the original labels.

# le = preprocessing.LabelEncoder()
# le.fit(df_iris['species'])
# df_iris['species'] = le.transform(df_iris['species'])


In [6]:
# df_iris.head()

In [7]:
# Create a function named prep_iris that accepts the untransformed iris data, 
# and returns the data with the transformations above applied.

def prep_iris(df):
    """Prepare the raw iris data for modeling.

    Drops the ``species_id`` and ``measurement_id`` columns, renames
    ``species_name`` to ``species``, and replaces the species names with
    integer codes produced by a scikit-learn ``LabelEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Untransformed iris data containing the ``species_id``,
        ``measurement_id`` and ``species_name`` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in. It is modified in place, so
        both ``prep_iris(df)`` and ``df = prep_iris(df)`` leave ``df``
        transformed.
    """
    # Work on the argument rather than the notebook-level ``df_iris`` global,
    # so the function behaves the same for any frame and when imported.
    df.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df.rename(columns={'species_name': 'species'}, inplace=True)

    # NOTE: the fitted encoder is local to this call, so callers cannot use
    # its inverse_transform to recover the original species names.
    le = preprocessing.LabelEncoder()
    df['species'] = le.fit_transform(df['species'])
    return df


In [8]:
# prep_iris modifies df_iris in place, so its return value is not needed here.
prep_iris(df_iris)
df_iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


### Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

- Write the code to perform the operations below. (Do this yourself, don't copy from the curriculum.)



In [9]:
df_titanic = acquire.get_titanic_data()

In [10]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [11]:
# Handle the missing values in the embark_town and embarked columns.

# df_titanic.embark_town.fillna('Other', inplace=True)
# df_titanic.embarked.fillna('O', inplace=True)

In [12]:
df_titanic.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [13]:
# Remove the deck column.

# df_titanic.drop(columns='deck', inplace=True)

In [14]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [15]:
# Use a label encoder to transform the embarked column.

# le_titanic = preprocessing.LabelEncoder()
# le_titanic.fit(df_titanic['embarked'])
# df_titanic['embarked'] = le_titanic.transform(df_titanic['embarked'])

In [16]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [17]:
# Split into train and test sets before scaling, so the scaler is fit on the training data only.

# train, test = train_test_split(df_titanic, train_size=.7)

In [18]:
# Scale the age and fare columns using a min max scaler. 
# Why might this be beneficial? 
# When might you not want to do this?

# mm_scaler = preprocessing.MinMaxScaler()
# mm_scaler.fit(train[['age', 'fare']])

# train[['age', 'fare']] = mm_scaler.transform(train[['age', 'fare']])
# test[['age', 'fare']] = mm_scaler.transform(test[['age', 'fare']])


In [19]:
# Create a function named prep_titanic that accepts the untransformed titanic data, 
# and returns the data with the transformations above applied.

def prep_titanic(df, train_size=.7, random_state=None):
    """Prepare the raw titanic data and split it into train and test sets.

    Steps applied to ``df`` (in place):

    * fill missing ``embark_town`` values with ``'Other'`` and missing
      ``embarked`` values with ``'O'``;
    * drop the ``deck`` column;
    * replace ``embarked`` with integer codes from a ``LabelEncoder``.

    The cleaned frame is then split, and ``age`` and ``fare`` are min-max
    scaled in the train and test sets using a scaler fit on the train set
    only. Missing ``age`` values are not imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Untransformed titanic data.
    train_size : float, optional
        Proportion of rows placed in the train set. Defaults to 0.7.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default)
        gives a different split on every call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, train, test)``: the cleaned full frame (``age`` and ``fare``
        left unscaled) followed by the scaled train and test frames.
    """
    # Work on the argument rather than the notebook-level ``df_titanic``
    # global. Assigning the filled column back avoids calling
    # fillna(inplace=True) on an attribute-accessed Series, which is chained
    # assignment and is not guaranteed to update the parent frame.
    df['embark_town'] = df['embark_town'].fillna('Other')
    df['embarked'] = df['embarked'].fillna('O')
    df.drop(columns='deck', inplace=True)

    le_titanic = preprocessing.LabelEncoder()
    df['embarked'] = le_titanic.fit_transform(df['embarked'])

    train, test = train_test_split(
        df, train_size=train_size, random_state=random_state
    )
    # Take explicit copies so the scaled values are written to independent
    # frames: ``df`` keeps its raw values and no SettingWithCopyWarning fires.
    train = train.copy()
    test = test.copy()

    scaled_cols = ['age', 'fare']
    # Fit on train only so no information from the test set leaks into scaling.
    mm_scaler = preprocessing.MinMaxScaler()
    mm_scaler.fit(train[scaled_cols])
    train[scaled_cols] = mm_scaler.transform(train[scaled_cols])
    test[scaled_cols] = mm_scaler.transform(test[scaled_cols])
    return df, train, test


In [20]:
# prep_titanic returns the cleaned full frame plus the train and test splits;
# only the splits have age and fare min-max scaled.
df_titanic, train_df, test_df = prep_titanic(df_titanic)
df_titanic.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,3,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,3,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,3,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,3,Third,Southampton,1


In [21]:
train_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
475,475,0,1,male,,0,0,0.101497,3,First,Southampton,1
619,619,0,2,male,0.321438,0,0,0.020495,3,Second,Southampton,1
57,57,0,3,male,0.352852,0,0,0.01411,0,Third,Cherbourg,1
858,858,1,3,female,0.296306,0,3,0.03759,0,Third,Cherbourg,0
886,886,0,2,male,0.334004,0,0,0.025374,3,Second,Southampton,1


In [22]:
test_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
167,167,0,3,female,0.560191,1,4,0.054457,3,Third,Southampton,0
643,643,1,3,male,,0,0,0.110272,3,Third,Southampton,1
346,346,1,2,female,0.497361,0,0,0.025374,3,Second,Southampton,1
58,58,1,2,female,0.057552,1,2,0.054164,3,Second,Southampton,0
499,499,0,3,male,0.296306,0,0,0.015216,3,Third,Southampton,1


In [23]:
df_titanic.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          0
class             0
embark_town       0
alone             0
dtype: int64