# Data Preparation

### Iris Data

- Use the function defined in acquire.py to load the iris data.

In [1]:
import acquire
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [2]:
df_iris = acquire.get_iris_data()

In [None]:
# Drop the species_id and measurement_id columns.

df_iris.drop(['species_id', 'measurement_id'], axis=1, inplace=True)

In [None]:
# Rename the species_name column to just species.
df_iris.rename(columns={'species_name': 'species'}, inplace=True)

In [None]:
# Encode the species name using a sklearn label encoder. 
# Research the inverse_transform method of the label encoder. 
# How might this be useful? xforms back to original label

le = preprocessing.LabelEncoder()
le.fit(df_iris['species'])
df_iris['species'] = le.transform(df_iris['species'])


In [None]:
df_iris.head()

In [3]:
# Create a function named prep_iris that accepts the untransformed iris data, 
# and returns the data with the transformations above applied.

def prep_iris(df):
    df_iris.drop(['species_id', 'measurement_id'], axis=1, inplace=True)
    df_iris.rename(columns={'species_name': 'species'}, inplace=True)
    le = preprocessing.LabelEncoder()
    le.fit(df_iris['species'])
    df_iris['species'] = le.transform(df_iris['species'])
    return df


In [4]:
prep_iris(df_iris)
df_iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


# Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

- Write the code to perform the operations below. (Do this yourself, don't copy from the curriculum.)



In [5]:
df_titanic = acquire.get_titanic_data()

In [None]:
df_titanic.head()

In [None]:
# Handle the missing values in the embark_town and embarked columns.

df_titanic.embark_town.fillna('Other', inplace=True)
df_titanic.embarked.fillna('O', inplace=True)

In [None]:
df_titanic.isnull().sum()

In [None]:
# Remove the deck column.

df_titanic.drop(columns='deck', inplace=True)

In [None]:
df_titanic.head()

In [None]:
# Use a label encoder to transform the embarked column.

le_titanic = preprocessing.LabelEncoder()
le_titanic.fit(df_titanic['embarked'])
df_titanic['embarked'] = le_titanic.transform(df_titanic['embarked'])

In [None]:
df_titanic.head()

In [None]:
# Splitting before we can scale.

train, test = train_test_split(df_titanic, train_size=.7)

In [None]:
train.head()

In [None]:
# Scale the age and fare columns using a min max scaler. 
# Why might this be beneficial? 
# When might you not want to do this?

mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(train[['age', 'fare']])

train[['age', 'fare']] = mm_scaler.transform(train[['age', 'fare']])
test[['age', 'fare']] = mm_scaler.transform(test[['age', 'fare']])


In [None]:
train.head()

In [6]:
# Create a function named prep_titanic that accepts the untransformed titanic data, 
# and returns the data with the transformations above applied.

def prep_titanic(df):
    df_titanic.embark_town.fillna('Other', inplace=True)
    df_titanic.embarked.fillna('O', inplace=True)
    df_titanic.drop(columns='deck', inplace=True)
    le_titanic = preprocessing.LabelEncoder()
    le_titanic.fit(df_titanic['embarked'])
    df_titanic['embarked'] = le_titanic.transform(df_titanic['embarked'])
    train, test = train_test_split(df_titanic, train_size=.7)
    mm_scaler = preprocessing.MinMaxScaler()
    mm_scaler.fit(train[['age', 'fare']])
    train[['age', 'fare']] = mm_scaler.transform(train[['age', 'fare']])
    test[['age', 'fare']] = mm_scaler.transform(test[['age', 'fare']])
    return df, train, test


In [7]:
df_titanic, train, test = prep_titanic(df_titanic)
df_titanic.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-doc

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,3,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,3,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,3,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,3,Third,Southampton,1


In [8]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
500,500,0,3,male,0.225333,0,0,0.016908,3,Third,Southampton,1
883,883,0,2,male,0.37483,0,0,0.020495,3,Second,Southampton,1
539,539,1,1,female,0.293286,0,2,0.096618,0,First,Cherbourg,0
810,810,0,3,male,0.347649,0,0,0.015395,3,Third,Southampton,1
372,372,0,3,male,0.252514,0,0,0.015713,3,Third,Southampton,1


In [9]:
test.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
440,440,1,2,female,0.605871,1,1,0.051237,3,Second,Southampton,0
553,553,1,3,male,0.293286,0,0,0.014102,0,Third,Cherbourg,1
764,764,0,3,male,0.211742,0,0,0.015176,3,Third,Southampton,1
520,520,1,1,female,0.402011,0,0,0.1825,3,First,Southampton,1
307,307,1,1,female,0.225333,1,0,0.212559,0,First,Cherbourg,0
