Prepare Environment

In [1]:
import pandas as pd
import numpy as np
import acquire
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

Acquire Data

In [2]:
# Load the Titanic data through the acquire helper module imported above; `df` is the working frame for the cells below.
df = acquire.get_titanic_data()

In [4]:
# Load the iris data through the same acquire helper module.
# NOTE(review): df_iris is not referenced by any of the cells visible here — confirm it is still needed.
df_iris = acquire.get_iris_data()

Summarize

- info
- describe
- head/tail
- value_counts
- shape
- isnull

In [None]:
# Column names, non-null counts and dtypes — a first look at where values are missing.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# The same summary statistics, flipped so that each column of df becomes a row.
df.describe().T

In [None]:
# Distribution of age over 10 bins, kept in bin order instead of being sorted by count.
df['age'].value_counts(sort=False, bins=10)

Prepare Data

- drop columns
- fillna
- split
- impute mean, mode, median: SimpleImputer
- integer encoding: LabelEncoder
- one hot encoding: OneHotEncoder
- scale

In [None]:
# 'deck' has only 203 non-null values and 688 nulls, so it carries too little information
# to be useful and we drop it.
# Rebinding df (rather than inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second run is a no-op instead of a KeyError on the already-removed column.
df = df.drop(columns=['deck'], errors='ignore')

In [None]:
# Normalise missing-value markers to np.nan. Entries that are already NaN are left unchanged;
# the goal is a single sentinel that SimpleImputer(missing_values=np.nan) recognises below,
# which also makes values that cause errors easier to track down.
# NOTE(review): this only changes anything if the source uses another marker such as None — confirm.
df.fillna(value=np.nan, inplace=True)

In [None]:
# Split the data into train (80%) and test using scikit-learn's train_test_split;
# the fixed random_state makes the split reproducible.
train, test = train_test_split(df, random_state=123, train_size=0.8)

Impute mean, mode, median using SimpleImputer

In [None]:
# Count each 'embarked' value in train, NaN included, to see how many need imputing.
train['embarked'].value_counts(dropna=False)

We see that there are 2 NaN values in this column. We will fill them using SimpleImputer with `strategy='most_frequent'`.

In [None]:
# Build the imputer: np.nan entries are replaced with the most frequent value
# of the column the imputer is fit on.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imp_mode

In [None]:
# Fit the SimpleImputer on the train data only, so the fill value is learned without looking at test.
# fit() returns the fitted imputer itself, which is what this cell displays.
imp_mode.fit(train[['embarked']])

In [None]:
# Fill the missing 'embarked' values in train using the SimpleImputer (an imputer, not a scaler)
# that was fit on train.
train['embarked'] = imp_mode.transform(train[['embarked']])
train['embarked'].head()

In [None]:
# Fill the missing 'embarked' values in test using the same SimpleImputer (an imputer, not a scaler)
# that was fit on train; the imputer is not re-fit on test.
test['embarked'] = imp_mode.transform(test[['embarked']])
test['embarked'].head()

In [None]:
# Most common age values in each split, with NaN included so the missing ages are visible.
train['age'].value_counts(dropna=False).head(), test['age'].value_counts(dropna=False).head()

There are 148 NaN values in train and 29 NaN values in test. We will use SimpleImputer to fill these with the median age.

In [None]:
# Build the imputer: np.nan entries are replaced with the median of the column it is fit on.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)

In [None]:
# Fit the median imputer on the train data and fill train's missing ages in one step
# using fit_transform.
train['age'] = imp_median.fit_transform(train[['age']])
# Fill test's missing ages with the median learned from train. Use transform only (never
# re-fit on test); without this step the NaN ages in test would be left in place.
test['age'] = imp_median.transform(test[['age']])

In [None]:
# Most frequent ages in train after median imputation.
# value_counts() leaves NaN out by default, so this view cannot by itself prove that none remain;
# train['age'].isnull().sum() is the direct check.
train['age'].value_counts().head()

Encoding

1. Integer encoding
2. one hot encoding

In [None]:
# We want the categorical variable 'embarked' labelled with numbers instead of letters.
# Right now 'embarked' is an object datatype.
# Create the encoder object.
int_encoder = LabelEncoder()
int_encoder

In [None]:
# Learn the set of 'embarked' classes from the train data only.
int_encoder.fit(train['embarked'])

In [None]:
# Transform train with the encoder fitted on train, then count the rows per integer label.
# LabelEncoder numbers the classes in sorted order, so with the ports C, Q and S the
# mapping is C -> 0, Q -> 1, S -> 2 (int_encoder.classes_ shows the exact order).
train['embarked'] = int_encoder.transform(train['embarked'])
train['embarked'].value_counts()

Next we want to one-hot encode the integer-encoded `embarked` values, so that each row is represented by a set of 0/1 indicator values, one per category.

In [None]:
# OneHotEncoder works on 2-D input, so turn the encoded train.embarked column
# into an array with one column and one row per passenger.
embarked_array = np.array(train['embarked']).reshape(-1, 1)
embarked_array[:5]

In [None]:
# Create the OneHotEncoder so that it returns a dense array instead of a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later releases removed
# the old name, so try the current keyword first and fall back to the legacy one on older installs.
try:
    ohe = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories='auto')
ohe

In [None]:
# Fit the OneHotEncoder on embarked_array (which holds the train data) and transform it in one step.
embarked_ohe = ohe.fit_transform(embarked_array)

In [None]:
# Inspect the one-hot encoded train array.
embarked_ohe

In [None]:
# The test data goes through the same two encoding steps. First, map test's 'embarked'
# values to the integer labels learned from train (the encoder is not re-fit on test).
test['embarked'] = int_encoder.transform(test['embarked'])
test['embarked'].head(20)

Make the test.embarked Series into a 2-D array to transform using OneHotEncoder

In [None]:
# Reshape test's encoded column into a one-column 2-D array.
# Note that this rebinds embarked_array, which held the train array until now.
embarked_array = np.array(test['embarked']).reshape(-1, 1)

In [None]:
# One-hot encode the test array with the encoder already fit on train (transform only, no re-fit).
embarked_test_ohe = ohe.transform(embarked_array)

In [None]:
# First five one-hot encoded test rows.
embarked_test_ohe[:5]