## **Importing Packages**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


**Testing data loading step**

In [2]:
#Importing the testing dataset
X_test_full = pd.read_csv("/kaggle/input/titanic/test.csv")

X_test_full.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


**Training data loading step. We are at a point where we split the training and validation data in train_X, val_X, train_y, val_y.**

Next, we will experiment with one-hot encoding.  But, instead of encoding all of the categorical variables in the dataset, we will only create a one-hot encoding for columns with cardinality less than 10.

Run the code cell below without changes to set `low_cardinality_cols` to a Python list containing the columns that will be one-hot encoded.  Likewise, `high_cardinality_cols` contains a list of categorical columns that will be dropped from the dataset.

In [3]:
#Importing the training dataset
from sklearn.model_selection import train_test_split

#Declaring X and y vairiables
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

# Remove rows with missing target, separate target from predictors
y = train_data["Survived"]
X = train_data.drop(['Survived'], axis=1)

# Divide data into training and validation subsets
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we split
train_X, val_X, train_y, val_y = train_test_split(X, y,train_size=0.8, test_size=0.2,random_state = 0)


# "Cardinality" means the number of unique values in a column
# Get list of categorical variables
s = (train_X.dtypes == 'object')
object_cols = list(s[s].index)
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [col for col in object_cols if train_X[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)


#Or you can do this if you know the columns to drop
# X.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

# Select numerical columns
numerical_cols = [cname for cname in train_X.columns if train_X[cname].dtype in ['int64', 'float64']]
print('\nNumerical Columns from the dataset:', numerical_cols,'\n')

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols  
# my_cols = train_data['Cabin']
X_train = train_X[my_cols].copy()
X_valid = val_X[my_cols].copy()


# # Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: train_X[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# # # Print number of unique entries by column, in ascending order
print('Number of Unique entries :',sorted(d.items(), key=lambda x: x[1]))

Categorical columns that will be one-hot encoded: ['Sex', 'Embarked']

Categorical columns that will be dropped from the dataset: ['Cabin', 'Name', 'Ticket']

Numerical Columns from the dataset: ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] 

Number of Unique entries : [('Sex', 2), ('Embarked', 3), ('Cabin', 127), ('Ticket', 569), ('Name', 712)]


We take a peek at the training data with the head() method below.

In [4]:
X_train.head()

Unnamed: 0,Sex,Embarked,PassengerId,Pclass,Age,SibSp,Parch,Fare
140,female,C,141,3,,0,2,15.2458
439,male,S,440,2,31.0,0,0,10.5
817,male,C,818,2,31.0,1,1,37.0042
378,male,C,379,3,20.0,0,0,4.0125
491,male,S,492,3,21.0,0,0,7.25


**Using Ordinal Encoding**
This is a common problem that you'll encounter with real-world data, and there are many approaches to fixing this issue.  For instance, you can write a custom ordinal encoder to deal with new categories.  The simplest approach, however, is to drop the problematic categorical columns.  

Run the code cell below to save the problematic columns to a Python list `bad_label_cols`.  Likewise, columns that can be safely ordinal encoded are stored in `good_label_cols`.

In [5]:
# Columns that can be safely ordinal encoded
good_label_cols = low_cardinality_cols
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
print("\nCategorical variables:")
print(object_cols)

Categorical columns that will be ordinal encoded: ['Sex', 'Embarked']

Categorical columns that will be dropped from the dataset: ['Cabin', 'Name', 'Ticket']

Categorical variables:
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


**Define Function to Measure Quality of Each Approach**

We define a function score_dataset() to compare the three different approaches to dealing with categorical variables. This function reports the mean absolute error (MAE) from a random forest model. In general, we want the MAE to be as low as possible!

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
# Function for comparing different approaches
def score_dataset(X_train, X_valid, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=100, n_estimators=10, random_state=1)
    model.fit(X_train, train_y)
    preds = model.predict(X_valid)
    return mean_absolute_error(val_y, preds)

**Before jumping into encoding, we'll investigate the dataset.  Specifically, we'll look at the `'Condition2'` column.  The code cell below prints the unique entries in both the training and validation sets**.

In [7]:
# print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
# print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())

### Ordinal Encoding 

Use the next code cell to ordinal encode the data in `X_train` and `X_valid`.  Set the preprocessed DataFrames to `label_X_train` and `label_X_valid`, respectively.  
- We have provided code below to drop the categorical columns in `bad_label_cols` from the dataset. 
- You should ordinal encode the categorical columns in `good_label_cols`.  

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded
label_X_train = train_X.drop(bad_label_cols, axis=1)
label_X_valid = val_X.drop(bad_label_cols, axis=1)

# Apply ordinal encoder 
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])# Your code here   

In [9]:
# Get names of columns with missing values in Xtrain
cols_with_missing = [col for col in label_X_train.columns
                     if label_X_train[col].isnull().any()]

# Shape of training data (num_rows, num_columns)
print(label_X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (label_X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Get names of columns with missing values in Valid_X
cols_with_missing = [col for col in label_X_valid.columns
                     if label_X_valid[col].isnull().any()]

# Shape of training data (num_rows, num_columns)
print('\n',label_X_valid.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (label_X_valid.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0],'\n')

(712, 8)
Age         141
Embarked      2
dtype: int64

 (179, 8)
Age    36
dtype: int64 



### Check Absolute Mean Error for Ordinal Encoder

In [10]:
#For Handling Missing Values 
from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_train = pd.DataFrame(my_imputer.fit_transform(label_X_train))
imputed_valid = pd.DataFrame(my_imputer.transform(label_X_valid))

# Imputation removed column names; put them back
imputed_train.columns = label_X_train.columns
imputed_valid.columns = label_X_valid.columns

print("MAE from Approach (Imputation):")
print(score_dataset(imputed_train, imputed_valid, train_y, val_y))

MAE from Approach (Imputation):
0.23626024995539335


**(One-Hot Encoding)**

We use the OneHotEncoder class from scikit-learn to get one-hot encodings. There are a number of parameters that can be used to customize its behavior.

We set handle_unknown='ignore' to avoid errors when the validation data contains classes that aren't represented in the training data, and
setting sparse=False ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).
To use the encoder, we supply only the categorical columns that we want to be one-hot encoded. For instance, to encode the training data, we supply X_train[object_cols]. (object_cols in the code cell below is a list of the column names with categorical data, and so X_train[object_cols] contains all of the categorical data in the training set.)

In [11]:
#Selecting Training Data with categories using oneHot encoder and splitting to train and validation 
from sklearn.preprocessing import OneHotEncoder
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
# features = ["Pclass", "Sex", "SibSp", "Parch", 'Embarked','Age']
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
print(X_train.shape)

print(OH_X_valid.shape)

(712, 8)
(179, 12)


**Getting missing values colums from training and validations set**

In [12]:
# Get names of columns with missing values in Xtrain
cols_with_missing = [col for col in OH_X_train.columns
                     if OH_X_train[col].isnull().any()]

# Shape of training data (num_rows, num_columns)
print(OH_X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (OH_X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(712, 12)
Age    141
dtype: int64


In [13]:
# Get names of columns with missing values in Valid_X
cols_with_missing = [col for col in OH_X_valid.columns
                     if OH_X_valid[col].isnull().any()]

# Shape of training data (num_rows, num_columns)
print(OH_X_valid.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (OH_X_valid.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(179, 12)
Age    36
dtype: int64


### Check Absolute Mean Error for OneHot Encoder

**Using simple imputer from sklearn to impute values for Xtrain and Xvalid** 

In [14]:
#For Handling Missing Values 
from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = OH_X_train.columns
imputed_X_valid.columns = OH_X_valid.columns

print("MAE from Approach (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, train_y, val_y))

MAE from Approach (Imputation):
0.2279342323975322




## Data Handling for Xtest Predictions 

In [15]:
#Selecting X_test data without categorical variables 
X_test = X_test_full

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)

Ts = (X_test.dtypes == 'object')
object_test_cols = list(Ts[Ts].index)
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
Tlow_cardinality_cols = [col for col in object_test_cols if X_test[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_test_cols)-set(Tlow_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)


#Or you can do this if you know the columns to drop
# X.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

# Select numerical columns
numerical_cols = [cname for cname in X_test.columns if X_test[cname].dtype in ['int64', 'float64']]

print('\nNumerical Columns from the dataset:', numerical_cols,'\n')

# Keep selected columns only
test_cols = Tlow_cardinality_cols + numerical_cols
X_test_Run = X_test[test_cols].copy()

print("Categorical variables:")
print(object_test_cols,'\n')

# # Get number of unique entries in each column with categorical data
object_unique = list(map(lambda col: X_test[col].nunique(), object_cols))
d = dict(zip(object_cols, object_unique))

# # Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

Categorical columns that will be one-hot encoded: ['Sex', 'Embarked']

Categorical columns that will be dropped from the dataset: ['Cabin', 'Name', 'Ticket']

Numerical Columns from the dataset: ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] 

Categorical variables:
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'] 



[('Sex', 2), ('Embarked', 3)]

## Using one Hot Encoder for Test Dataset

In [16]:
# Apply one-hot encoder to each column with categorical data
Ts = (X_test_Run.dtypes == 'object')
object_test_cols = list(Ts[Ts].index)
TOH_cols_test = pd.DataFrame(OH_encoder.fit_transform(X_test_Run[object_test_cols]))

# One-hot encoding removed index; put it back
TOH_cols_test.index = X_test_Run.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_test = X_test_Run.drop(object_test_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_test = pd.concat([num_X_test, TOH_cols_test], axis=1)
OH_X_test['5'] = np.zeros(OH_X_test.shape[0])
OH_X_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5
0,892,3,34.5,0,0,7.8292,0.0,1.0,0.0,1.0,0.0,0.0
1,893,3,47.0,1,0,7.0,1.0,0.0,0.0,0.0,1.0,0.0
2,894,2,62.0,0,0,9.6875,0.0,1.0,0.0,1.0,0.0,0.0
3,895,3,27.0,0,0,8.6625,0.0,1.0,0.0,0.0,1.0,0.0
4,896,3,22.0,1,1,12.2875,1.0,0.0,0.0,0.0,1.0,0.0


In [17]:
test_imputer = SimpleImputer()

#Checking for missing values
cols_with_missing = [col for col in OH_X_test.columns
                     if OH_X_test[col].isnull().any()]

# Shape of training data (num_rows, num_columns)
print(OH_X_test.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (OH_X_test.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#Imputing data to the missing numerical data of X_test 
final_X_test = pd.DataFrame(test_imputer.fit_transform(OH_X_test))

(418, 12)
Age     86
Fare     1
dtype: int64




In [18]:
#Prediction of X_Test
model = RandomForestRegressor(max_leaf_nodes=100, n_estimators=10,random_state=1)
model.fit(imputed_X_valid, val_y)
preds = model.predict(final_X_test)
preds = (np.floor(preds)).astype(int)
preds






array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [19]:
output = pd.DataFrame({'PassengerId': X_test_full.PassengerId, 'Survived': preds})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
preds

Your submission was successfully saved!


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,