In [2]:
import pandas as pd

# Load the Titanic data from a CSV path relative to the working directory
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Separate the features (independent variables) from the target
# (dependent variable), which is the "Survived" column.
X = df.drop(columns="Survived")
y = df["Survived"]

In [6]:
# Split features and target into train and test sets; the fixed random_state
# keeps the split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Preprocess the data

## Data Cleaning

In [8]:
# Count missing values per column in the training split.
X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            133
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          511
Embarked         2
dtype: int64

In [11]:
# Fill missing Cabin / Embarked values with explicit placeholder categories.
# fillna returns a new frame, so X_train is left untouched without needing a
# separate copy() followed by an in-place mutation.
X_train_fillna = X_train.fillna(
    {"Cabin": "cabin_missing", "Embarked": "embarked_missing"}
)
# Only Age should still report missing values after this step.
X_train_fillna.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            133
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [13]:
# Age is a numeric feature, so fill its missing values with SimpleImputer
# (left at its default settings), fitted on the training rows.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()

# fit_transform fits the imputer and applies it in one step; the result is
# re-wrapped with the original index so the assignment below aligns by row.
age_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_fillna[["Age"]]),
    columns=["Age"],
    index=X_train_fillna.index,
)
X_train_fillna["Age"] = age_imputed
X_train_fillna.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Dealing with Categorical Data

In [15]:
# Re-inspect dtypes and non-null counts now that missing values are filled.
X_train_fillna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 105 to 684
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  668 non-null    int64  
 1   Pclass       668 non-null    int64  
 2   Name         668 non-null    object 
 3   Sex          668 non-null    object 
 4   Age          668 non-null    float64
 5   SibSp        668 non-null    int64  
 6   Parch        668 non-null    int64  
 7   Ticket       668 non-null    object 
 8   Fare         668 non-null    float64
 9   Cabin        668 non-null    object 
 10  Embarked     668 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 62.6+ KB


In [16]:
# Columns with categorical data: every column whose dtype is not
# int64 or float64. The copy keeps this frame independent of X_train_fillna.
X_train_categorical = (
    X_train_fillna
    .select_dtypes(exclude=["int64", "float64"])
    .copy()
)
X_train_categorical

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
105,"Mionoff, Mr. Stoytcho",male,349207,cabin_missing,S
68,"Andersson, Miss. Erna Alexandra",female,3101281,cabin_missing,S
253,"Lobb, Mr. William Arthur",male,A/5. 3336,cabin_missing,S
320,"Dennis, Mr. Samuel",male,A/5 21172,cabin_missing,S
706,"Kelly, Mrs. Florence ""Fannie""",female,223596,cabin_missing,S
...,...,...,...,...,...
835,"Compton, Miss. Sara Rebecca",female,PC 17756,E49,C
192,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,350046,cabin_missing,S
629,"O'Connell, Mr. Patrick D",male,334912,cabin_missing,Q
559,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,345572,cabin_missing,S


In [18]:
# Use OneHotEncoder to convert categorical values into one-hot (dummy) columns.
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current keyword first and fall back for older releases.
# Either way the encoder returns a dense array that fits in a DataFrame.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

ohe.fit(X_train_categorical)
X_train_ohe = pd.DataFrame(
    ohe.transform(X_train_categorical),
    # index is important to ensure we can concatenate with other columns
    index=X_train_categorical.index,
    # we are dummying multiple columns at once, so stack the names
    columns=np.hstack(ohe.categories_),
)
# NOTE(review): Name and Ticket look near-unique per row, so they account for
# most of the dummy columns — confirm they are meant to be encoded.
X_train_ohe

Unnamed: 0,"Abbing, Mr. Anthony","Abbott, Mr. Rossmore Edward","Abelson, Mrs. Samuel (Hannah Wizosky)","Adahl, Mr. Mauritz Nils Martin","Adams, Mr. John","Aks, Mrs. Sam (Leah Rosen)","Albimona, Mr. Nassef Cassem","Alexander, Mr. William","Alhomaki, Mr. Ilmari Rudolf","Allen, Miss. Elisabeth Walton",...,F33,F38,F4,G6,T,cabin_missing,C,Q,S,embarked_missing
105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
