In [4]:
import pandas as pd
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
from pandas_profiling import ProfileReport
profile = ProfileReport(
    df, title ="Pandas Profiling Report for Titanic train dataset"
)
profile.to_file("titanic_train_profiling.html")

Summarize dataset:  24%|██▎       | 4/17 [00:00<00:00, 53.46it/s, Describe variable:PassengerId]

Summarize dataset: 100%|██████████| 47/47 [00:02<00:00, 15.79it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 491.19it/s]


# Preprocessing

## Drop columns and fill in missing values

In [13]:
df_forml = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df_forml.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [10]:
# Fill with value that exist the most
df_forml['Embarked'].iloc[61] = 'S'
df_forml['Embarked'].iloc[829] = 'S'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_forml['Embarked'].iloc[61] = 'S'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_forml['Embarked'].iloc[829] = 'S'


In [11]:
df_forml.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


## Feature engineering



In [12]:
df_forml = df_forml.join(pd.get_dummies(df_forml.Embarked, prefix='Embarked'))
df_forml = df_forml.join(pd.get_dummies(df_forml.Sex, columns=['male','female']))
df_forml

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,female,male
0,0,3,male,22.0,1,0,7.2500,S,0,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,C,1,0,0,1,0
2,1,3,female,26.0,0,0,7.9250,S,0,0,1,1,0
3,1,1,female,35.0,1,0,53.1000,S,0,0,1,1,0
4,0,3,male,35.0,0,0,8.0500,S,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,0,0,1,0,1
887,1,1,female,19.0,0,0,30.0000,S,0,0,1,1,0
888,0,3,female,,1,2,23.4500,S,0,0,1,1,0
889,1,1,male,26.0,0,0,30.0000,C,1,0,0,0,1


In [15]:
df_forml.drop(columns=['Sex','Embarked'], inplace=True)
df_forml.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [None]:
df_forml['Age'] = df_forml['Age']/(df_forml['Age'].max())
df_forml['Fare'] = df_forml['Fare']/(df_forml['Fare'].max())

# Split data and Train model

In [None]:
from sklearn.model_selection import train_test_split
X = df_forml.drop(columns=['Survived'])
y = df_forml['Survived']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)