In [445]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [446]:
df = pd.read_csv('titanic_train.csv')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [447]:
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [448]:
df.dropna(subset=['Embarked'], inplace=True)


In [449]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [450]:
from sklearn.impute import SimpleImputer

In [451]:
imputer = SimpleImputer(strategy='median')
df_num = df.select_dtypes(include=['int64','float64'])



In [452]:
df_num.head()
df_num_fill_median = imputer.fit_transform(df_num)  

In [453]:
df_num_fill_median = pd.DataFrame(df_num_fill_median, columns=df_num.columns)  
df_num_fill_median.isnull().sum() 

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [454]:
y = df_num_fill_median['Survived']
df_num_fill_median.drop(['Survived'], axis=1, inplace=True)

In [455]:
df_cat = df.select_dtypes(['object'])
df_cat.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [456]:
from sklearn.preprocessing import OrdinalEncoder

cat_encoder = OrdinalEncoder()
df_cat_encoded = cat_encoder.fit_transform(df_cat)
cat_encoder.categories_ 

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [457]:
df_cat_encoded = pd.DataFrame(df_cat_encoded, columns=df_cat.columns) 
titanic_train_encoded = pd.concat([df_num_fill_median, df_cat_encoded], axis=1)
titanic_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    float64
 1   Age       889 non-null    float64
 2   SibSp     889 non-null    float64
 3   Parch     889 non-null    float64
 4   Fare      889 non-null    float64
 5   Sex       889 non-null    float64
 6   Embarked  889 non-null    float64
dtypes: float64(7)
memory usage: 48.7 KB


In [458]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder()
df_cat_onehot_encoded = onehot_encoder.fit_transform(df_cat)
onehot_encoder.categories_


[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [459]:
column_names = onehot_encoder.get_feature_names()
column_names

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [460]:
df_cat_onehot_encoded = df_cat_onehot_encoded.toarray()

In [461]:
df_cat_onehot_encoded = pd.DataFrame(df_cat_onehot_encoded, columns=column_names)
df_cat_onehot_encoded.head()

Unnamed: 0,x0_female,x0_male,x1_C,x1_Q,x1_S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0


In [462]:
titanic_train_onehot_encoded = pd.concat([df_num_fill_median, df_cat_onehot_encoded], axis=1)
titanic_train_onehot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     889 non-null    float64
 1   Age        889 non-null    float64
 2   SibSp      889 non-null    float64
 3   Parch      889 non-null    float64
 4   Fare       889 non-null    float64
 5   x0_female  889 non-null    float64
 6   x0_male    889 non-null    float64
 7   x1_C       889 non-null    float64
 8   x1_Q       889 non-null    float64
 9   x1_S       889 non-null    float64
dtypes: float64(10)
memory usage: 69.6 KB


In [463]:
from sklearn.tree import DecisionTreeClassifier
tree_clf1 = DecisionTreeClassifier(criterion='entropy')
tree_clf1.fit(titanic_train_encoded, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [464]:
tree_clf2 = DecisionTreeClassifier(criterion='entropy')
tree_clf2.fit(titanic_train_onehot_encoded, y)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [465]:
tree_clf3 = DecisionTreeClassifier(criterion='entropy',max_depth=4)
tree_clf3.fit(titanic_train_onehot_encoded, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [466]:
df_test = pd.read_csv('titanic_test.csv')
df_test.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [467]:
df_test_id = df_test['PassengerId']

In [468]:
df_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [469]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [470]:
df_test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [471]:
mean_imputer = SimpleImputer(strategy='mean')
df_test_num = df_test.select_dtypes(include=['int64','float64'])
df_test_num_fill_mean = mean_imputer.fit_transform(df_test_num)  
df_test_num_fill_mean = pd.DataFrame(df_test_num_fill_mean, columns=df_test_num.columns)  
df_test_num_fill_mean.isnull().sum()

Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64

In [472]:
df_test_num_fill_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    float64
 1   Age     418 non-null    float64
 2   SibSp   418 non-null    float64
 3   Parch   418 non-null    float64
 4   Fare    418 non-null    float64
dtypes: float64(5)
memory usage: 16.5 KB


In [473]:
df_test_cat = df_test.select_dtypes(include=['object'])
df_test_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       418 non-null    object
 1   Embarked  418 non-null    object
dtypes: object(2)
memory usage: 6.7+ KB


In [474]:
df_test_cat_encoded = cat_encoder.fit_transform(df_test_cat)
df_test_cat_encoded = pd.DataFrame(df_test_cat_encoded, columns=df_test_cat.columns)

In [475]:
titanic_test_encoded = pd.concat([df_test_num_fill_mean, df_test_cat_encoded], axis=1)
titanic_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    float64
 1   Age       418 non-null    float64
 2   SibSp     418 non-null    float64
 3   Parch     418 non-null    float64
 4   Fare      418 non-null    float64
 5   Sex       418 non-null    float64
 6   Embarked  418 non-null    float64
dtypes: float64(7)
memory usage: 23.0 KB


In [476]:
y_hat_tree1 = tree_clf1.predict(titanic_test_encoded)
tree1_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_tree1.astype(int),
})
tree1_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [477]:
tree1_submit.to_csv('tree1_submit.csv', index=False) 


In [478]:
df_test_cat_onehot_encoded = onehot_encoder.fit_transform(df_test_cat)
column_names = onehot_encoder.get_feature_names()
df_test_cat_onehot_encoded = df_test_cat_onehot_encoded.toarray()
df_test_cat_onehot_encoded = pd.DataFrame(df_test_cat_onehot_encoded, columns=column_names)
titanic_test_onehot_encoded = pd.concat([df_test_num_fill_mean, df_test_cat_onehot_encoded], axis=1)
titanic_test_onehot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     418 non-null    float64
 1   Age        418 non-null    float64
 2   SibSp      418 non-null    float64
 3   Parch      418 non-null    float64
 4   Fare       418 non-null    float64
 5   x0_female  418 non-null    float64
 6   x0_male    418 non-null    float64
 7   x1_C       418 non-null    float64
 8   x1_Q       418 non-null    float64
 9   x1_S       418 non-null    float64
dtypes: float64(10)
memory usage: 32.8 KB


In [479]:
y_hat_tree2 = tree_clf2.predict(titanic_test_onehot_encoded)
tree2_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_tree2.astype(int),
})
tree2_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [480]:
y_hat_tree3 = tree_clf3.predict(titanic_test_onehot_encoded)
tree3_submit = pd.DataFrame({
    'PassengerId': df_test_id, 
    'Survived': y_hat_tree3.astype(int),
})
tree3_submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [481]:
tree2_submit.to_csv('tree2_submit.csv', index=False) 
tree3_submit.to_csv('tree3_submit.csv', index=False) 
