Open Machine Learning Course 

**Kaggle <a href="https://www.kaggle.com/c/titanic">competition</a> "Titanic: Machine Learning from Disaster".**

In [1]:
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

**Read data**

In [2]:
# Prefer the local copy of the training set and fall back to a public mirror
# only when that file is missing. Catching FileNotFoundError (instead of a bare
# `except:`) keeps genuine problems -- parse errors, typos, Ctrl-C -- visible
# rather than silently triggering a download.
try:
    df = pd.read_csv("titanic_train.csv", index_col="PassengerId")
except FileNotFoundError:
    url = 'https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv'
    df = pd.read_csv(url, index_col="PassengerId")

In [3]:
# Preview the first two rows to confirm the file loaded with PassengerId as the index.
df.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [5]:
# Remove the columns that are not used as features below (modifies df in place).
df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)

In [6]:
# List every row that still has at least one missing value.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `any(1)` raises TypeError there.
df[df.isnull().any(axis=1)]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,0,3,male,,0,0,8.4583
18,1,2,male,,0,0,13.0000
20,1,3,female,,0,0,7.2250
27,0,3,male,,0,0,7.2250
29,1,3,female,,0,0,7.8792
...,...,...,...,...,...,...,...
860,0,3,male,,0,0,7.2292
864,0,3,female,,8,2,69.5500
869,0,3,male,,0,0,9.5000
879,0,3,male,,0,0,7.8958


In [7]:
# Discard every row that contains a missing value (modifies df in place).
df.dropna(axis="index", how="any", inplace=True)

In [8]:
# Sanity check: once rows with missing values are dropped this should display
# an empty frame.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `any(1)` raises TypeError there.
df[df.isnull().any(axis=1)]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [9]:
# Re-inspect the frame: every remaining column should report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 44.6+ KB


In [10]:
# Encode the string-valued Sex column as integers (male -> 0, female -> 1).
# NOTE: Series.map turns values missing from the dict into NaN, so this cell
# must run exactly once on the freshly loaded data.
df['Sex'] = df['Sex'].map(
    {
        'male': 0,
        'female': 1,
    }
)

In [11]:
# Preview the frame to confirm Sex now holds 0/1 codes.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,0,22.0,1,0,7.25
2,1,1,1,38.0,1,0,71.2833
3,1,3,1,26.0,0,0,7.925
4,1,1,1,35.0,1,0,53.1
5,0,3,0,35.0,0,0,8.05


In [12]:
# Confirm all columns are numeric now (no object dtype left) before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    int64  
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 44.6 KB


# Learning

In [13]:
# Target vector: the Survived column cast to the platform-default integer type.
y = df['Survived'].astype(int)

In [14]:
# Check the integer dtype the cast produced (its width depends on the platform).
y.dtype

dtype('int32')

In [15]:
# Feature matrix: every remaining column except the target.
x = df.drop(columns='Survived')

In [16]:
# Preview the feature matrix; Survived must no longer be among the columns.
x.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,0,22.0,1,0,7.25
2,1,1,38.0,1,0,71.2833
3,3,1,26.0,0,0,7.925
4,1,1,35.0,1,0,53.1
5,3,0,35.0,0,0,8.05


In [17]:
# Features and target must have the same number of rows.
x.shape, y.shape

((714, 6), (714,))

In [18]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=6,
)

In [19]:
# Row counts of the training and validation splits.
x_train.shape, x_valid.shape

((571, 6), (143, 6))

In [20]:
# Untuned decision tree; random_state is fixed so repeated runs build the same tree.
tree_model = DecisionTreeClassifier(random_state=6)

In [21]:
# Baseline: 4-fold cross-validation scores of the untuned tree on the training split.
cross_val_score(tree_model, x_train, y_train, cv=4)

array([0.86713287, 0.74125874, 0.74125874, 0.71126761])

In [22]:
# Hyper-parameter grid for the decision tree.
# max_features entries are floats, i.e. fractions of the feature count. The last
# entry must be the float 1.0 ("all features"): an int 1 would instead mean
# "consider exactly one feature per split".
tree_params = {
    'max_depth': range(1, 11),
    'max_features': [.5, .6, .7, .8, .85, .9, .95, 1.0],
}

In [23]:
# Exhaustive search over tree_params with 4-fold cross-validation,
# using all available CPU cores (n_jobs=-1).
tree_grid = GridSearchCV(
    estimator=tree_model,
    param_grid=tree_params,
    cv=4,
    n_jobs=-1,
)

In [24]:
# Run the grid search on the training split only; the validation split stays unseen.
tree_grid.fit(x_train, y_train)

In [25]:
# Cross-validated score achieved by the best parameter combination.
tree_grid.best_score_

0.8056116418792475

In [26]:
# Parameter combination that achieved the best cross-validated score.
tree_grid.best_params_

{'max_depth': 5, 'max_features': 0.5}

In [27]:
# Predict labels for the held-out validation rows with the tuned search object.
tree_valid_res = tree_grid.predict(x_valid)

In [28]:
# Predicted labels (0 / 1), one per validation row.
tree_valid_res

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1])

In [29]:
# True labels of the validation split, for comparison with the predictions.
y_valid

PassengerId
245    0
332    0
521    1
396    0
555    1
      ..
14     0
663    0
766    1
244    0
550    1
Name: Survived, Length: 143, dtype: int32

In [30]:
# Fraction of validation rows whose predicted label matches the true label.
accuracy_score(y_valid, tree_valid_res)

0.8321678321678322