In [108]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [109]:
# Load the raw passenger table from a CSV located relative to the working directory.
df = pd.read_csv('titanic dataset.csv')

In [110]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [111]:
# List the available column names.
df.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [112]:
# (rows, columns) of the raw table.
df.shape


(418, 12)

In [113]:
# Column dtypes: object columns need encoding (or dropping) before modelling.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [114]:
# These columns are not used by any later cell, so remove them up front.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
df = df.drop(columns=unused_columns)

In [115]:
# Show the frame after the column drop (pandas truncates the middle rows in the display).
df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0000,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
413,0,3,male,,0,0,8.0500,S
414,1,1,female,39.0,0,0,108.9000,C
415,0,3,male,38.5,0,0,7.2500,S
416,0,3,male,,0,0,8.0500,S


In [116]:
# Count missing values per column to decide which columns need imputation.
df.isna().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [117]:
# Feature frame: every column except the target.
df_x = df.drop("Survived", axis=1)
df_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [118]:
# Target vector: the Survived column (0/1 integers).
df_y = df["Survived"]
df_y

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [119]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle,
# and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [120]:
# Fill missing Age / Fare values with SimpleImputer (default strategy: column mean).
# Each imputer is fitted on the training split only and then reused on the test
# split, so test rows are filled with the statistic learned from training data
# and no test-set information leaks into preprocessing.
si_age = SimpleImputer()
si_fare = SimpleImputer()
x_train_si_age = si_age.fit_transform(x_train[['Age']])
x_train_si_fare = si_fare.fit_transform(x_train[["Fare"]])

# transform (not fit_transform): apply the already-fitted training statistics.
x_test_si_age = si_age.transform(x_test[['Age']])
x_test_si_fare = si_fare.transform(x_test[["Fare"]])

In [121]:
# Note: the imputers returned new arrays and x_train itself was not modified,
# so this still reports the original missing-value counts of the training split.
x_train.isnull().sum()

Pclass       0
Sex          0
Age         72
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [122]:
#Applying of one hot Encoder
ohe_sex = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
ohe_embarked = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

x_train_ohe_sex = ohe_sex.fit_transform(x_train[['Sex']])
x_train_ohe_embarked = ohe_embarked.fit_transform(x_train[['Embarked']])

x_test_ohe_sex = ohe_sex.fit_transform(x_test[['Sex']])
x_test_ohe_embarked = ohe_embarked.fit_transform(x_test[['Embarked']])


In [123]:
# Age, Fare, Sex and Embarked are replaced by their imputed / encoded arrays;
# the remaining columns are passed through unchanged.
replaced_columns = ["Age", "Fare", "Sex", "Embarked"]
x_train_rem = x_train.drop(columns=replaced_columns)
x_test_rem = x_test.drop(columns=replaced_columns)

In [124]:
# Assemble the final feature matrices column-wise. Both splits use the same
# block order: passthrough columns, imputed Age, imputed Fare, one-hot Sex,
# one-hot Embarked.
train_blocks = (
    x_train_rem,
    x_train_si_age,
    x_train_si_fare,
    x_train_ohe_sex,
    x_train_ohe_embarked,
)
test_blocks = (
    x_test_rem,
    x_test_si_age,
    x_test_si_fare,
    x_test_ohe_sex,
    x_test_ohe_embarked,
)
x_train_transformed = np.concatenate(train_blocks, axis=1)
x_test_transformed = np.concatenate(test_blocks, axis=1)

In [125]:
# Sanity check: (training rows, total feature columns after imputation and encoding).
x_train_transformed.shape

(334, 10)

In [126]:
# Fit a decision tree on the transformed training features.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the predictions below, are reproducible across kernel restarts;
# without it the estimator's internal randomness can yield a different tree per run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train_transformed,y_train)

In [127]:
# Predict the Survived label for every row of the held-out test split.
predicated = clf.predict(x_test_transformed)

In [128]:
# Inspect the raw predicted labels (one 0/1 value per test row).
predicated

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [130]:
# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): a perfect score is suspicious. In the rows displayed earlier,
# Survived is 1 for every female and 0 for every male passenger -- confirm
# whether the labels in this file are derived from Sex before trusting this metric.
accuracy_score(y_test,predicated)

1.0