In [284]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [252]:
df = pd.read_csv('train.csv')

In [253]:
df.drop(['Name', 'Cabin', 'Ticket', 'PassengerId'], axis = 1, inplace = True)

In [254]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [255]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived', axis = 1), df['Survived'], test_size=0.33, random_state=42)

In [256]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
6,1,male,54.0,0,0,51.8625,S
718,3,male,,0,0,15.5,Q
685,2,male,25.0,1,2,41.5792,C
73,3,male,26.0,1,0,14.4542,C
882,3,female,22.0,0,0,10.5167,S


In [257]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         118
SibSp         0
Parch         0
Fare          0
Embarked      1
dtype: int64

In [258]:
trf1 = ColumnTransformer(transformers=[
    ('mean_age', SimpleImputer(strategy='mean'), [2])
], remainder= 'passthrough', force_int_remainder_cols=False)

In [259]:
trf2 = ColumnTransformer(transformers=[
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder= 'passthrough', force_int_remainder_cols=False)

In [260]:
pipe = Pipeline([
    ('mean_age', trf1),
    ('impute_embarded', trf2),
])

In [261]:
pipe.fit(X_train)

In [262]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         118
SibSp         0
Parch         0
Fare          0
Embarked      1
dtype: int64

In [263]:
X_train = pipe.transform(X_train)

In [264]:
X_train = pd.DataFrame(X_train, columns = df.columns[1:])

In [265]:
X_train.columns = ['Embarked' ,'Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']

In [266]:
X_train

Unnamed: 0,Embarked,Age,Pclass,Sex,SibSp,Parch,Fare
0,S,54.0,1,male,0,0,51.8625
1,Q,29.525983,3,male,0,0,15.5
2,C,25.0,2,male,1,2,41.5792
3,C,26.0,3,male,1,0,14.4542
4,S,22.0,3,female,0,0,10.5167
...,...,...,...,...,...,...,...
591,S,21.0,3,female,0,0,7.65
592,S,29.525983,1,male,0,0,31.0
593,S,41.0,3,male,2,0,14.1083
594,S,14.0,1,female,1,2,120.0


In [267]:
X_train.head()

Unnamed: 0,Embarked,Age,Pclass,Sex,SibSp,Parch,Fare
0,S,54.0,1,male,0,0,51.8625
1,Q,29.525983,3,male,0,0,15.5
2,C,25.0,2,male,1,2,41.5792
3,C,26.0,3,male,1,0,14.4542
4,S,22.0,3,female,0,0,10.5167


In [268]:
trf3 = ColumnTransformer(transformers=[
    ('ohe_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [0]),
    ('ohe_sex', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [3]),
], remainder = 'passthrough', force_int_remainder_cols=False)

In [269]:
pipe2 = Pipeline([
    ('trf3', trf3)
])

In [270]:
xtrain = X_train

In [271]:
pipe2.fit(xtrain)

In [272]:
xtrain = pipe2.transform(xtrain)

In [273]:
xtrain

array([[0.0, 0.0, 1.0, ..., 0, 0, 51.8625],
       [0.0, 1.0, 0.0, ..., 0, 0, 15.5],
       [1.0, 0.0, 0.0, ..., 1, 2, 41.5792],
       ...,
       [0.0, 0.0, 1.0, ..., 2, 0, 14.1083],
       [0.0, 0.0, 1.0, ..., 1, 2, 120.0],
       [0.0, 0.0, 1.0, ..., 0, 1, 77.2875]], dtype=object)

In [274]:
X_train.head()

Unnamed: 0,Embarked,Age,Pclass,Sex,SibSp,Parch,Fare
0,S,54.0,1,male,0,0,51.8625
1,Q,29.525983,3,male,0,0,15.5
2,C,25.0,2,male,1,2,41.5792
3,C,26.0,3,male,1,0,14.4542
4,S,22.0,3,female,0,0,10.5167


In [275]:
X_test = pipe.transform(X_test)

In [276]:
X_test = pd.DataFrame(X_test, columns = ['Embarked' ,'Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare'])

In [277]:
X_test.head()

Unnamed: 0,Embarked,Age,Pclass,Sex,SibSp,Parch,Fare
0,C,29.525983,3,male,1,1,15.2458
1,S,31.0,2,male,0,0,10.5
2,S,20.0,3,male,0,0,7.925
3,S,6.0,2,female,0,1,33.0
4,C,14.0,3,female,1,0,11.2417


In [280]:
pipe2.fit(X_test)

In [281]:
X_test = pipe2.transform(X_test)

In [283]:
X_test

array([[1.0, 0.0, 0.0, ..., 1, 1, 15.2458],
       [0.0, 0.0, 1.0, ..., 0, 0, 10.5],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.925],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 2, 15.7417],
       [1.0, 0.0, 0.0, ..., 0, 2, 15.2458],
       [0.0, 0.0, 1.0, ..., 0, 0, 7.925]], dtype=object)

In [300]:
le = LabelEncoder()

In [304]:
y_train = le.fit_transform(y_train)

  y = column_or_1d(y, warn=True)


In [316]:
y_train.shape

(596,)

In [317]:
y_test.shape

(295,)

In [318]:
dt = DecisionTreeClassifier()

In [319]:
dt.fit(X_train, y_train)

In [320]:
y_pred = dt.predict(X_test)

In [321]:
from sklearn.metrics import accuracy_score

In [322]:
accuracy_score(y_pred, y_test)

0.7491525423728813