In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [None]:
# Attach Google Drive to the Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read only the label column and the two numeric features needed below.
TRAIN_CSV = '/content/drive/MyDrive/Dataset/EDA/train.csv'
SELECTED_COLS = ['Age', 'Fare', 'Survived']
df = pd.read_csv(TRAIN_CSV, usecols=SELECTED_COLS)

In [None]:
# Display the freshly loaded frame for a quick visual check.
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
886,0,27.0,13.0000
887,1,19.0,30.0000
888,0,,23.4500
889,1,26.0,30.0000


In [None]:
# Remove every row that has at least one missing value; df itself is modified.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Display the frame again to confirm the rows with missing values are gone.
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
885,0,39.0,29.1250
886,0,27.0,13.0000
887,1,19.0,30.0000
889,1,26.0,30.0000


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
features = df.drop(columns='Survived')
target = df['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Baseline: a decision tree with default settings, fitted on the untransformed
# training features.
# random_state is pinned because the tree randomly permutes features at each
# split; when several splits are equally good the chosen one (and therefore the
# scores reported below) could otherwise change between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)


In [None]:
# Accuracy on the rows the tree was fitted on vs. accuracy on the held-out rows.
train_score = dt.score(X=x_train, y=y_train)
test_score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Training accuracy of the baseline tree.
train_score

0.9754816112084063

In [None]:
# Held-out accuracy of the baseline tree.
test_score

0.6223776223776224

In [None]:
# 10-fold cross-validated accuracy on the training split, averaged over the folds.
cv_scores = cross_val_score(dt, x_train, y_train, cv=10, scoring='accuracy')
cv_scores.mean()

0.6235329703569269

In [None]:
# Two independent discretizers sharing one configuration:
# 15 quantile-based bins, encoded as ordinal bin indices.
kbin_settings = dict(n_bins=15, encode='ordinal', strategy='quantile')
kbin_age = KBinsDiscretizer(**kbin_settings)
kbin_fare = KBinsDiscretizer(**kbin_settings)

In [None]:
# Route each feature to its own discretizer.
# Columns are selected by name rather than by position so the age and fare
# discretizers cannot be silently swapped if the column order of the input
# frame changes (read_csv's usecols does not control column order).
# Inputs must therefore be DataFrames carrying 'Age' and 'Fare' columns.
trf = ColumnTransformer([
    ('first', kbin_age, ['Age']),
    ('second', kbin_fare, ['Fare']),
])

In [None]:
# Fit the transformer on the training rows only, then apply it to the test rows.
# (Tuple elements are evaluated left to right, so the fit happens first.)
x_train_trf, x_test_trf = trf.fit_transform(x_train), trf.transform(x_test)

In [None]:
# A second decision tree, this time trained on the discretized features.
# random_state is pinned so tie-breaking between equally good splits is
# deterministic and the reported scores are reproducible on re-run.
trf_dt = DecisionTreeClassifier(random_state=42)
trf_dt.fit(x_train_trf, y_train)

In [None]:
# Predict labels for the held-out rows from their transformed features.
y_pred_trf = trf_dt.predict(X=x_test_trf)

In [None]:
# Training vs. held-out accuracy of the tree fitted on the transformed features.
train_score_trf = trf_dt.score(X=x_train_trf, y=y_train)
test_score_trf = accuracy_score(y_true=y_test, y_pred=y_pred_trf)

In [None]:
# Training accuracy of the tree fitted on the transformed features.
train_score_trf

0.8143607705779334

In [None]:
# Held-out accuracy of the tree fitted on the transformed features.
test_score_trf

0.6363636363636364

# Binarization

In [None]:
 # Reload the data with the extra SibSp/Parch columns for the binarization
 # experiment. Rows with missing values are dropped here, as in the first
 # experiment: Age contains NaNs, and leaving them in makes the later tree fit
 # depend on native NaN support in the installed scikit-learn version.
 df_b = (
     pd.read_csv('/content/drive/MyDrive/Dataset/EDA/train.csv')[['Age','Fare','SibSp','Parch','Survived']]
     .dropna()
 )

In [None]:
# Display the frame loaded for the binarization experiment.
df_b

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.2500,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.9250,0,0,1
3,35.0,53.1000,1,0,1
4,35.0,8.0500,0,0,0
...,...,...,...,...,...
886,27.0,13.0000,0,0,0
887,19.0,30.0000,0,0,1
888,,23.4500,1,2,0
889,26.0,30.0000,0,0,1


In [None]:
# Add a 'family' column holding the element-wise sum of SibSp and Parch.
df_b['family'] = df_b['SibSp'].add(df_b['Parch'])

In [None]:
# Display the frame again to check the new 'family' column.
df_b

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,family
0,22.0,7.2500,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.9250,0,0,1,0
3,35.0,53.1000,1,0,1,1
4,35.0,8.0500,0,0,0,0
...,...,...,...,...,...,...
886,27.0,13.0000,0,0,0,0
887,19.0,30.0000,0,0,1,0
888,,23.4500,1,2,0,3
889,26.0,30.0000,0,0,1,0


In [None]:
# SibSp and Parch have been summed into 'family'; remove them from df_b in place.
summed_cols = ['SibSp', 'Parch']
df_b.drop(columns=summed_cols, inplace=True)

In [None]:
# Separate the features from the label, then hold out 20% of the rows (fixed seed).
X, y = df_b.drop(columns=['Survived']), df_b['Survived']
splits = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = splits
x_train.head()

Unnamed: 0,Age,Fare,family
331,45.5,28.5,0
733,23.0,13.0,0
382,32.0,7.925,0
704,26.0,7.8542,1
813,6.0,31.275,6


In [None]:
 from sklearn.preprocessing import Binarizer

In [None]:
# Binarize only the 'family' column and pass every other column through unchanged.
# No threshold is given, so Binarizer's default threshold applies.
family_binarizer = Binarizer(copy=False)
trf_b = ColumnTransformer(
    transformers=[('bin', family_binarizer, ['family'])],
    remainder='passthrough',
)


In [None]:
# Fit a fresh decision tree on the training features after the 'family'
# column has been binarized (this rebinds the trf_dt name used earlier).
# random_state is pinned so tie-breaking between equally good splits is
# deterministic and the reported scores are reproducible on re-run.
trf_dt = DecisionTreeClassifier(random_state=42)
trf_dt.fit(trf_b.fit_transform(x_train), y_train)

In [None]:
# Materialize the transformed train/test matrices used for scoring below.
# (Tuple elements are evaluated left to right, so the fit happens first.)
x_train_trf, x_test_trf = trf_b.fit_transform(x_train), trf_b.transform(x_test)

In [None]:
# Predict on the held-out rows, then compute training and held-out accuracy.
y_pred_trf_b = trf_dt.predict(X=x_test_trf)
train_score_trf_b = trf_dt.score(X=x_train_trf, y=y_train)
test_score_trf_b = accuracy_score(y_true=y_test, y_pred=y_pred_trf_b)

In [None]:
# Training accuracy of the tree fitted on the binarized features.
train_score_trf_b

0.9592696629213483

In [None]:
# Held-out accuracy of the tree fitted on the binarized features.
test_score_trf_b

0.6312849162011173