In [1]:
import pandas as pd
import numpy as np



import matplotlib.pyplot as plt
import seaborn as  sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [57]:
df = pd.read_csv('titanic.csv', usecols=['Age','Fare','Survived'])

In [58]:
df.isnull().sum()


Survived      0
Age         177
Fare          0
dtype: int64

In [59]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


In [61]:
x=df.iloc[:,1:]
y=df.iloc[:,0]
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=42)

In [62]:
xtrain.head(2)

Unnamed: 0,Age,Fare
331,45.5,28.5
733,23.0,13.0


In [63]:
clf = DecisionTreeClassifier()

In [64]:
clf.fit(xtrain,ytrain)
ypred = clf.predict(xtest)

In [65]:
accuracy_score(ytest, ypred)

0.6536312849162011

In [66]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring='accuracy'))

np.float64(0.6565917602996254)

In [69]:
kbin_age = KBinsDiscretizer(n_bins=5,encode='ordinal',strategy='kmeans')
kbin_fare = KBinsDiscretizer(n_bins=5,encode='ordinal',strategy='kmeans')

In [68]:
trf = ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [71]:
xtrain_trf = trf.fit_transform(xtrain)
xtest_trf = trf.transform(xtest)

In [72]:
trf.named_transformers_['first'].n_bins_

array([5])

In [73]:
output = pd.DataFrame({
    'age' : xtrain['Age'],
    'age_trf': xtrain_trf[:,0],
    'fare': xtrain['Fare'],
    'fare_trf' : xtrain_trf[:,1]
})

In [75]:
output['age_labels'] = pd.cut(x=xtrain['Age'], bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=xtrain['Fare'], bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [76]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
62,45.0,3.0,83.475,1.0,"(42.321, 55.183]","(40.404, 99.496]"
237,8.0,0.0,26.25,0.0,"(0.42, 16.772]","(0.0, 40.404]"
134,25.0,1.0,13.0,0.0,"(16.772, 31.154]","(0.0, 40.404]"
795,39.0,2.0,13.0,0.0,"(31.154, 42.321]","(0.0, 40.404]"
474,22.0,1.0,9.8375,0.0,"(16.772, 31.154]","(0.0, 40.404]"


In [77]:
clf = DecisionTreeClassifier()
clf.fit(xtrain_trf,ytrain)
ypred2 = clf.predict(xtest_trf)

In [78]:
accuracy_score(ytest,ypred2)

0.6424581005586593