In [1]:
import numpy  as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.compose import ColumnTransformer

In [3]:
# Read the train and test CSV files from the local ./titanic directory,
# in that order, into two DataFrames.
train_ds, test_ds = map(pd.read_csv, ('./titanic/train.csv', './titanic/test.csv'))

In [4]:
# Columns excluded from the feature set.
unused_cols = ['Name', 'Ticket', 'PassengerId', 'Cabin']

# This cell rebinds train_ds/test_ds, so running it a second time would
# otherwise raise KeyError because the columns are already gone.
# errors='ignore' makes the drop a no-op for columns that are absent, which
# keeps the cell safe to re-run. (`columns=` already implies the column axis,
# so the redundant `axis=1` is not passed.)
train_ds = train_ds.drop(columns=unused_cols, errors='ignore')
test_ds = test_ds.drop(columns=unused_cols, errors='ignore')

In [5]:
# Split the column names of train_ds by dtype family:
# numeric columns on one side, object/category columns on the other.
numeric_part = train_ds.select_dtypes(include='number')
categorical_part = train_ds.select_dtypes(include=['object', 'category'])

Numeric_cols = list(numeric_part.columns)
categorical_cols = list(categorical_part.columns)


In [6]:
# Report the number of missing values in every numeric column of train_ds.
for col in Numeric_cols:
    missing = train_ds[col].isna().sum()
    print(col, "(Nan count)=>", missing)

Survived (Nan count)=> 0
Pclass (Nan count)=> 0
Age (Nan count)=> 177
SibSp (Nan count)=> 0
Parch (Nan count)=> 0
Fare (Nan count)=> 0


In [7]:
# For every categorical column, report the number of missing values
# followed by the frequency of each category.
for col in categorical_cols:
    column_values = train_ds[col]
    missing = column_values.isna().sum()
    frequencies = column_values.value_counts()
    print(col, "(Nan count)=>", missing, frequencies)

Sex (Nan count)=> 0 Sex
male      577
female    314
Name: count, dtype: int64
Embarked (Nan count)=> 2 Embarked
S    644
C    168
Q     77
Name: count, dtype: int64


## **Train Test Split**

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Separate the label column from the features, then hold out 20% of the rows.
# NOTE: this cell rebinds train_ds without 'Survived', so it cannot be re-run
# without first re-running the cells that load the data.
target = train_ds.loc[:, 'Survived']
train_ds = train_ds.drop('Survived', axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    train_ds,
    target,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [10]:
# Shapes of the four split parts, in the order X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((712, 7), (712,), (179, 7), (179,))

## **Pipeline**

In [11]:
from  sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [12]:
# Print dtype / non-null summaries for the training features and labels.
# .info() prints its report and returns None, so the value displayed for this
# cell is the tuple (None, None).
(
    X_train.info(),
    y_train.info(),
)

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       572 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB
<class 'pandas.core.series.Series'>
Index: 712 entries, 331 to 102
Series name: Survived
Non-Null Count  Dtype
--------------  -----
712 non-null    int64
dtypes: int64(1)
memory usage: 11.1 KB


(None, None)

In [13]:
# Step 1 - imputation.
# Positional indices refer to X_train's column order
# (Pclass, Sex, Age, SibSp, Parch, Fare, Embarked):
#   index 2 = Age      -> filled with the column mean
#   index 6 = Embarked -> filled with the most frequent value
# ColumnTransformer places the transformed columns first and the passthrough
# remainder after them, so the output order becomes
# [Age, Embarked, Pclass, Sex, SibSp, Parch, Fare].
age_step = ('impute_age', SimpleImputer(strategy='mean'), [2])
embarked_step = ('impute_Emberked', SimpleImputer(strategy='most_frequent'), [6])

trf1 = ColumnTransformer(
    transformers=[age_step, embarked_step],
    remainder='passthrough',
)

In [14]:
#sex_OHE
trf2=ColumnTransformer([
    ('sex_encoder',OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1])
],remainder='passthrough')

In [15]:
#sex_OHE
trf3=ColumnTransformer([
    ('Emberked_encoder',OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[6])
],remainder='passthrough')

In [16]:
# Step 4 - min-max scaling of the first ten columns of the incoming array.
# NOTE: no `remainder` is given, so ColumnTransformer's default ('drop')
# applies: any column at index 10 or beyond is discarded by this step.
scaled_columns = slice(0, 10)
trf4 = ColumnTransformer(
    transformers=[('scaler', MinMaxScaler(), scaled_columns)],
)

In [17]:
# Step 5 - univariate feature selection: keep the 7 features with the
# highest chi-squared score against the target.
trf5 = SelectKBest(chi2, k=7)

In [18]:
#model
# random_state is fixed so that repeated runs of the notebook build the same
# tree (the train/test split is already seeded with 42; an unseeded tree
# would still make the reported accuracy vary between runs).
trf6=DecisionTreeClassifier(random_state=42)

# Preprocessing + feature-selection steps only (no estimator at the end).
# NOTE: `pipe` is rebound to the full pipeline, including trf6, by the
# make_pipeline(...) call in the following cell.
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [19]:
# Chain every preprocessing step and the classifier into one estimator;
# make_pipeline generates the step names automatically.
pipeline_steps = (trf1, trf2, trf3, trf4, trf5, trf6)
pipe = make_pipeline(*pipeline_steps)

In [20]:
# Fit the whole pipeline on the training split. X_train is passed as the
# original DataFrame; the transformers select columns by position.
pipe.fit(X=X_train, y=y_train)

In [21]:
from sklearn import set_config
# Ask scikit-learn to render estimators as a diagram when they are displayed
# in the notebook. This only affects displays produced after this cell runs.
set_config(display='diagram')

In [23]:
# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipe.predict(X=X_test)

In [24]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6480446927374302

In [25]:
# (rows, columns) of the held-out feature split.
X_test.shape

(179, 7)

In [26]:
# Preview the first five rows of the feature frame; 'Survived' is no longer
# present because it was dropped when the target was extracted.
train_ds.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [27]:
import pickle
# Persist the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the previous
# bare open(...) left the handle to be closed whenever it was garbage-collected.
# NOTE: only load this file with pickle.load from a trusted location --
# unpickling executes arbitrary code.
with open('pipe_dt.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)