# Importing modules


In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline , make_pipeline

# Importing Data set

In [6]:
df = pd.read_csv('titanic.csv')
df.drop(['Cabin','Name','SibSp','Ticket','PassengerId'],inplace=True,axis = 1)

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked
0,0,3,male,22.0,0,7.25,S
1,1,1,female,38.0,0,71.2833,C
2,1,3,female,26.0,0,7.925,S
3,1,1,female,35.0,0,53.1,S
4,0,3,male,35.0,0,8.05,S


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [9]:
df.isnull().sum()


Survived      0
Pclass        0
Sex           0
Age         177
Parch         0
Fare          0
Embarked      2
dtype: int64

In [10]:
df.value_counts('Embarked')

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

# train_test_split

In [12]:
x_train , x_test , y_train , y_test = train_test_split(df.drop(['Survived'],axis = 1),
                                                       df['Survived'],
                                                       test_size = 0.2, 
                                                       random_state = 42)

In [13]:
x_train.shape,y_train.shape

((712, 6), (712,))

# Preprocessing

In [15]:
# IMPUTATION
tnf1 = ColumnTransformer(transformers=[
    ('impute_age', SimpleImputer(),[2]),
    ('impute_Embarked',SimpleImputer(strategy='most_frequent'),[-1])
],remainder='passthrough')

In [49]:
# ONE HOT ENCODING
numeric_columns = x_train.select_dtypes(include=['int64', 'float64']).columns
tnf2 = ColumnTransformer(transformers=[
    ('num','passthrough',numeric_columns),
    ('ohe_sex_embarked',OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop = 'first'),[2,5])
],remainder='passthrough')

In [51]:
# SCALLING
tnf3 = ColumnTransformer(transformers=[
    ('sacle',MinMaxScaler(),slice(0,8))
],remainder='passthrough')

# Feature scalling

In [54]:
tnf4 = SelectKBest(score_func=chi2 , k=8)

# Classifier (algo)

In [57]:
tnf5 = DecisionTreeClassifier()

# Create Pipeline

In [60]:
pipe = Pipeline([
    ('tnf1',tnf1),
    ('tnf2',tnf2),
    ('tnf3',tnf3),
    ('tnf4',tnf4),
    ('tnf5',tnf5)
])

# Training

In [63]:
pipe.fit(x_train, y_train)

ValueError: Specifying the columns using strings is only supported for dataframes.