In [53]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [26]:
df = pd.read_csv('train.csv')

In [27]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [29]:
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [30]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [31]:
x = df.drop(columns=['Survived'])
y = df['Survived']

In [32]:
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [33]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=45)

In [34]:
x_train.shape

(712, 7)

In [35]:
x_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [36]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 332 to 414
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       568 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB


In [37]:
# Identify categorical and numerical columns
numerical_features = ['Age', 'Fare','Parch','SibSp']
categorical_features = ['Sex', 'Embarked', 'Pclass']

# Define preprocessing steps for numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing steps for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [38]:
# Model Selection
trf5 =DecisionTreeClassifier()

In [39]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),

                        ('classifier', trf5)])

In [40]:
pipeline

In [41]:
pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   ['Age', 'Fare', 'Parch', 'SibSp']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('onehot',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['Sex', 'Embarked', 'Pclass'])])),
 ('classifier', DecisionTreeClassifier())]

In [42]:
pipeline.fit(x_train,y_train)

In [43]:
y_pred=pipeline.predict(x_test)

In [44]:
y_pred

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0], dtype=int64)

In [45]:
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision Score",precision_score(y_test,y_pred))

Accuracy 0.7541899441340782
Precision Score 0.618421052631579


In [46]:
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))

Confusion Matrix:
 [[88 29]
 [15 47]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.75      0.80       117
           1       0.62      0.76      0.68        62

    accuracy                           0.75       179
   macro avg       0.74      0.76      0.74       179
weighted avg       0.77      0.75      0.76       179



In [49]:
#Cross Validation using Pipe Line
scores  = cross_val_score(pipeline,x_train,y_train,cv=10,scoring='accuracy')
for i in range(len(scores)):
    print("Fold",i+1,":",scores[i])
print("Maximum Accuracy: ", max(scores))

Fold 1 : 0.7777777777777778
Fold 2 : 0.75
Fold 3 : 0.7887323943661971
Fold 4 : 0.7746478873239436
Fold 5 : 0.7464788732394366
Fold 6 : 0.7605633802816901
Fold 7 : 0.7605633802816901
Fold 8 : 0.7605633802816901
Fold 9 : 0.7323943661971831
Fold 10 : 0.704225352112676
Maximum Accuracy:  0.7887323943661971


In [51]:
scores  = cross_val_score(pipeline,x_test,y_pred,cv=10,scoring='accuracy')
for i in range(len(scores)):
    print("Fold",i+1,":",scores[i])
print("Maximum Accuracy: ", max(scores))

Fold 1 : 0.6111111111111112
Fold 2 : 0.6666666666666666
Fold 3 : 0.6666666666666666
Fold 4 : 0.7222222222222222
Fold 5 : 0.7777777777777778
Fold 6 : 0.8888888888888888
Fold 7 : 0.6111111111111112
Fold 8 : 0.6666666666666666
Fold 9 : 0.6666666666666666
Fold 10 : 0.5294117647058824
Maximum Accuracy:  0.8888888888888888


In [52]:
params = {
    'classifier__max_depth': [1, 2, 3, 4, 5, None],
    'classifier__criterion':['gini','entropy']
}

In [54]:
# Initialize GridSearchCV
grid = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='accuracy')

# Fit on training data
grid.fit(x_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid.best_params_)
print("Best Accuracy:", grid.best_score_)

Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 3}
Best Accuracy: 0.8117206736925047


In [None]:
#Exporting Model in Pickle

In [55]:
import pickle

In [56]:
pickle.dump(pipeline,open('pipe.pkl','wb'))

In [57]:
pickle.dump(grid.best_estimator_, open('pipe_best.pkl', 'wb'))
print("Best pipeline saved as pipe.pkl")

Best pipeline saved as pipe.pkl
