# Scikit Learn pipelines

# Without using a Pipeline

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Load the Titanic dataset.
# The location can be overridden with the TITANIC_CSV environment variable so the
# notebook is not tied to one machine; the original absolute path is the fallback.
import os

TITANIC_CSV = os.environ.get(
    "TITANIC_CSV",
    "C:\\Users\\Admin\\OneDrive\\Desktop\\DataScience\\DataSets\\titanic.csv",
)
df = pd.read_csv(TITANIC_CSV)

In [18]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Remove the columns that are not used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError for columns that were
# already dropped by a previous run.
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'], errors='ignore')

In [15]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [36]:
# Number of missing Embarked values in the full frame.
df['Embarked'].isna().sum()

np.int64(2)

In [21]:
# Step 1 -> train/test split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split stable.
features = df.drop(columns=['Survived'])
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Dimensions of the training feature frame as (rows, columns).
X_train.shape

(712, 7)

In [20]:
# Missing-value count per training feature column.
X_train.isna().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [25]:
# Count missing values in the target column y (expected: 0).
df['Survived'].isna().sum()

np.int64(0)

In [26]:
# Missing-value count per test feature column.
X_test.isna().sum()

Pclass       0
Sex          0
Age         37
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

# Filling null values using SimpleImputer

In [29]:
# Applying imputation
# Age uses SimpleImputer's default strategy; Embarked uses the most frequent category.
# Both imputers are fitted on the training split only and then reused on the test split.

si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

age_train, age_test = X_train[['Age']], X_test[['Age']]
embarked_train, embarked_test = X_train[['Embarked']], X_test[['Embarked']]

X_train_age = si_age.fit(age_train).transform(age_train)
X_train_embarked = si_embarked.fit(embarked_train).transform(embarked_train)

X_test_age = si_age.transform(age_test)
X_test_embarked = si_embarked.transform(embarked_test)

In [30]:
# Shape of the imputed Age array for the training split.
X_train_age.shape

(712, 1)

In [34]:
# Show the imputed Embarked values produced for the test split.
X_test_embarked

array([['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
      

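## After applying SimpleImputer

Case 1: the output is a NumPy array (most common). For a numeric array, check for remaining missing values as shown below, where `X_imputed` stands for the array returned by the imputer. `np.isnan` only works on numeric arrays; for an object array such as the imputed `Embarked` column, use `pd.isnull` instead (next cell).

```python
import numpy as np

np.isnan(X_imputed).any()
```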

In [40]:
import pandas as pd

# Element-wise missing-value check on the imputed (object dtype) Embarked array.
pd.isna(X_test_embarked).any()

np.False_

In [44]:
# axis=1 means column-wise concatenation: the arrays are joined side by side (horizontally).
imputed_columns = [X_train_age, X_train_embarked]
new_df = np.concatenate(imputed_columns, axis=1)

In [45]:
# Inspect the combined Age/Embarked array built in the previous cell.
new_df

array([[45.5, 'S'],
       [23.0, 'S'],
       [32.0, 'S'],
       ...,
       [41.0, 'S'],
       [14.0, 'S'],
       [21.0, 'S']], dtype=object)

In [47]:
# Display the training features; this frame is the raw split, so Age still contains NaN here.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


# OneHotEncoder

```python
OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False
)
```

handle_unknown='ignore' prevents production failures by safely encoding unseen categories as all zeros instead of raising an error.

# Industry level
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

categorical_cols = ['Embarked', 'Sex']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

X_processed = preprocessor.fit_transform(df)


In [49]:
# One-hot encode Sex and Embarked.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising; sparse_output=False returns dense NumPy arrays.

ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

# Re-derive the imputed Embarked column from the already-fitted imputer rather than
# reading X_train_embarked / X_test_embarked. Those names are overwritten below, so
# using them as input made this cell non-idempotent: a second run would one-hot
# encode the already-encoded 0/1 matrix.
embarked_train_imputed = si_embarked.transform(X_train[['Embarked']])
embarked_test_imputed = si_embarked.transform(X_test[['Embarked']])

X_train_embarked = ohe_embarked.fit_transform(embarked_train_imputed)
X_test_embarked = ohe_embarked.transform(embarked_test_imputed)

In [50]:
# One-hot encoded Sex for the training split (one column per category).
X_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [51]:
# Training columns that need no imputation or encoding (everything except Sex, Age, Embarked).
X_train_rem = X_train.drop(['Sex','Age','Embarked'], axis='columns')

In [52]:
# Test columns that need no imputation or encoding (everything except Sex, Age, Embarked).
X_test_rem = X_test.drop(['Sex','Age','Embarked'], axis='columns')


In [53]:
# Stitch the pieces back together column-wise, in the same order for both splits:
# untouched columns, imputed Age, one-hot Sex, one-hot Embarked.
train_parts = [X_train_rem, X_train_age, X_train_sex, X_train_embarked]
test_parts = [X_test_rem, X_test_age, X_test_sex, X_test_embarked]

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_transformed = np.concatenate(test_parts, axis=1)

In [54]:
# Shape of the final training matrix fed to the classifier.
X_train_transformed.shape

(712, 10)

In [55]:
# Fit a decision tree on the transformed training matrix.
# random_state is pinned because the tree permutes features at each split, so
# without a seed the fitted tree (and the accuracy reported below) can change
# from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed,y_train)

# giving own data to predict

In [66]:
# Print the first three transformed test rows, each followed by a blank line.
for row_idx in (0, 1, 2):
    print(X_test_transformed[row_idx], "\n")

[ 3.          1.          1.         15.2458     29.49884615  0.
  1.          1.          0.          0.        ] 

[ 2.   0.   0.  10.5 31.   0.   1.   0.   0.   1. ] 

[ 3.     0.     0.     7.925 20.     0.     1.     0.     0.     1.   ] 



In [80]:
import numpy as np

# Hand-written, already-encoded feature row. Column order follows X_train_transformed:
# Pclass, SibSp, Parch, Fare, Age, then the one-hot Sex columns, then the one-hot
# Embarked columns.
# NOTE(review): both Sex indicator columns are 1 and two Embarked indicators are 1,
# which the encoders never produce for a single passenger - confirm this is intended.
sample_row = [3.0, 1.0, 2.0, 15.2458, 10.0, 1.0, 1.0, 1.0, 0.0, 1.0]
sample = np.array([sample_row])

prediction = clf.predict(sample)
print(prediction)


[1]


# Predicting the test data

In [56]:
# Predict the label for every row of the transformed test matrix and display the result.
y_pred = clf.predict(X_test_transformed)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1])

In [57]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7932960893854749

In [None]:
"""
This code saves trained machine learning objects (encoders and model)
to disk using Python's pickle module so they can be reused later
without retraining.

pickle.dump(object, file)

1. pickle.dump(ohe_sex, open('models/ohe_sex.pkl', 'wb'))

- Saves the trained OneHotEncoder for the 'sex' column.
- 'models/ohe_sex.pkl' -> file path where it will be stored.
- 'wb' -> write binary mode (required for pickle).
- This stores learned categories (like encoder.categories_).

2. pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl', 'wb'))

- Saves the trained OneHotEncoder for the 'embarked' column.
- Keeps the category mappings learned during fit().
- Important so test/deployment data uses SAME encoding mapping.

3. pickle.dump(clf, open('models/clf.pkl', 'wb'))

- Saves the trained classifier model.
- Stores the fitted model state (for this decision tree, the learned split rules).
- Allows direct prediction later without retraining.

Why this is important in production:

- Prevents retraining every time.
- Ensures consistent preprocessing.
- Enables deployment (Flask, FastAPI, Streamlit, etc.).
- Keeps training-time category mappings fixed.

Later, you load them using:

import pickle

ohe_sex = pickle.load(open('models/ohe_sex.pkl', 'rb'))
ohe_embarked = pickle.load(open('models/ohe_embarked.pkl', 'rb'))
clf = pickle.load(open('models/clf.pkl', 'rb'))

'rb' = read binary mode.

Industry Note:
In production systems, we usually save the entire preprocessing
pipeline + model together using:

pickle.dump(pipeline, open('model.pkl','wb'))

instead of saving encoders separately.
"""


In [82]:
import pickle

In [83]:
# Persist the fitted encoders and the classifier so they can be reloaded without retraining.
import os

# The target directory must exist before open(..., 'wb') can create files in it.
os.makedirs('models', exist_ok=True)

artifacts = {
    'models/ohe_sex.pkl': ohe_sex,
    'models/ohe_embarked.pkl': ohe_embarked,
    'models/clf.pkl': clf,
}

# A with-block guarantees each file handle is flushed and closed, even if
# pickling fails part-way; the bare open(...) calls left the handles open.
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# NOTE(review): the fitted imputers (si_age, si_embarked) are not saved here, so
# inference code that loads these files cannot impute missing values - confirm
# whether they should be persisted as well.

In [96]:
# Select the first column of df by position (all rows, column index 0).
df.iloc[:,0]

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [97]:
# Display df in its current state (after the unused columns were dropped).
df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [201]:
# Simulated user input, one passenger, in feature order:
# Pclass / Sex / Age / SibSp / Parch / Fare / Embarked
# dtype=object keeps the numbers and strings as-is instead of coercing everything to str.
raw_values = [2, 'male', 31.0, 0, 0, 10.5, 'v']
test_input = np.array(raw_values, dtype=object).reshape(1, -1)

In [202]:
# Encode the Sex value (column 1) with the fitted encoder; [1] keeps the (1, 1) 2-D shape.
test_input_sex = ohe_sex.transform(test_input[:, [1]])



In [203]:
# Encode the Embarked value (last column) with the fitted encoder; [-1] keeps the (1, 1) 2-D shape.
test_input_embarked = ohe_embarked.transform(test_input[:, [-1]])

In [204]:
# Inspect the encoded Embarked row for the user input.
test_input_embarked

array([[0., 0., 0.]])

In [205]:
# Pull out the Age value (column 2) as a (1, 1) array so it can be concatenated column-wise.
test_input_age = test_input[:, [2]]

In [206]:
# Assemble the row in the same column order used for training:
# Pclass, SibSp, Parch, Fare (input columns 0, 3, 4, 5), then Age, one-hot Sex, one-hot Embarked.
passthrough_part = test_input[:, [0, 3, 4, 5]]
test_input_transformed = np.concatenate(
    [passthrough_part, test_input_age, test_input_sex, test_input_embarked],
    axis=1,
)

In [207]:
# Shape of the assembled input row; the column count must match the training matrix.
test_input_transformed.shape

(1, 10)

In [None]:
# Predict the label for the hand-built user input row.
clf.predict(test_input_transformed)