## Use case: Survival data for passengers of the Titanic

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder
import warnings
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import pickle
from sklearn.pipeline import Pipeline
%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
# Load the Titanic training set from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [5]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Basic Checks

# EDA

# Data Preprocessing

In [27]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [29]:
df[['Age','Fare']]

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
886,27.0,13.0000
887,19.0,30.0000
888,,23.4500
889,26.0,30.0000


# Function transformer

In [25]:
def divide_by_100(x):
    """Return ``x`` divided by 100.

    Works on anything that supports ``/`` with an int (scalars, NumPy arrays,
    pandas Series/DataFrames); for array-likes the division is element-wise.
    """
    scaled = x / 100
    return scaled

- We can use `FunctionTransformer` to apply our own custom function as a transformation step.
- We will use this custom function when we define the pipeline.
- We can define a function transformer whenever we need one in our pipeline.

In [32]:
# Wrap the custom scaling function so it can be used as a transformer step.
custom = FunctionTransformer(func=divide_by_100)

# Using Modified Label Encoder

In [35]:
class ModifiedLabelEncoder(LabelEncoder):
    """LabelEncoder variant that can be used as a feature transformer.

    ``LabelEncoder`` is designed for target values: its methods take a single
    argument and return a 1-D array. This subclass

    * accepts and ignores any extra positional/keyword arguments (for example
      the target that ``ColumnTransformer`` / ``Pipeline`` pass along), and
    * returns the integer codes reshaped to a column of shape ``(n, 1)``.
    """

    def fit(self, y, *args, **kwargs):
        """Learn the classes present in ``y``; extra arguments are ignored.

        Without this override a call such as ``fit(X, y)`` reaches
        ``LabelEncoder.fit``, which takes only one argument, and raises
        ``TypeError``.

        Returns:
            self, as returned by ``LabelEncoder.fit``.
        """
        return super().fit(y)

    def fit_transform(self, y, *args, **kwargs):
        """Fit on ``y`` and return its integer codes as an ``(n, 1)`` column."""
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode ``y`` with the fitted classes as an ``(n, 1)`` column."""
        return super().transform(y).reshape(-1, 1)

- Use `ColumnTransformer` to transform each set of columns separately; use a pipeline when the same columns need several transformations in sequence.
- `ColumnTransformer` is a scikit-learn class used to create and apply separate transformers for numerical and categorical data.
- To create the transformers, pass a list of tuples, each containing a name, the transformer object, and the columns that transformer should be applied to.

In [38]:
# Column groups: each list names the columns handled by one preprocessing step.
OHE_columns = ['Sex']            # one-hot encoded
standard_scaler = ['Fare']       # standard-scaled
function_trasformer = ['Age']    # sent through the custom function transformer
label_encoder = ['Embarked']     # label-encoded


In [40]:
# Columns forwarded unchanged by the identity transformer.
# NOTE(review): "Survived" looks like the target column - confirm it is meant
# to travel through the feature preprocessor.
passthrough = [
    "Pclass",
    "SibSp",
    "Parch",
    "Survived",
]

In [42]:
def same(x):
    """Identity function: hand the argument back untouched."""
    unchanged = x
    return unchanged

In [44]:
# Identity transformer used for the columns that should pass through unchanged.
no_trans = FunctionTransformer(func=same)

# Defining the preprocessor

In [47]:
# Route each column group to its transformer. Every entry is
# (step name, transformer, columns).
preprocessor = ColumnTransformer(
    transformers=[
        ('OHE_columns', OneHotEncoder(), OHE_columns),
        ('standard_scaler', StandardScaler(), standard_scaler),
        ('custom', custom, function_trasformer),
        # Use the shared column list rather than repeating the literal, so the
        # group definition and the preprocessor cannot drift apart.
        ('Label_encoder', ModifiedLabelEncoder(), label_encoder),
        ('Pass_through', no_trans, passthrough),
    ]
)

In [49]:
preprocessor

# Creating Pickle file

In [52]:
# Serialise the preprocessor to disk. The context manager closes the file even
# if pickle.dump raises, so the handle cannot leak across cells.
# NOTE: pickle stores functions and classes by reference, so divide_by_100,
# same and ModifiedLabelEncoder must be defined/importable wherever Train.pkl
# is loaded.
# NOTE(review): no fit call is visible before this cell, so the object appears
# to be saved unfitted - confirm that is intended.
with open("Train.pkl", "wb") as file:
    pickle.dump(preprocessor, file)

In [58]:
# Reload the preprocessor from disk; the context manager closes the read
# handle, which the bare open() call never did.
# WARNING: pickle.load can execute arbitrary code - only load files you
# created yourself or otherwise trust.
with open("Train.pkl", "rb") as file:
    pre = pickle.load(file)

In [62]:
pre