In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

Import the data


In [2]:
# Read the raw training set from the CSV file in the working directory.
trainData = pd.read_csv(filepath_or_buffer='train.csv')

# Print the column summary first, then show the first five rows as the cell output.
trainData.info()
trainData.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Get the number of missing values per column


In [3]:
# Count the missing values in every column, largest count first.
trainData.isna().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

We will remove the Cabin feature, fill the missing Age values with the mean age, and remove the rows with a missing Embarked value. We will also remove PassengerId, as it is just an artificial id of a passenger, and Name, as it is just a name and cannot help the model. The Ticket column is dropped as well.


In [4]:
# Drop the columns that are not used as features, discard the rows without an
# Embarked value, and take an explicit copy so the resulting frame owns its data.
trainData = (
    trainData
    .drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
    .dropna(subset=['Embarked'])
    .copy()
)

# Fill the missing ages with the mean age of the remaining rows. Assigning the
# result back (instead of calling fillna(inplace=True) on the selected column)
# guarantees that trainData itself is updated, also under pandas Copy-on-Write.
meanAge = trainData['Age'].mean()
trainData['Age'] = trainData['Age'].fillna(meanAge)

In [5]:
# Re-count the missing values per column after cleaning, largest count first.
trainData.isna().sum().sort_values(ascending=False)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

The Cabin feature was successfully removed, there are no missing values left in Age, and the rows with a missing Embarked value were removed. Now we have to apply scaling to the numerical features and encoding to the categorical features.


In [6]:
# Columns that are standardized versus columns that are one-hot encoded.
numericColumns = ['Age', 'SibSp', 'Parch', 'Fare']
categoricalColumns = ['Pclass', 'Sex', 'Embarked']

# drop='first' removes one dummy column per categorical feature;
# handle_unknown='error' makes the encoder raise on categories not seen during fit.
columnTransformer = make_column_transformer(
    (StandardScaler(), numericColumns),
    (OneHotEncoder(handle_unknown='error', drop='first'), categoricalColumns),
)

Then let's create a model and pipeline with the estimator and columnTransformer

In [7]:
# Chain the preprocessing step with a logistic-regression classifier.
pipeline = make_pipeline(columnTransformer, LogisticRegression())

# Keep a direct handle on the classifier, i.e. the final step of the pipeline.
model = pipeline.steps[-1][1]

Now we need to split the training data into the features (X_train) and the target (y_train). Note that no separate test set is held out here.


In [8]:
# Name of the target column the model has to predict.
predicted = 'Survived'

# Features: every column except the target. Target: the 'Survived' column only.
X_train = trainData.drop(columns=[predicted]).copy()
y_train = trainData[predicted].copy()

Fit the pipeline and score our training set

In [9]:
# fit() returns the fitted pipeline, so the accuracy on the training data
# can be computed directly on its result and shown as the cell output.
pipeline.fit(X_train, y_train).score(X_train, y_train)

0.8053993250843644

And we save the pipeline to use in the API

In [10]:
# Persist the fitted pipeline (preprocessing + classifier) to a compressed file.
dump(value=pipeline, filename='pipeline.bin', compress=True)

['pipeline.bin']

We can try to load it and see if the result is still the same

In [11]:
# Read the persisted pipeline back from disk.
pipelineLoaded = load(filename='pipeline.bin')

In [12]:
# Score the reloaded pipeline on the same data to compare with the earlier result.
pipelineLoaded.score(X=X_train, y=y_train)

0.8053993250843644