<a href="https://colab.research.google.com/github/PabloJRW/titanic-classifier/blob/main/notebooks/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from joblib import dump

In [2]:
# Column groups consumed by the preprocessing ColumnTransformer.

# Numeric column(s): routed to the numeric pipeline.
num_features = [
    'Fare',
]

# Categorical columns: routed to the categorical pipeline.
cat_features = [
    'Pclass',
    'Sex',
    'SibSp',
    'Embarked',
]

In [3]:
# Importing regular expression library
import re

def extract_title(series):
    """
    Extract the social title from each passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the
    title is the text between the comma and the first following period,
    e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"`` and
    ``"Rothes, the Countess. of (...)"`` -> ``"the Countess"``.

    Parameters
    ----------
    series : array-like of str
        Passenger names. Missing values are treated as empty strings.

    Returns
    -------
    pandas.Series
        One title per input element (1-D, same index as the input when a
        Series is given). Elements without a recognisable title yield ``""``.
    """
    # Capture everything between ", " and the first "." so that the trailing
    # period is dropped and multi-word titles are kept whole.
    pattern = re.compile(r",\s*([^,.]+?)\s*\.")
    names = pd.Series(series).fillna("")

    def _title(name):
        # str() guards against non-string entries; search runs once per name.
        match = pattern.search(str(name))
        return match.group(1) if match else ""

    return names.apply(_title)

In [4]:
# Numerical transformer pipeline (only 'Fare').
# Steps, in order:
#   1. 'imputer'       - replace missing values with the median learned at fit
#                        time, so NaNs are not carried through the later steps
#                        into the output matrix;
#   2. 'log_transform' - apply log(1 + x);
#   3. 'scaler'        - standardise to zero mean and unit variance.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
])

# Categorical transformer pipeline: fill missing entries with each column's
# most frequent value, then one-hot encode the result. Categories that were
# not seen during fit are ignored at transform time rather than raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Social-title transformer pipeline: derive the title from a name with
# `extract_title`, then one-hot encode it.
# NOTE(review): `extract_title` returns a 1-D Series, while OneHotEncoder
# presumably needs 2-D input - confirm the shapes before wiring this
# pipeline into a ColumnTransformer.
socialt_transformer = Pipeline([
    ('extractor', FunctionTransformer(extract_title)),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Final preprocessing step: send each column group to its own transformer.
# Columns that belong to no group are dropped from the output.
# NOTE: the key 'num_transfomer' (sic) is kept exactly as written, since it is
# the name under which the numeric branch is registered.
preprocessing_pipeline = ColumnTransformer(
    [
        ('num_transfomer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
        # Title branch, currently disabled:
        # ('extract', socialt_transformer, 'Name'),
    ],
    remainder='drop',
)


In [5]:
# Load the training set as a DataFrame directly from the project's GitHub
# repository. The URL reads from the `main` branch (not a pinned commit) and
# the cell needs network access to run.
url_train = "https://github.com/PabloJRW/titanic-classifier/raw/main/train.csv"
df_train = pd.read_csv(url_train)

In [6]:
# Fit every transformer on the training data and transform it in one call.
# Despite the `df_` prefix, the recorded cell output shows the result is a
# NumPy array rather than a DataFrame.
df_train_transformed = preprocessing_pipeline.fit_transform(df_train)

In [7]:
# Show the transformed training matrix (the cell's last expression is displayed).
df_train_transformed


array([[-0.87974057,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.36121993,  1.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.79853997,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.24200664,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.48708246,  1.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.81898658,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [8]:
# Persist the preprocessing pipeline with joblib. The relative path means the
# file is written to the current working directory; `dump` returns the list of
# written file names, which is what the cell displays.
dump(preprocessing_pipeline, 'preprocessing_pipeline.joblib')

['preprocessing_pipeline.joblib']