<a href="https://colab.research.google.com/github/PabloJRW/titanic-classifier/blob/main/notebooks/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from joblib import dump

In [2]:
def replace_values(series):
    series[series >= 3] = 2
    return series

In [3]:
# Importing regular expression library
import re

def extract_title(series):
    """
    Extract the social title from name.
    E.g Mr, Mrs, Miss
    """
    pattern = r",\s(.+?)\s"
    df_col = pd.Series(series).fillna("")
    titles = df_col.apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else "")
    return titles

In [10]:
# numerical transformer pipeline (only 'fare')
num_transformer = Pipeline(
    steps=[
        ('log_transform', FunctionTransformer(np.log1p)),
        ('scaler', StandardScaler( ))
    ]
)

# categorical transformer pipeline
cat_transformer = Pipeline(
    steps=[ 
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]
)

mapper = Pipeline(
    steps=[
        ('mapper', FunctionTransformer(replace_values)),
    ]
)

# 
socialt_transformer = Pipeline(
    steps=[
        ('extractor', FunctionTransformer(extract_title)),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Final pipeline
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('extract', socialt_transformer, 3),
        #('num_transfomer', num_transformer, [9]),
        ('cat_transformer', cat_transformer, [2, 4, 11]),
        ('mapper_transformer', mapper, [6, 7])
    ],remainder='drop', verbose=True
)


In [11]:
# importing the training set
url_train = "https://raw.githubusercontent.com/PabloJRW/titanic-classifier/main/datasets/raw/train.csv"
df_train = pd.read_csv(url_train)
df_train = np.array(df_train)

In [12]:
df_train_transformed = preprocessing_pipeline.fit_transform(df_train)

ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.

In [None]:
df_train_transformed


In [None]:
# Saving the pipeline using joblib
dump(preprocessing_pipeline, 'preprocessing_pipeline.joblib')