In [50]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, FunctionTransformer, Normalizer
from sklearn.linear_model import LogisticRegression

In [51]:
df = pd.read_csv('train.csv')
dfout = df['Survived']
df = df.drop('Survived', axis=1)

# column transformer planning

to drop: 
- passengerId
- Survived
- Name
- ticket

One hot encoding:
- sex
- embarked

convert NaN to 0:
- Cabin

Normalize:
- Age
- Fare

In [52]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
def NaNRemover(X):
    if isinstance(X, pd.DataFrame):
        return X['Cabin'].fillna('0')
    else:
        return np.where(pd.isnull(X), '0', X)

# Construct the column trasnformer

In [54]:
def NaNFiller(X):
    if isinstance(X, pd.DataFrame):
        X['Age'] = X['Age'].fillna(X['Age'].mean())
    elif isinstance(X, np.ndarray):
        # Convert to DataFrame to use pandas fillna
        df = pd.DataFrame(X, columns=['Age'] if X.shape[1] == 1 else None)
        df['Age'] = df['Age'].fillna(df['Age'].mean())
        X = df.to_numpy()
    return X

def NaNRemover(X):
    X = X.map(lambda x: 0 if isinstance(x, float) else 1)
    return X
    # if isinstance(X, pd.DataFrame):
    #     return X.fillna('0')
    # else:
    #     return np.where(pd.isnull(X), '0', X)
    
# def binaryconverter(X: pd.DataFrame):
#     X = X.map(lambda x: 0 if x == '0' else 1)
#     return X

ct = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), ['Sex', 'Embarked']),
        ('nrem', FunctionTransformer(NaNRemover), ['Cabin']),
        ('nmean', FunctionTransformer(NaNFiller), ['Age']),
        ('minmax', MinMaxScaler(feature_range=(1,2)), ['Age']),
    ],
    remainder='drop'
)

In [61]:
clt = ct.fit_transform(df)


array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
        22.        ,  1.27117366],
       [ 1.        ,  0.        ,  1.        , ...,  1.        ,
        38.        ,  1.4722292 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        26.        ,  1.32143755],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        29.69911765,         nan],
       [ 0.        ,  1.        ,  1.        , ...,  1.        ,
        26.        ,  1.32143755],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        32.        ,  1.39683338]])

In [62]:
pd.DataFrame(clt)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,22.000000,1.271174
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,38.000000,1.472229
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,26.000000,1.321438
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,35.000000,1.434531
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,35.000000,1.434531
...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,0.0,0.0,27.000000,1.334004
887,1.0,0.0,0.0,0.0,1.0,0.0,1.0,19.000000,1.233476
888,1.0,0.0,0.0,0.0,1.0,0.0,0.0,29.699118,
889,0.0,1.0,1.0,0.0,0.0,0.0,1.0,26.000000,1.321438


In [58]:
pipe = Pipeline([
    ('preprocessing', ct)
])

In [59]:
trained_pipe = pipe.fit(df, dfout)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                                  ['Sex', 'Embarked']),
                                                 ('nrem',
                                                  FunctionTransformer(func=<function NaNRemover at 0x00000250D1535480>),
                                                  ['Cabin']),
                                                 ('nmean',
                                                  FunctionTransformer(func=<function NaNFiller at 0x00000250D1535CF0>),
                                                  ['Age']),
                                                 ('minmax',
                                                  MinMaxScaler(feature_range=(1,
                                                                              2)),
                                                  ['Age'])]))])


In [None]:
# from sklearn.model_selection import train_test_split

# xtrain, xtest, ytrain, ytest = train_test_split()