# Example data cleaning

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

from cleaners import data_types, drop_replace, eliminate_feats, impute
from cleaners.indicators import AddIndicators

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

### Add some bogus columns


In [2]:
rs = np.random.RandomState(0)
sz = len(X)

# add dupe columns

X["float_to_str"] = rs.normal(scale=0.4, size=sz).astype(str)  # should get converted to float
X["Some<Colu,mn,NAme"] = rs.random_sample(size=sz)
X["zero_variance"] = np.ones(sz)
X["all_nans"] = pd.Series(len(X) * [np.nan])
X["correlated_to_age_1"] = 0.5 * X["age"] + rs.normal(scale=0.1, size=sz) + 6
X["correlated_to_age_2"] = X["age"] + rs.normal(scale=0.4, size=sz) - 7
y[pd.Series(y.index).sample(frac=0.05)] = np.nan
X["target"] = y

# add some nans
X.loc[pd.Series(X.index.values).sample(frac=0.1), "fare"] = np.nan

# add dupes
X = X.sample(n=10000, replace=True)
X["ix"] = pd.Series(np.ones(len(X))).cumsum() + 1
X.set_index("ix")
len(X)

10000

## Split dataset

In [3]:
X_train, X_test = train_test_split(X, test_size=0.2)
len(X_train)

8000

In [4]:
X.columns

Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
       'cabin', 'embarked', 'boat', 'body', 'home.dest', 'float_to_str',
       'Some<Colu,mn,NAme', 'zero_variance', 'all_nans', 'correlated_to_age_1',
       'correlated_to_age_2', 'target', 'ix'],
      dtype='object')

### Run like any sklearn transformers

In [5]:
X_train =  drop_replace.DropNa(subset=["target"]).fit_transform(X_train)
len(X_train)

7639

### Set up data clean pipeline

In [6]:
mandatory_columns = ["pclass"]

pipeline_steps = [
    ("pre_drop", drop_replace.DropNamedCol(drop_cols=["name", "ticket", "home.dest"])),
    (
        "fix_types",
        data_types.FixDTypes(verbose=True),
    ),
    ("replace_bad_names", drop_replace.ReplaceBadColnameChars()),
    (
        "drop_mostly_nan",
        eliminate_feats.DropMostlyNaN(
            nan_frac_thresh=0.5,
            mandatory=mandatory_columns,
            skip_if_missing=True,
            sample_rate=0.3,
        ),
    ),

    (
        "drop_uninformative_1",
        eliminate_feats.DropUninformative(mandatory=mandatory_columns, sample_rate=0.3),
    ),
    (
        "impute",
        impute.ImputeByValue(
            cols=["fare"],
            imputer_kwargs=dict(add_indicator=True, strategy="median"),
            sample_rate=0.3,
        ),
    ),
    (
        "indicators",
        AddIndicators(
            values_to_indicate={"boat": ["None", "13 15 B", "C D"], "cabin": ["None"]},
            sample_rate=0.3,
        ),
    ),
    (
        "feature_elim_1",
        eliminate_feats.HighCorrelationElim(
            mandatory=mandatory_columns, sample_rate=0.3
        ),
    ),
]

In [7]:
# run the pipeline
your_pipeline = Pipeline(pipeline_steps)
X_train_tr = your_pipeline.fit_transform(X_train)
X_train_tr

cleaners.cleaner_base.get_sample_df:
The sample dataframe is smaller than 1000 rowsThis may not be large enough to adequately infer info about your data.


Unnamed: 0,fare,missingindicator_fare,pclass,sex,age,sibsp,parch,cabin,embarked,boat,...,boat_C D,cabin_None,embarked_S,embarked_C,embarked_Q,embarked_nan,pclass_3.0,pclass_2.0,pclass_1.0,sex_female
72,136.7792,0.0,1.0,female,26.0,1.0,0.0,C89,C,4,...,0,0,0,1,0,0,0,0,1,1
1148,7.1250,0.0,3.0,male,35.0,0.0,0.0,,S,,...,0,1,1,0,0,0,1,0,0,0
879,7.8958,0.0,3.0,male,,0.0,0.0,,S,,...,0,1,1,0,0,0,1,0,0,0
336,26.0000,0.0,2.0,male,32.0,1.0,0.0,,S,13,...,0,1,1,0,0,0,0,1,0,0
849,13.8583,1.0,3.0,male,26.0,1.0,0.0,,S,,...,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,135.6333,0.0,1.0,female,55.0,0.0,0.0,C32,C,8,...,0,0,0,1,0,0,0,0,1,1
134,89.1042,0.0,1.0,female,,1.0,0.0,C92,C,5,...,0,0,0,1,0,0,0,0,1,1
154,93.5000,0.0,1.0,male,55.0,1.0,1.0,B69,S,,...,0,0,1,0,0,0,0,0,1,0
869,22.5250,0.0,3.0,male,28.0,0.0,0.0,,S,,...,0,1,1,0,0,0,1,0,0,0


In [8]:
assert X_train_tr.dtypes["float_to_str"] == "float64"

### Apply on test dataset

In [9]:
X_test_tr = your_pipeline.transform(X_test)

In [10]:
X_test_tr

Unnamed: 0,fare,missingindicator_fare,pclass,sex,age,sibsp,parch,cabin,embarked,boat,...,boat_C D,cabin_None,embarked_S,embarked_C,embarked_Q,embarked_nan,pclass_3.0,pclass_2.0,pclass_1.0,sex_female
203,82.1708,0.0,1.0,male,28.0000,1.0,0.0,,C,,...,0,1,0,1,0,0,0,0,1,0
661,8.0500,0.0,3.0,female,18.0000,0.0,0.0,,S,C,...,0,1,1,0,0,0,1,0,0,1
778,12.4750,0.0,3.0,female,30.0000,0.0,0.0,,S,13,...,0,1,1,0,0,0,1,0,0,1
200,75.2417,0.0,1.0,male,46.0000,0.0,0.0,C6,C,,...,0,0,0,1,0,0,0,0,1,0
830,46.9000,0.0,3.0,male,14.0000,5.0,2.0,,S,,...,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,42.4000,0.0,1.0,male,,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,0
91,57.0000,0.0,1.0,male,31.0000,1.0,0.0,B20,S,3,...,0,0,1,0,0,0,0,0,1,0
548,18.7500,0.0,2.0,male,0.8333,1.0,1.0,,S,4,...,0,1,1,0,0,0,0,1,0,0
313,211.5000,0.0,1.0,male,27.0000,0.0,2.0,C82,C,,...,0,0,0,1,0,0,0,0,1,0


In [11]:
X_test_tr.columns.tolist()

['fare',
 'missingindicator_fare',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'cabin',
 'embarked',
 'boat',
 'float_to_str',
 'SomeColumnNAme',
 'target',
 'ix',
 'boat_None',
 'boat_13 15 B',
 'boat_C D',
 'cabin_None',
 'embarked_S',
 'embarked_C',
 'embarked_Q',
 'embarked_nan',
 'pclass_3.0',
 'pclass_2.0',
 'pclass_1.0',
 'sex_female']

### Objects are picklable for future use

In [12]:
import pickle
import os

os.makedirs("tmp", exist_ok=True)


with open("./tmp/output.pkl", "wb") as f:
    pickle.dump(your_pipeline, f)

In [13]:
with open("./tmp/output.pkl", "rb") as f:
    obj = pickle.load(f)
obj

## Also works with dask dataframes

In [14]:
import dask.dataframe as dd

X_test_tr_dd = obj.transform(dd.from_pandas(X_test, npartitions=3)).compute()

In [15]:
assert (X_test_tr.columns == X_train_tr.columns).all()

In [16]:
pd.testing.assert_frame_equal(X_test_tr_dd.sort_index(), X_test_tr.sort_index())

In [17]:
import shutil

shutil.rmtree("./tmp")