# Example data cleaning

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

from cleaners import data_types, drop_replace, eliminate_feats, impute
from cleaners.indicators import AddIndicators

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

### Add some bogus columns


In [2]:
rs = np.random.RandomState(0)
sz = len(X)

# add dupe columns

X["float_to_str"] = rs.normal(scale=0.4, size=sz).astype(str)  # should get converted to float
X["Some<Colu,mn,NAme"] = rs.random_sample(size=sz)
X["zero_variance"] = np.ones(sz)
X["all_nans"] = pd.Series(len(X) * [np.nan])
X["correlated_to_age_1"] = 0.5 * X["age"] + rs.normal(scale=0.1, size=sz) + 6
X["correlated_to_age_2"] = X["age"] + rs.normal(scale=0.4, size=sz) - 7
y[pd.Series(y.index).sample(frac=0.05)] = np.nan
X["target"] = y

# add some nans
X.loc[pd.Series(X.index.values).sample(frac=0.1), "fare"] = np.nan

# add dupes
X = X.sample(n=10000, replace=True)
X["ix"] = pd.Series(np.ones(len(X))).cumsum() + 1
X.set_index("ix")
len(X)

10000

## Split dataset

In [3]:
X_train, X_test = train_test_split(X, test_size=0.2)
len(X_train)

8000

### Run like any sklearn transformers

In [4]:
X_train =  drop_replace.DropNa(subset=["target"]).fit_transform(X_train)
len(X_train)

7620

### Set up data clean pipeline

In [5]:
mandatory_columns = ["pclass"]

pipeline_steps = [
    ("pre_drop", drop_replace.DropNamedCol(drop_cols=["name", "ticket", "home.dest"])),
    (
        "fix_types",
        data_types.FixDTypes(verbose=True),
    ),
    ("replace_bad_names", drop_replace.ReplaceBadColnameChars()),
    (
        "drop_mostly_nan",
        eliminate_feats.DropMostlyNaN(
            nan_frac_thresh=0.5,
            mandatory=mandatory_columns,
            skip_if_missing=True,
            sample_rate=0.3,
        ),
    ),

    (
        "drop_uninformative_1",
        eliminate_feats.DropUninformative(mandatory=mandatory_columns, sample_rate=0.3),
    ),
    (
        "impute",
        impute.ImputeByValue(
            cols=["fare"],
            imputer_kwargs=dict(add_indicator=True, strategy="median"),
            sample_rate=0.3,
        ),
    ),
    (
        "indicators",
        AddIndicators(
            values_to_indicate={"boat": ["None", "13 15 B", "C D"], "cabin": ["None"]},
            sample_rate=0.3,
        ),
    ),
    (
        "feature_elim_1",
        eliminate_feats.HighCorrelationElim(
            mandatory=mandatory_columns, sample_rate=0.3
        ),
    ),
]

In [6]:
# run the pipeline
your_pipeline = Pipeline(pipeline_steps)
X_train_tr = your_pipeline.fit_transform(X_train)
X_train_tr

cleaners.cleaner_base.get_sample_df:
The sample dataframe is smaller than 1000 rowsThis may not be large enough to adequately infer info about your data.


Unnamed: 0,fare,missingindicator_fare,pclass,sex,age,sibsp,parch,cabin,embarked,boat,...,boat_C D,cabin_None,embarked_S,embarked_C,embarked_Q,embarked_nan,pclass_2.0,pclass_1.0,pclass_3.0,sex_male
708,7.8542,0.0,3.0,male,24.0,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,1
1085,7.3125,0.0,3.0,male,,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,1
167,90.0000,0.0,1.0,female,35.0,1.0,0.0,C93,S,D,...,0,0,1,0,0,0,0,1,0,0
784,13.9000,0.0,3.0,male,23.0,1.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,1
571,13.0000,0.0,2.0,male,23.0,0.0,0.0,,S,,...,0,1,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1205,7.4958,0.0,3.0,female,18.0,0.0,0.0,,S,16,...,0,1,1,0,0,0,0,0,1,0
117,56.9292,0.0,1.0,female,30.0,0.0,0.0,E36,C,1,...,0,0,0,1,0,0,0,1,0,0
85,71.2833,0.0,1.0,female,38.0,1.0,0.0,C85,C,4,...,0,0,0,1,0,0,0,1,0,0
696,14.1083,1.0,3.0,female,21.0,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,0


In [7]:
assert X_train_tr.dtypes["float_to_str"] == "float64"

### Apply on test dataset

In [8]:
X_test_tr = your_pipeline.transform(X_test)

In [9]:
X_test_tr

Unnamed: 0,fare,missingindicator_fare,pclass,sex,age,sibsp,parch,cabin,embarked,boat,...,boat_C D,cabin_None,embarked_S,embarked_C,embarked_Q,embarked_nan,pclass_2.0,pclass_1.0,pclass_3.0,sex_male
708,7.8542,0.0,3.0,male,24.0,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,1
591,27.7500,0.0,2.0,female,5.0,1.0,2.0,,S,10,...,0,1,1,0,0,0,1,0,0,0
643,31.3875,0.0,3.0,female,5.0,4.0,2.0,,S,15,...,0,1,1,0,0,0,0,0,1,0
12,69.3000,0.0,1.0,female,24.0,0.0,0.0,B35,C,9,...,0,0,0,1,0,0,0,1,0,0
84,14.1083,1.0,1.0,male,39.0,1.0,0.0,C85,C,,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,23.4500,0.0,3.0,male,,1.0,2.0,,S,,...,0,1,1,0,0,0,0,0,1,1
1221,7.8958,0.0,3.0,male,,0.0,0.0,,S,,...,0,1,1,0,0,0,0,0,1,1
1146,29.1250,0.0,3.0,female,39.0,0.0,5.0,,Q,,...,0,1,0,0,1,0,0,0,1,0
97,247.5208,0.0,1.0,female,27.0,1.0,1.0,B58 B60,C,6,...,0,0,0,1,0,0,0,1,0,0


## Also works with dask dataframes

In [10]:
import dask.dataframe as dd

X_test_tr_dd = your_pipeline.transform(dd.from_pandas(X_test, npartitions=3)).compute()

In [11]:
assert (X_test_tr.columns == X_train_tr.columns).all()

In [12]:
pd.testing.assert_frame_equal(X_test_tr_dd.sort_index(), X_test_tr.sort_index())