In [None]:
import pandas as pd
import numpy as np

from cleaning_pipeline import *

In [None]:
# Read the raw Telco customer-churn CSV into a DataFrame; the path is relative
# to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
# First cleaning pass, rebuilt step by step from the raw `df` so the cell can
# be re-run safely. The helpers come from `cleaning_pipeline`; their bodies are
# not visible here, so the notes below follow their names — TODO confirm.
cleaned_df = df.pipe(start_pipeline)
# Remove the customer identifier column.
cleaned_df = cleaned_df.pipe(drop_noisy_columns, cols=["customerID"])
# Turn empty-string cells into NaN so they are counted as missing values.
cleaned_df = cleaned_df.pipe(replace_empty_strings_with_nan)

In [None]:
# Share of missing values per column, relative to the row count of the raw frame.
cleaned_df.isna().sum().div(len(df))

The fraction of missing values in the `TotalCharges` column is below 0.01 (less than 1% of the rows), so we can drop the rows with missing values without losing much data.

In [None]:
# Drop the rows that still contain missing values
# (helper from `cleaning_pipeline`; behaviour inferred from its name — TODO confirm).
cleaned_df = cleaned_df.pipe(drop_missing_values)

In [None]:
# Preview the first five rows of the cleaned frame.
cleaned_df.head(n=5)

Now we can convert each column to the appropriate `dtype`.

In [None]:
# Cast columns to their intended dtypes.
# The builtin `float` replaces `np.float`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# It was only ever an alias for the builtin, so the resulting dtype is unchanged.
cleaned_df = cleaned_df.pipe(
    convert_column_dtypes,
    dtypes_mapping={"SeniorCitizen": "str", "TotalCharges": float},
)

In [None]:
# Inspect the dtype of every column after the conversion step.
cleaned_df.dtypes

Putting all of the steps together, our final cleaning and processing pipeline is:

In [None]:
# Complete cleaning pipeline, starting from the raw `df` so the cell is
# self-contained and can be re-run on its own.
# The builtin `float` replaces `np.float`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
cleaned_df = (
    df.pipe(start_pipeline)
    .pipe(drop_noisy_columns, cols=["customerID"])
    .pipe(replace_empty_strings_with_nan)
    .pipe(drop_missing_values)
    .pipe(
        convert_column_dtypes,
        dtypes_mapping={"SeniorCitizen": "str", "TotalCharges": float},
    )
)