In [51]:
from __future__ import annotations

import numpy as np
import pandas as pd

from typing import Dict, Any, List

In [52]:
class DataCleaner:
    """
    Transform a pandas df while keeping track of the history of transformations to
    allow reverting back to earlier state.
    """
    def __init__(self, df: pd.DataFrame):
        self.current = df
        self.history = [('Initial df', self.current.copy())]

    def adjust_dtype(self, types: Dict[str, Any]) -> None:
        key = [key for key in types][0]
        value = types[key]
        self.current[key] = self.current[key].astype(value)
        self.history.append((f"Adjusted dtypes using {types}", self.current.copy()))

    def impute_missing(self, columns: List[str]) -> None:
        column_mean = self.current[columns].mean()
        self.current.fillna(value=column_mean, inplace=True)
        self.history.append((f"Imputed missing in {columns}", self.current.copy()))

    def revert(self, steps_back: int = 1) -> None:
        message = self.history[-steps_back][0]
        self.current = self.history[-(steps_back + 1)][1]
        self.history = self.history[0:-steps_back]
        print(f'Reverting {message}.')

    def save(self, path: str) -> None:
        self.current.to_pickle(path)

    @staticmethod
    def load(path: str) -> DataCleaner:
        loaded_df = pd.read_pickle(path)
        return DataCleaner(loaded_df)

In [53]:
transactions = pd.DataFrame(
    {
        "customer_id": [10, 10, 13, 10, 11, 11, 10],
        "amount": [1.00, 1.31, 20.5, 0.5, 0.2, 0.2, np.nan],
        "timestamp": [
            "2020-10-08 11:32:01",
            "2020-10-08 13:45:00",
            "2020-10-07 05:10:30",
            "2020-10-08 12:30:00",
            "2020-10-07 01:29:33",
            "2020-10-08 13:45:00",
            "2020-10-09 02:05:21",
        ]
    }
)

In [54]:
transactions_dc = DataCleaner(transactions)

print(f"Current dataframe:\n{transactions_dc.current}")

Current dataframe:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21


In [55]:
print(f"Current dtypes:\n{transactions_dc.current.dtypes}")

Current dtypes:
customer_id      int64
amount         float64
timestamp       object
dtype: object


In [56]:
transactions_dc.adjust_dtype({"timestamp": np.datetime64})

In [57]:
print(f"Current dtypes:\n{transactions_dc.current.dtypes}")

Current dtypes:
customer_id             int64
amount                float64
timestamp      datetime64[ns]
dtype: object


In [58]:
transactions_dc.impute_missing(columns=["amount"])

print(f"Imputed missing as overall mean:\n{transactions_dc.current}")

Imputed missing as overall mean:
   customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.500000 2020-10-07 05:10:30
3           10   0.500000 2020-10-08 12:30:00
4           11   0.200000 2020-10-07 01:29:33
5           11   0.200000 2020-10-08 13:45:00
6           10   3.951667 2020-10-09 02:05:21


In [59]:
print(f"History of changes:\n{transactions_dc.history}")

History of changes:
[('Initial df',    customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21), ("Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}",    customer_id  amount           timestamp
0           10    1.00 2020-10-08 11:32:01
1           10    1.31 2020-10-08 13:45:00
2           13   20.50 2020-10-07 05:10:30
3           10    0.50 2020-10-08 12:30:00
4           11    0.20 2020-10-07 01:29:33
5           11    0.20 2020-10-08 13:45:00
6           10     NaN 2020-10-09 02:05:21), ("Imputed missing in ['amount']",    customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.50000

In [60]:
transactions_dc.save("transactions")
loaded_dc = DataCleaner.load("transactions")
print(f"Loaded DataCleaner current df:\n{loaded_dc.current}")

Loaded DataCleaner current df:
   customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.500000 2020-10-07 05:10:30
3           10   0.500000 2020-10-08 12:30:00
4           11   0.200000 2020-10-07 01:29:33
5           11   0.200000 2020-10-08 13:45:00
6           10   3.951667 2020-10-09 02:05:21


In [61]:
transactions_dc.revert()
print(f"Reverting missing value imputation:\n{transactions_dc.current}")

Reverting Imputed missing in ['amount'].
Reverting missing value imputation:
   customer_id  amount           timestamp
0           10    1.00 2020-10-08 11:32:01
1           10    1.31 2020-10-08 13:45:00
2           13   20.50 2020-10-07 05:10:30
3           10    0.50 2020-10-08 12:30:00
4           11    0.20 2020-10-07 01:29:33
5           11    0.20 2020-10-08 13:45:00
6           10     NaN 2020-10-09 02:05:21


In [62]:
transactions_dc.history

[('Initial df',
     customer_id  amount            timestamp
  0           10    1.00  2020-10-08 11:32:01
  1           10    1.31  2020-10-08 13:45:00
  2           13   20.50  2020-10-07 05:10:30
  3           10    0.50  2020-10-08 12:30:00
  4           11    0.20  2020-10-07 01:29:33
  5           11    0.20  2020-10-08 13:45:00
  6           10     NaN  2020-10-09 02:05:21),
 ("Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}",
     customer_id  amount           timestamp
  0           10    1.00 2020-10-08 11:32:01
  1           10    1.31 2020-10-08 13:45:00
  2           13   20.50 2020-10-07 05:10:30
  3           10    0.50 2020-10-08 12:30:00
  4           11    0.20 2020-10-07 01:29:33
  5           11    0.20 2020-10-08 13:45:00
  6           10     NaN 2020-10-09 02:05:21)]

In [63]:
transactions_dc.revert()


Reverting Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}.


In [64]:
transactions_dc.history

[('Initial df',
     customer_id  amount            timestamp
  0           10    1.00  2020-10-08 11:32:01
  1           10    1.31  2020-10-08 13:45:00
  2           13   20.50  2020-10-07 05:10:30
  3           10    0.50  2020-10-08 12:30:00
  4           11    0.20  2020-10-07 01:29:33
  5           11    0.20  2020-10-08 13:45:00
  6           10     NaN  2020-10-09 02:05:21)]