<a href="https://colab.research.google.com/github/Teasotea/CodingProblems/blob/main/Python_OOP_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Instructions:

Fill in the methods of the DataCleaner class to produce the same printed results
as in the comments below. Good luck, and have fun!
"""
from __future__ import annotations

import numpy as np
import pandas as pd

from typing import Dict, Any, List

In [135]:
class DataCleaner:
    """
    Transform a pandas df while keeping track of the history of transformations to
    allow reverting back to earlier state.

    Instance state:
      * ``current`` - the working DataFrame (a private copy of the frame passed in).
      * ``history`` - list of ``(description, snapshot)`` tuples, oldest first. The
        last entry always describes the state held in ``current``.
    """
    def __init__(self, df: pd.DataFrame):
        # Own a private copy: impute_missing writes into `current` in place and must
        # never leak those writes back into the caller's frame.
        self.current = df.copy()
        self.history = [('Initial df', self.current.copy())]

    @staticmethod
    def _resolve_dtype(dtype: Any) -> Any:
        """Give unit-less datetime64/timedelta64 dtypes an explicit ``[ns]`` unit.

        pandas 2.x refuses ``astype`` to a unit-less ``datetime64``; every other
        dtype (including pandas-only ones numpy cannot interpret) is returned as is.
        """
        if dtype is None:
            return dtype
        try:
            resolved = np.dtype(dtype)
        except (TypeError, ValueError):
            # Not a numpy dtype (e.g. "category", "datetime64[ns, UTC]"): let pandas decide.
            return dtype
        if resolved.kind in "mM" and np.datetime_data(resolved)[0] == "generic":
            return "{name}[ns]".format(name=resolved.name)
        return dtype

    def adjust_dtype(self, types: Dict[str, Any]) -> None:
        """Cast columns to new dtypes and record the step.

        Args:
            types: mapping of column name -> target dtype, as accepted by
                ``DataFrame.astype``.
        """
        if isinstance(types, dict):
            resolved = {col: self._resolve_dtype(dtype) for col, dtype in types.items()}
        else:
            resolved = self._resolve_dtype(types)
        self.current = self.current.astype(resolved)
        # The label keeps the dtypes exactly as the caller spelled them.
        self.history.append(('Adjusted dtypes using {types}'.format(types=types), self.current.copy()))

    def impute_missing(self, columns: List[str]) -> None:
        """Replace missing values in ``columns`` with each column's mean and record the step.

        Args:
            columns: names of the columns to impute.
        """
        subset = self.current[columns]
        # fillna with a Series aligns on column labels, so each column gets its own mean.
        self.current[columns] = subset.fillna(subset.mean())
        self.history.append(('Imputed missing in {cols}'.format(cols=columns), self.current.copy()))

    def revert(self, steps_back: int = 1) -> None:
        """Undo the last ``steps_back`` recorded transformations.

        Asking for more steps than have been recorded (i.e. going past the initial
        frame) leaves the cleaner untouched, as does ``steps_back == 0``.

        Args:
            steps_back: number of history entries to drop from the end.

        Raises:
            ValueError: if ``steps_back`` is negative.
        """
        if steps_back < 0:
            raise ValueError("steps_back must be non-negative, got {n}".format(n=steps_back))
        if steps_back == 0 or steps_back >= len(self.history):
            return
        del self.history[-steps_back:]
        # Copy so that later in-place edits of `current` cannot corrupt the snapshot.
        self.current = self.history[-1][1].copy()

    def save(self, path: str) -> None:
        """Write the current frame to ``path`` as CSV (without the index).

        CSV stores no dtype information, so e.g. datetime columns are written as text.
        """
        self.current.to_csv(path, index=False)

    @staticmethod
    def load(path: str, **read_csv_kwargs: Any) -> DataCleaner:
        """Build a new DataCleaner from a CSV file written by ``save``.

        Args:
            path: location of the CSV file.
            **read_csv_kwargs: forwarded to ``pd.read_csv``, e.g.
                ``parse_dates=["timestamp"]`` to restore datetime columns.

        Returns:
            A DataCleaner whose history starts at the loaded frame.
        """
        return DataCleaner(pd.read_csv(path, **read_csv_kwargs))


In [136]:
# Toy transaction log, one record per row; the last record has a missing amount.
transactions = pd.DataFrame(
    [
        (10, 1.00, "2020-10-08 11:32:01"),
        (10, 1.31, "2020-10-08 13:45:00"),
        (13, 20.5, "2020-10-07 05:10:30"),
        (10, 0.5, "2020-10-08 12:30:00"),
        (11, 0.2, "2020-10-07 01:29:33"),
        (11, 0.2, "2020-10-08 13:45:00"),
        (10, np.nan, "2020-10-09 02:05:21"),
    ],
    columns=["customer_id", "amount", "timestamp"],
)

In [137]:

# Wrap the raw frame in a DataCleaner and show its starting state.
transactions_dc = DataCleaner(df=transactions)

print("Current dataframe:", transactions_dc.current, sep="\n")

Current dataframe:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21


In [138]:
# Column dtypes before any transformation has been applied.
print("Initial dtypes:", transactions_dc.current.dtypes, sep="\n")

# Initial dtypes:
# customer_id      int64
# amount         float64
# timestamp       object
# dtype: object


Initial dtypes:
customer_id      int64
amount         float64
timestamp       object
dtype: object


In [139]:
# Cast the timestamp column to a datetime dtype, then show the resulting dtypes.
transactions_dc.adjust_dtype(types={"timestamp": np.datetime64})

print("Changed dtypes to:", transactions_dc.current.dtypes, sep="\n")

# Changed dtypes to:
# customer_id             int64
# amount                float64
# timestamp      datetime64[ns]
# dtype: object

Changed dtypes to:
customer_id             int64
amount                float64
timestamp      datetime64[ns]
dtype: object


In [140]:
# Fill the missing amount with the mean of the column.
transactions_dc.impute_missing(["amount"])

print("Imputed missing as mean:", transactions_dc.current, sep="\n")

# Imputed missing as mean:
#    customer_id     amount           timestamp
# 0           10   1.000000 2020-10-08 11:32:01
# 1           10   1.310000 2020-10-08 13:45:00
# 2           13  20.500000 2020-10-07 05:10:30
# 3           10   0.500000 2020-10-08 12:30:00
# 4           11   0.200000 2020-10-07 01:29:33
# 5           11   0.200000 2020-10-08 13:45:00
# 6           10   3.951667 2020-10-09 02:05:21


Imputed missing as mean:
   customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.500000 2020-10-07 05:10:30
3           10   0.500000 2020-10-08 12:30:00
4           11   0.200000 2020-10-07 01:29:33
5           11   0.200000 2020-10-08 13:45:00
6           10   3.951667 2020-10-09 02:05:21


In [141]:
# Dump every recorded (description, snapshot) pair, oldest first.
print("History of changes:", transactions_dc.history, sep="\n")

# ** Any coherent structure with history of changes **
# E.g., here's one possibility

# History of changes:
# [('Initial df',    customer_id  amount            timestamp
# 0           10    1.00  2020-10-08 11:32:01
# 1           10    1.31  2020-10-08 13:45:00
# 2           13   20.50  2020-10-07 05:10:30
# 3           10    0.50  2020-10-08 12:30:00
# 4           11    0.20  2020-10-07 01:29:33
# 5           11    0.20  2020-10-08 13:45:00
# 6           10     NaN  2020-10-09 02:05:21), ("Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}",    customer_id  amount           timestamp
# 0           10    1.00 2020-10-08 11:32:01
# 1           10    1.31 2020-10-08 13:45:00
# 2           13   20.50 2020-10-07 05:10:30
# 3           10    0.50 2020-10-08 12:30:00
# 4           11    0.20 2020-10-07 01:29:33
# 5           11    0.20 2020-10-08 13:45:00
# 6           10     NaN 2020-10-09 02:05:21), ("Imputed missing in ['amount']",    customer_id     amount           timestamp
# 0           10   1.000000 2020-10-08 11:32:01
# 1           10   1.310000 2020-10-08 13:45:00
# 2           13  20.500000 2020-10-07 05:10:30
# 3           10   0.500000 2020-10-08 12:30:00
# 4           11   0.200000 2020-10-07 01:29:33
# 5           11   0.200000 2020-10-08 13:45:00
# 6           10   3.951667 2020-10-09 02:05:21)]


History of changes:
[('Initial df',    customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21), ("Adjusted dtypes using {'timestamp': <class 'numpy.datetime64'>}",    customer_id  amount           timestamp
0           10    1.00 2020-10-08 11:32:01
1           10    1.31 2020-10-08 13:45:00
2           13   20.50 2020-10-07 05:10:30
3           10    0.50 2020-10-08 12:30:00
4           11    0.20 2020-10-07 01:29:33
5           11    0.20 2020-10-08 13:45:00
6           10     NaN 2020-10-09 02:05:21), ("Imputed missing in ['amount']",    customer_id     amount           timestamp
0           10   1.000000 2020-10-08 11:32:01
1           10   1.310000 2020-10-08 13:45:00
2           13  20.50000

In [142]:
# Round-trip the current frame through disk and inspect what comes back.
# NOTE: `save` writes CSV and `load` re-reads it with pandas' default parsing, so the
# `timestamp` column comes back as plain strings rather than datetimes.
transactions_dc.save(path="transactions")
loaded_dc = DataCleaner.load(path="transactions")
print("Loaded DataCleaner current df:", loaded_dc.current, sep="\n")

# Loaded DataCleaner current df:
#    customer_id     amount           timestamp
# 0           10   1.000000 2020-10-08 11:32:01
# 1           10   1.310000 2020-10-08 13:45:00
# 2           13  20.500000 2020-10-07 05:10:30
# 3           10   0.500000 2020-10-08 12:30:00
# 4           11   0.200000 2020-10-07 01:29:33
# 5           11   0.200000 2020-10-08 13:45:00
# 6           10   3.951667 2020-10-09 02:05:21



Loaded DataCleaner current df:
   customer_id     amount            timestamp
0           10   1.000000  2020-10-08 11:32:01
1           10   1.310000  2020-10-08 13:45:00
2           13  20.500000  2020-10-07 05:10:30
3           10   0.500000  2020-10-08 12:30:00
4           11   0.200000  2020-10-07 01:29:33
5           11   0.200000  2020-10-08 13:45:00
6           10   3.951667  2020-10-09 02:05:21


In [147]:
# Step back one entry in the history; the expected frame below has the NaN in `amount` again.
transactions_dc.revert(steps_back=1)
print("Reverting missing value imputation:", transactions_dc.current, sep="\n")

# Reverting missing value imputation:
#    customer_id  amount           timestamp
# 0           10    1.00 2020-10-08 11:32:01
# 1           10    1.31 2020-10-08 13:45:00
# 2           13   20.50 2020-10-07 05:10:30
# 3           10    0.50 2020-10-08 12:30:00
# 4           11    0.20 2020-10-07 01:29:33
# 5           11    0.20 2020-10-08 13:45:00
# 6           10     NaN 2020-10-09 02:05:21

Reverting missing value imputation:
   customer_id  amount            timestamp
0           10    1.00  2020-10-08 11:32:01
1           10    1.31  2020-10-08 13:45:00
2           13   20.50  2020-10-07 05:10:30
3           10    0.50  2020-10-08 12:30:00
4           11    0.20  2020-10-07 01:29:33
5           11    0.20  2020-10-08 13:45:00
6           10     NaN  2020-10-09 02:05:21
