<a href="https://colab.research.google.com/github/Superkart/Pandas_Provenance/blob/main/ProvenanceOnPandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose
- This notebook initializes the setup for a Pandas Provenance Tracker.
- The goal is to manually track the provenance of data transformations performed on pandas DataFrames.
- Provenance tracking allows for transparency, reproducibility, and accountability in data workflows by maintaining a detailed log of operations and changes to data over time.

# 1. Importing Libraries


In [132]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# 2. Mounting Drive
Here we will be mounting our google drive, so that we can access tables and later on store the Provenance Tables

In [133]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 3. Provenance Storage

In [134]:
Provenance_File_Path = '/content/drive/My Drive/Pandas_Provenance/provenance_log.csv'

# 4. Persistant Provenance



In [135]:
if os.path.exists(Provenance_File_Path):
    provenance_store = pd.read_csv(Provenance_File_Path)
else:
    provenance_store = pd.DataFrame(columns=["table_id", "operation", "details", "shape", "timestamp"])
    provenance_store.to_csv(Provenance_File_Path, index=False)

def save_provenance_to_file():
    provenance_store.to_csv(Provenance_File_Path, index=False)

# 5. Custom Provenance Class

In [136]:
class ProvenanceDataFrame(pd.DataFrame):
    _metadata = ["prov_table"]

    def __init__(self, *args, prov_table=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.prov_table = prov_table or pd.DataFrame(columns=["operation", "details", "shape", "timestamp"])

    @property
    def _constructor(self):
        def wrapped_constructor(*args, **kwargs):
            return ProvenanceDataFrame(*args, prov_table=self.prov_table, **kwargs)
        return wrapped_constructor

    def log_provenance(self, operation, details):

        timestamp = datetime.now().isoformat()
        shape = self.shape
        new_entry = {
            "table_id": id(self),
            "operation": operation,
            "details": details,
            "shape": shape,
            "timestamp": timestamp,
        }

        self.prov_table = pd.concat([self.prov_table, pd.DataFrame([new_entry])], ignore_index=True)

        global provenance_store
        provenance_store = pd.concat([provenance_store, pd.DataFrame([new_entry])], ignore_index=True)
        save_provenance_to_file()

    def get_provenance(self):

        return self.prov_table

### OVERRIDING METHODS

    @staticmethod
    def read_csv(filepath):

        df = pd.read_csv(filepath)
        prov_df = ProvenanceDataFrame(df)
        prov_df.log_provenance("read_csv", f"Loaded CSV file: {filepath}")
        return prov_df

    def __getitem__(self, key):

      result = super().__getitem__(key)
      if isinstance(result, pd.DataFrame):
          new_df = ProvenanceDataFrame(result, self.prov_table.copy())
          if isinstance(key, pd.Series):
              new_df.log_provenance("selection", f"Row filtering with condition: {key}")
          return new_df
      return result

    def loc(self, *args, **kwargs):
        result = super(ProvenanceDataFrame, self).loc(*args, **kwargs)
        if isinstance(result, pd.DataFrame):
            new_df = ProvenanceDataFrame(result, self.prov_table.copy())
            new_df.log_provenance("projection", f"Projected columns with loc: {args}, {kwargs}")
            return new_df
        return result

    def __setitem__(self, key, value):

      self.log_provenance("modification", f"Modified/added column: {key}")
      super().__setitem__(key, value)




# 6. Overriding Pandas Methods
Here we will start overriding the standard methods of pandas and add provenance tracking to it

## 6.1 Selection - __getitem__()

## 6.3 Projection - **loc()**

In [137]:
"""
def loc(self, *args, **kwargs):

    result = pd.DataFrame.loc(self, *args, **kwargs)
    if isinstance(result, pd.DataFrame):
        # Handle as a ProvenanceDataFrame if the result is a DataFrame
        new_df = ProvenanceDataFrame(result, self.prov_table.copy())
        new_df.log_provenance("projection", f"Projected columns with loc: {args}, {kwargs}")
        return new_df
    return result
"""

'\ndef loc(self, *args, **kwargs):\n\n    result = pd.DataFrame.loc(self, *args, **kwargs)  \n    if isinstance(result, pd.DataFrame):\n        # Handle as a ProvenanceDataFrame if the result is a DataFrame\n        new_df = ProvenanceDataFrame(result, self.prov_table.copy())\n        new_df.log_provenance("projection", f"Projected columns with loc: {args}, {kwargs}")\n        return new_df\n    return result\n'

# 6.4 Modification - **__setitem__()**

In [138]:
"""
def __setitem__(self, key, value):

    self.log_provenance("modification", f"Modified/added column: {key}")
    pd.DataFrame.__setitem__(self, key, value)
"""

'\ndef __setitem__(self, key, value):\n  \n    self.log_provenance("modification", f"Modified/added column: {key}")\n    pd.DataFrame.__setitem__(self, key, value)\n'

# 6. Getting Data From CSV


In [139]:
prov_df = ProvenanceDataFrame.read_csv('/content/drive/My Drive/Pandas_Provenance/color_srgb.csv')


# Testing

In [140]:
print("Loaded DataFrame:")
print(prov_df)

"""
selected_df = prov_df.loc[prov_df['Name'] == 'Red']
print("\nSelected DataFrame:")
print(selected_df)
"""
projected_df = prov_df.loc[:, ['Name', 'HEX']]
print("\nProjected DataFrame:")
print(projected_df)

selected_df_2 = prov_df[prov_df['HEX'] == '#FF0000']
print("\nSelected DataFrame (HEX = #FF0000):")
print(selected_df_2)

print("\nProvenance Log:")
print(prov_df.get_provenance())

Loaded DataFrame:
       Name      HEX               RGB
0     White  #FFFFFF  rgb(100,100,100)
1    Silver  #C0C0C0     rgb(75,75,75)
2      Gray  #808080     rgb(50,50,50)
3     Black  #000000        rgb(0,0,0)
4       Red  #FF0000      rgb(100,0,0)
5    Maroon  #800000       rgb(50,0,0)
6    Yellow  #FFFF00    rgb(100,100,0)
7     Olive  #808000      rgb(50,50,0)
8      Lime  #00FF00      rgb(0,100,0)
9     Green  #008000       rgb(0,50,0)
10     Aqua  #00FFFF    rgb(0,100,100)
11     Teal  #008080      rgb(0,50,50)
12     Blue  #0000FF      rgb(0,0,100)
13     Navy  #000080       rgb(0,0,50)
14  Fuchsia  #FF00FF    rgb(100,0,100)
15   Purple  #800080      rgb(50,0,50)


TypeError: 'method' object is not subscriptable