<a href="https://colab.research.google.com/github/Superkart/Pandas_Provenance/blob/main/ProvenanceOnPandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose
- This notebook initializes the setup for a Pandas Provenance Tracker.
- The goal is to manually track the provenance of data transformations performed on pandas DataFrames.
- Provenance tracking allows for transparency, reproducibility, and accountability in data workflows by maintaining a detailed log of operations and changes to data over time.

# 1. Importing Libraries


In [23]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# 2. Mounting Drive
Here we will be mounting our google drive, so that we can access tables and later on store the Provenance Tables

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 3. Provenance Storage

In [25]:
PROVENANCE_LOG_PATH = '/content/drive/My Drive/Pandas_Provenance/provenance_log.csv'

# 4. Custom Provenance Class

In [26]:
class ProvenanceDataFrame:
    def __init__(self, data, provenance_path=PROVENANCE_LOG_PATH):
        self.data = data
        self.provenance_path = provenance_path
        self.prov_table = self._load_provenance()

    def _load_provenance(self):
        # Load existing provenance table or create a new one
        if os.path.exists(self.provenance_path):
            return pd.read_csv(self.provenance_path)
        else:
            return pd.DataFrame(columns=["operation", "details", "shape", "timestamp", "reason"])

    def _save_provenance(self):
        # Save provenance table to file
        self.prov_table.to_csv(self.provenance_path, index=False)

    def add_provenance_entry(self, operation, details, shape, reason=""):
        # Add a new entry to the provenance table
        entry = {
            "operation": operation,
            "details": details,
            "shape": shape,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "reason": reason
        }
        self.prov_table = self.prov_table.append(entry, ignore_index=True)
        self._save_provenance()

    def get_provenance(self):

      return self.prov_table



# 5. Overriding Pandas Methods
Here we will start overriding the standard methods of pandas and add provenance tracking to it

## 5.1 read_csv()

In [27]:
    @staticmethod
    def read_csv(filepath, provenance_path=PROVENANCE_LOG_PATH):
        # Custom read_csv method to initialize ProvenanceDataFrame
        df = pd.read_csv(filepath)
        return ProvenanceDataFrame(df, provenance_path=provenance_path)

## 5.2 Selection - loc()

In [38]:
    def loc(self, condition, reason=""):
        # Override loc to track provenance for selection
        selected_data = self.data.loc[condition]
        details = f"Condition: {condition}"
        shape = selected_data.shape
        self.add_provenance_entry("selection", details, shape, reason)
        return ProvenanceDataFrame(selected_data, provenance_path=self.provenance_path)


## 5.3 Projection - getitem

In [29]:
    def __getitem__(self, columns):
        # Override __getitem__ for column projection
        if isinstance(columns, list):
            projected_data = self.data[columns]
            details = f"Columns: {columns}"
            shape = projected_data.shape
            self.add_provenance_entry("projection", details, shape)
            return ProvenanceDataFrame(projected_data, provenance_path=self.provenance_path)
        else:
            return self.data[columns]


# 6. Getting Data From CSV


In [36]:
# Initialize the ProvenanceDataFrame
prov_df = read_csv('/content/drive/My Drive/Pandas_Provenance/color_srgb.csv', PROVENANCE_LOG_PATH)


# Testing

In [39]:
# Selection with loc
selected_df = prov_df.loc(prov_df.data['age'] > 30, reason="Filter rows where age > 30")

# Projection (column selection)
projected_df = prov_df[['name', 'age']]

# View Provenance Table
print("Provenance Table:")
print(prov_df.prov_table)


AttributeError: 'ProvenanceDataFrame' object has no attribute 'loc'