# Versioning

The purpose of this Jupyter notebook is to add version numbers to a family tree of datasets.

In [57]:
from json import dumps, loads
from datetime import datetime
from collections import OrderedDict
from datetime import datetime

## Sample dataset

The purpose of this notebook is to show how versioning can be added to the family tree code received from the backend. Here is an image of the sample family tree below.

<img src="family_tree_img.png" alt="Family Tree" width="700"/>

Here is the JSON associated with that family tree:

In [71]:
# Sample family tree, in the shape received from the backend:
#   {lineage_id: {dataset_id: record}}
# Each record has the keys:
#   by                    - user name
#   dataset_name          - name of the dataset
#   derived_from          - dataset_id of the parent dataset, or None for the root
#   operation_description - "create" or "update" in this sample
#   timestamp             - string in '%Y-%m-%d %H:%M:%S' format
# A parent may live in a different lineage than its child (e.g. "5.0.2" derives from "4.0").
# NOTE(review): some records carry a timestamp earlier than their parent's
# ("5.0.0" at 13:24:55 derives from "4.0" at 13:27:17; "4.1" at 13:27:00 derives from
# "3.1" at 13:27:17). Timestamps are only compared between siblings when versions are
# assigned, so this looks harmless here - confirm whether the sample values are intended.
ft = {
  "32463890-4f0f-43b9-a697-86ced79c166d": {
    "1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": None,
      "operation_description": "create",
      "timestamp": "2024-09-05 13:23:30"
    },
    "2": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "1",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:23:55"
    },
    "3.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:24:17"
    },
    "4.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "3.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:27:17"
    },
    "5.0.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "4.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:24:55"
    },
    "6.0.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "5.0.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:25:10"
    }
  },
  "25178878-39c1-428f-a31b-d7cf723cc66f": {
    "5.0.2": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "4.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:26:19"
    },
    "6.0.2": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "5.0.2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:26:36"
    }
  },
  "3b79d3e3-2455-4e98-a36a-cad468b6e660": {
    "5.0.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "4.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:25:34"
    },
    "6.0.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "5.0.1",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:25:52"
    }
  },
  "66420a35-17a5-49e3-a557-452629e3d48d": {
    "3.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:27:17"
    },
    "4.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "3.1",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:27:00"
    }
  },
  "4687cfdd-d03a-4c14-a8a9-6c294ad096ff": {
    "3.2": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:27:56"
    },
    "4.2.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "3.2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:28:07"
    },
    "5.2.0": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "4.2.0",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:28:29"
    }
  },
  "7d0bf9c0-13ef-4545-8996-8e0fd54c61a7": {
    "4.2.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "3.2",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:28:45"
    },
    "5.2.1": {
      "by": "user1",
      "dataset_name": "random_name",
      "derived_from": "4.2.1",
      "operation_description": "update",
      "timestamp": "2024-09-05 13:29:01"
    }
  }
}


## Add Versioning

Here is the new approach to adding versioning in Python:
1. Create Python classes that represent a tree data structure.
2. Add each dataset to the tree, linking it to its parent and children.
3. Traverse the tree with breadth-first search (BFS) and assign a version number to each dataset.

### Family Tree Data Structure

In [72]:
class FamilyTree:
    '''
    Family tree of datasets: a dictionary of dataset objects plus a designated root.

    Attributes:
        nodes: dict mapping dataset_id -> dataset object
        root: the dataset registered through add_root, or None if none has been registered

    Dataset objects are expected to expose dataset_id, timestamp
    ('%Y-%m-%d %H:%M:%S' string), version and children attributes.
    '''
    def __init__(self):
        self.nodes = {}
        self.root = None

    def add_dataset(self, dataset):
        '''
        Adds dataset to the nodes dictionary, stored by its dataset_id.
        If a dataset with the same dataset_id is already stored, the call is ignored.
        '''
        self.nodes.setdefault(dataset.dataset_id, dataset)

    def update_versioning(self):
        '''
        Adds version numbers to nodes.
        Uses BFS from the root to visit every reachable node and set its version attribute.

        Rules:
            - the root is version '1'
            - a child's version is its parent's version with the first dot-separated
              component incremented by one (e.g. '4.0' -> '5.0', '10' -> '11')
            - when a parent has several children, each child additionally gets a
              '.<i>' suffix, where i is its 0-based position among its siblings
              ordered by timestamp (earliest first)

        An empty tree (no nodes, no root) is left untouched.

        Raises:
            ValueError: the tree contains datasets but no root has been registered
        '''
        root = self.root
        if root is None:
            if self.nodes:
                raise ValueError('Cannot add versions: the family tree has datasets but no root')
            return

        root.version = '1'

        # BFS: `queue` only grows; `head` marks the next node to process, which
        # avoids copying the list every time an element is taken off the front.
        queue = [root]
        head = 0
        while head < len(queue):
            parent = queue[head]
            head += 1

            # Siblings are numbered in chronological order, so sort before numbering
            children = sorted(
                parent.children,
                key=lambda child: datetime.strptime(child.timestamp, '%Y-%m-%d %H:%M:%S'),
            )
            if not children:
                continue
            queue.extend(children)

            # Increment the whole first component, not just its first character,
            # so that versions of 10 and above are handled correctly.
            major, dot, rest = parent.version.partition('.')
            base_version = str(int(major) + 1) + dot + rest

            if len(children) == 1:
                children[0].version = base_version
            else:
                for i, child in enumerate(children):
                    child.version = f'{base_version}.{i}'

    def get_version(self, dataset_id):
        '''
        Given the dataset_id, returns the version of that dataset.
        Raises KeyError if no dataset with that id has been added.
        '''
        return self.nodes[dataset_id].version

    def display_tree(self, dataset=None, level=0):
        '''
        Displays family tree: prints one line per dataset, indented by 4 spaces per level.
        With no arguments, starts from the root; prints nothing if there is no root.
        '''
        if dataset is None:
            dataset = self.root
        if dataset is not None:
            print(' ' * 4 * level + '->', dataset)
            for child in dataset.children:
                self.display_tree(child, level + 1)

    def add_root(self, dataset):
        '''
        Notes a dataset as the root of the family tree.
        A dataset is the root if it has no parents.
        Calling this again replaces the previously registered root.
        '''
        self.root = dataset

In [73]:
class Dataset:
    '''
    One dataset in the family tree: its metadata plus links to related datasets.
    version starts out as None and is filled in later.
    '''
    def __init__(self, dataset_id, username, dataset_name, derived_from, operation_description, timestamp):
        # Metadata describing the dataset
        self.dataset_id = dataset_id
        self.by = username
        self.dataset_name = dataset_name
        self.derived_from = derived_from
        self.operation_description = operation_description
        self.timestamp = timestamp
        self.version = None
        # Links to neighbouring datasets in the tree
        self.parents = []
        self.children = []

    @staticmethod
    def _append_once(items, item):
        '''
        Appends item to items unless it is already in the list
        '''
        if item in items:
            return
        items.append(item)

    def add_parent(self, parent):
        '''
        Registers parent in this dataset's parents list
        and registers this dataset in parent's children list (no duplicates)
        '''
        self._append_once(self.parents, parent)
        self._append_once(parent.children, self)

    def add_child(self, child):
        '''
        Registers child in this dataset's children list
        and registers this dataset in child's parents list (no duplicates)
        '''
        self._append_once(self.children, child)
        self._append_once(child.parents, self)

    def __repr__(self):
        return f"{self.dataset_id}: Version {self.version}"

In [74]:
def create_family_tree(ft):
    '''
    Function that creates a FamilyTree object from json and adds version numbers to it.

    Input: FamilyTree json object, shaped as {lineage_id: {dataset_id: record}}, where each
           record has the keys 'by', 'dataset_name', 'derived_from', 'operation_description'
           and 'timestamp'
    Output: FamilyTree object containing Dataset objects for each dataset.
            An empty input yields an empty FamilyTree.
    Raises:
        KeyError: a record lacks one of the required keys, or its 'derived_from'
                  names a dataset id that is not present anywhere in ft
        ValueError: ft contains datasets, but none of them has 'derived_from' set to None
    '''
    ft_obj = FamilyTree()

    # Pass 1: create a Dataset for every record, across all lineages.
    # Parents can live in a different lineage than their children, so
    # relationships are only wired up once every dataset exists.
    for lineage in ft.values():
        for dataset_id, record in lineage.items():
            ft_obj.add_dataset(Dataset(
                dataset_id,
                record['by'],
                record['dataset_name'],
                record['derived_from'],
                record['operation_description'],
                record['timestamp'],
            ))

    # Pass 2: link each dataset to its parent; a dataset without a parent is the root.
    # If several datasets have no parent, the last one visited ends up as the root.
    nodes = ft_obj.nodes
    for dataset in nodes.values():
        parent_id = dataset.derived_from
        if parent_id is None:
            ft_obj.add_root(dataset)
        elif parent_id in nodes:
            dataset.add_parent(nodes[parent_id])
        else:
            raise KeyError(
                f"Dataset {dataset.dataset_id!r} is derived from unknown dataset {parent_id!r}"
            )

    # Nothing to version in an empty tree
    if not nodes:
        return ft_obj
    if ft_obj.root is None:
        raise ValueError("Family tree has no root: no dataset has 'derived_from' set to None")

    # Update versioning
    ft_obj.update_versioning()

    return ft_obj

In [75]:
# Build the tree from the sample json and print it to check the assigned versions.
# Each printed line has the form "-> <dataset_id>: Version <version>", indented by depth.
ft_obj = create_family_tree(ft)
ft_obj.display_tree()

-> 1: Version 1
    -> 2: Version 2
        -> 3.0: Version 3.0
            -> 4.0: Version 4.0
                -> 5.0.0: Version 5.0.0
                    -> 6.0.0: Version 6.0.0
                -> 5.0.2: Version 5.0.2
                    -> 6.0.2: Version 6.0.2
                -> 5.0.1: Version 5.0.1
                    -> 6.0.1: Version 6.0.1
        -> 3.1: Version 3.1
            -> 4.1: Version 4.1
        -> 3.2: Version 3.2
            -> 4.2.0: Version 4.2.0
                -> 5.2.0: Version 5.2.0
            -> 4.2.1: Version 4.2.1
                -> 5.2.1: Version 5.2.1


### Add versioning back into json

In [12]:
def add_versioning(ft):
    '''
    Adds versioning to the family tree json
    Inputs: family tree json, shaped as {lineage_id: {dataset_id: record}}
    Output: new family tree json with the same lineages, datasets and record fields,
            where every record additionally has a 'version' key

    The input is not modified: the lineage and record dictionaries are rebuilt,
    so the caller's ft can be reused (e.g. when the cell is re-run).
    '''
    ft_obj = create_family_tree(ft)

    # Rebuild both nesting levels and copy each record while adding its version.
    # An existing 'version' key in a record is overwritten with the computed one.
    return {
        lineage_id: {
            dataset_id: {**record, 'version': ft_obj.get_version(dataset_id)}
            for dataset_id, record in lineage.items()
        }
        for lineage_id, lineage in ft.items()
    }

In [13]:
# Add a 'version' field to every record of the sample json and print the result.
# NOTE(review): the saved output under this cell shows dataset ids like '1.0' and versions
# like '1.1-0.0', which do not match the sample `ft` defined above - it looks like output
# from an earlier run; re-run the notebook from a fresh kernel to refresh it.
ft_output = add_versioning(ft)
print(ft_output)

{'32463890-4f0f-43b9-a697-86ced79c166d': {'1.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': None, 'operation_description': 'create', 'timestamp': '2024-09-05 13:23:30', 'version': '1.0'}, '2.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': '1.0', 'operation_description': 'update', 'timestamp': '2024-09-05 13:23:55', 'version': '1.1'}, '3.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': '2.0', 'operation_description': 'update', 'timestamp': '2024-09-05 13:24:17', 'version': '1.1-0.0'}, '4.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': '3.0', 'operation_description': 'update', 'timestamp': '2024-09-05 13:27:17', 'version': '1.1-0.1'}, '5.0.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': '4.0', 'operation_description': 'update', 'timestamp': '2024-09-05 13:24:55', 'version': '1.1-0.1-0.0'}, '6.0.0': {'by': 'user1', 'dataset_name': 'random_name', 'derived_from': '5.0.0', 'operation_description': 