Added the shred method for deleting objects and containers permanently.

hathawsh committed Jan 30, 2012
1 parent fdf4096 commit c17b7fa
Showing 7 changed files with 338 additions and 20 deletions.
5 changes: 5 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,8 @@
1.2 (2012-01-30)
----------------

- Added the ``shred`` method for deleting objects and containers permanently.

1.1 (2012-01-24)
----------------

6 changes: 3 additions & 3 deletions docs/conf.py
@@ -45,16 +45,16 @@

# General information about the project.
project = u'Repozitory'
copyright = u'2011, Shane Hathaway'
copyright = u'2011-2012, Shane Hathaway'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.0'
version = '1.2'
# The full version, including alpha/beta/rc tags.
release = '1.0'
release = '1.2'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
110 changes: 108 additions & 2 deletions repozitory/archive.py
@@ -23,10 +23,13 @@
from zope.sqlalchemy import ZopeTransactionExtension
import datetime
import hashlib
import logging
import tempfile

_global_sessions = {} # {db_string: SQLAlchemy session}

log = logging.getLogger(__name__)


def forget_sessions():
_global_sessions.clear()
@@ -278,7 +281,7 @@ def history(self, docid, only_current=False):
for row in rows]

def get_version(self, docid, version_num):
"""Return a specific IObjectHistoryRecord for a document.
"""Return a specific IObjectHistoryRecord for an object.
"""
created = (self.session.query(ArchivedObject.created)
.filter_by(docid=docid)
@@ -531,7 +534,7 @@ def which_contain_deleted(self, container_ids, max_depth=None):
session.query(ArchivedItem.docid)
.filter(ArchivedItem.docid.in_(docids))
.all())
# For each deleted item, add to the list of results
# For each deleted (not moved) item, add to the list of results
# and remove from the set of containers to examine further.
for (container_id, docid) in deleted_rows:
if docid not in moved:
@@ -568,6 +571,109 @@ def which_contain_deleted(self, container_ids, max_depth=None):

return res

def shred(self, docids=(), container_ids=()):
"""Delete the specified objects and containers permanently.
The objects to shred must not exist in any container
(exempting the containers to be shredded) and the
containers must not contain any objects (exempting the
objects to be shredded). If these rules are not met, this
method will raise a ValueError.
"""
session = self.session
conflicting_item = None

if docids:
# Verify none of the objects exist in any container
# (except the containers to be shredded.)
q = session.query(ArchivedItem).filter(
ArchivedItem.docid.in_(docids))
if container_ids:
q = q.filter(~ArchivedItem.container_id.in_(container_ids))
conflicting_item = q.order_by(ArchivedItem.docid).first()

if container_ids and conflicting_item is None:
# Verify none of the containers contain any objects
# (except the objects to be shredded.)
q = session.query(ArchivedItem).filter(
ArchivedItem.container_id.in_(container_ids))
if docids:
q = q.filter(~ArchivedItem.docid.in_(docids))
conflicting_item = q.order_by(ArchivedItem.container_id).first()

if conflicting_item is not None:
raise ValueError("Document %d is still in container %d" % (
conflicting_item.docid, conflicting_item.container_id))

# List the blob_ids referenced by the objects to shred.
# (Later, orphaned blobs will also be shredded.)
if docids:
blob_id_rows = (session.query(ArchivedBlobLink.blob_id)
.filter(ArchivedBlobLink.docid.in_(docids))
.all())
blob_ids = set(blob_id for (blob_id,) in blob_id_rows)
else:
blob_ids = None

if container_ids:
# Shred the specified containers.
# (Although we could rely on cascading, it seems useful to
# delete the rows explicitly to prevent accidents.)
log.warning("Shredding containers: %s", container_ids)
(session.query(ArchivedItem)
.filter(ArchivedItem.container_id.in_(container_ids))
.delete(False))
(session.query(ArchivedItemDeleted)
.filter(ArchivedItemDeleted.container_id.in_(container_ids))
.delete(False))
(session.query(ArchivedContainer)
.filter(ArchivedContainer.container_id.in_(container_ids))
.delete(False))

if docids:
# Shred the specified objects.
log.warning("Shredding objects: %s", container_ids)
(session.query(ArchivedItem)
.filter(ArchivedItem.docid.in_(docids))
.delete(False))
(session.query(ArchivedItemDeleted)
.filter(ArchivedItemDeleted.docid.in_(docids))
.delete(False))
(session.query(ArchivedCurrent)
.filter(ArchivedCurrent.docid.in_(docids))
.delete(False))
(session.query(ArchivedState)
.filter(ArchivedState.docid.in_(docids))
.delete(False))
(session.query(ArchivedBlobLink)
.filter(ArchivedBlobLink.docid.in_(docids))
.delete(False))
(session.query(ArchivedObject)
.filter(ArchivedObject.docid.in_(docids))
.delete(False))

if blob_ids:
keep_blob_rows = (session.query(ArchivedBlobLink.blob_id)
.filter(ArchivedBlobLink.blob_id.in_(blob_ids))
.all())
keep_blob_ids = set(blob_id for (blob_id,) in keep_blob_rows)
orphaned_blob_ids = blob_ids.difference(keep_blob_ids)

if orphaned_blob_ids:
# Shred the orphaned blobs.
log.warning("Shredding orphaned blobs: %s", orphaned_blob_ids)
(session.query(ArchivedChunk)
.filter(ArchivedChunk.blob_id.in_(orphaned_blob_ids))
.delete(False))
(session.query(ArchivedBlobInfo)
.filter(ArchivedBlobInfo.blob_id.in_(orphaned_blob_ids))
.delete(False))

# Above, we use delete(False), i.e. delete(synchronize_session=False),
# for speed. Per the SQLAlchemy docs, the session should be expired
# after such a bulk delete, because objects already loaded in the
# session are not synchronized with the rows just deleted.
session.expire_all()


class ObjectHistoryRecord(object):
implements(IObjectHistoryRecord)
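For callers, the new method looks like this. A minimal usage sketch, not part of the commit: it assumes an already-configured archive object providing IArchive, and docid 42 is a hypothetical identifier; transaction.commit() applies because the session is set up with ZopeTransactionExtension.

import transaction

try:
    # Permanently delete the archived history of object 42. This only
    # succeeds if object 42 is no longer an item of any container.
    archive.shred(docids=[42])
except ValueError as e:
    # shred() refuses when the object still appears in a container that
    # is not being shredded in the same call.
    print("refusing to shred: %s" % e)
else:
    transaction.commit()
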
14 changes: 13 additions & 1 deletion repozitory/interfaces.py
@@ -136,7 +136,7 @@ def filter_container_ids(container_ids):
Returns a sequence containing a subset of the provided container_ids.
"""

def which_contain_deleted(self, container_ids, max_depth=None):
def which_contain_deleted(container_ids, max_depth=None):
"""Returns the subset of container_ids that have had something deleted.
This is useful for building a hierarchical trash UI that allows
@@ -150,6 +150,18 @@ def which_contain_deleted(self, container_ids, max_depth=None):
(Most other methods make no such assumption.)
"""

def shred(docids=(), container_ids=()):
"""Delete the specified objects and containers permanently.
The objects to shred must not exist in any container
(exempting the containers to be shredded) and the
containers must not contain any objects (exempting the
objects to be shredded). If these rules are not met, this
method will raise a ValueError.
Returns None.
"""


class IObjectVersion(IDCDescriptiveProperties, IDCTimes):
"""The content of an object for version control.
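The docstring describes a mutual exemption: objects being shredded may still be items of containers shredded in the same call, and those containers may still hold the objects being shredded. A short sketch of that rule, with hypothetical ids and an archive object assumed to provide IArchive:

# Container 7 is assumed to hold exactly objects 42 and 43, and nothing
# else references them.

# Shredding the container alone is rejected, because it still contains
# objects:
#   archive.shred(container_ids=[7])   # raises ValueError

# Shredding the container together with its contents succeeds, because
# each check exempts the ids named in the other argument:
archive.shred(docids=[42, 43], container_ids=[7])
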
2 changes: 1 addition & 1 deletion repozitory/schema.py
@@ -116,7 +116,7 @@ class ArchivedBlobLink(Base):
version_num = Column(Integer, primary_key=True, nullable=False)
name = Column(Unicode, primary_key=True, nullable=False)
blob_id = Column(Integer, ForeignKey('archived_blob_info.blob_id'),
nullable=True)
nullable=True, index=True)

__table_args__ = (
ForeignKeyConstraint(
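The new index on the blob_id column supports the orphan check in shred, which filters ArchivedBlobLink rows by blob_id before deleting unreferenced blobs. Databases created before this change may not pick up the index automatically; the following one-off migration is a hypothetical sketch, not part of the commit (the index name assumes SQLAlchemy's ix_<table>_<column> convention for index=True columns, and the DSN is a placeholder).

from sqlalchemy import Index, create_engine
from repozitory.schema import ArchivedBlobLink

engine = create_engine('postgresql:///repozitory')  # placeholder DSN
# Create just the new index on an existing database.
Index('ix_archived_blob_link_blob_id',
      ArchivedBlobLink.__table__.c.blob_id).create(engine)
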
