Moves the uploaded files to their own table.

OneGov · Jun 3, 2015 · 6b0cfd9 · 6b0cfd9
1 parent 203792a
commit 6b0cfd9
Show file tree

Hide file tree

Showing 7 changed files with 206 additions and 20 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -4,6 +4,9 @@ Changelog
 Unreleased
 ~~~~~~~~~~
 
+- Moves the uploaded files to their own table.
+  [href]
+
 0.3.1 (2015-06-02)
 ~~~~~~~~~~~~~~~~~~~
 

diff --git a/onegov/form/collection.py b/onegov/form/collection.py
@@ -1,8 +1,14 @@
 from delorean import Delorean
 from datetime import datetime, timedelta
 from onegov.core.utils import normalize_for_url
-from onegov.form.models import FormDefinition, FormSubmission
-from sqlalchemy import inspect
+from onegov.form.fields import UploadField
+from onegov.form.models import (
+    FormDefinition,
+    FormSubmission,
+    FormSubmissionFile
+)
+from sqlalchemy import inspect, not_
+from uuid import uuid4
 
 
 class FormCollection(object):
@@ -94,19 +100,13 @@ def add(self, form_name, form, state):
 
         # look up the right class depending on the type
         _mapper = inspect(FormSubmission).polymorphic_map.get(state)
+
         submission = (_mapper and _mapper.class_ or FormSubmission)()
+        submission.id = uuid4()
         submission.name = form_name
-        submission.definition = form._source
-        submission.data = form.data
         submission.state = state
 
-        # pending submissions are not necessarily valid, however we don't need
-        # to store invalid state as it is wiped out anyway
-        submission.prune(form)
-
-        # never include the csrf token
-        if form.meta.csrf and form.meta.csrf_field_name in submission.data:
-            del submission.data[form.meta.csrf_field_name]
+        self.update(submission, form)
 
         self.session.add(submission)
         self.session.flush()
@@ -120,6 +120,71 @@ def add(self, form_name, form, state):
 
         return submission
 
+    def update(self, submission, form):
+        """ Takes a submission and a form and updates the submission data
+        as well as the files stored in a separate table.
+
+        """
+        assert submission.id and submission.state
+
+        submission.definition = form._source
+        submission.data = form.data
+
+        # pending submissions are not necessarily valid, however we don't need
+        # to store invalid state as it is wiped out anyway
+        submission.prune(form)
+
+        # never include the csrf token
+        if form.meta.csrf and form.meta.csrf_field_name in submission.data:
+            del submission.data[form.meta.csrf_field_name]
+
+        # move uploaded files to a separate table
+        files = set(
+            field_id for field_id, field in form._fields.items()
+            if isinstance(field, UploadField)
+        )
+
+        files_to_remove = set(
+            id for id in files
+            if submission.data.get(id) == {}
+        )
+
+        files_to_add = set(
+            id for id in (files - files_to_remove)
+            if submission.data.get(id)
+            and not submission.data[id]['data'].startswith('@')
+        )
+
+        files_to_keep = files - files_to_remove - files_to_add
+
+        # delete all files which are not part of the updated form
+        # if no files are given, delete all files belonging to the submission
+        query = self.session.query(FormSubmissionFile)
+        query = query.filter(FormSubmissionFile.submission_id == submission.id)
+
+        if files_to_keep:
+            query = query.filter(not_(
+                FormSubmissionFile.field_id.in_(files_to_keep)))
+
+        query.delete('fetch')
+
+        # store the new fields in the separate table
+        for field_id in files_to_add:
+            f = FormSubmissionFile(
+                id=uuid4(),
+                field_id=field_id,
+                submission_id=submission.id,
+                filedata=submission.data[field_id]['data'],
+            )
+            self.session.add(f)
+
+            # replace the data in the submission with a reference
+            submission.data[field_id]['data'] = '@{}'.format(f.id.hex)
+
+            # we need to mark these changes as only top-level json changes
+            # are automatically propagated
+            submission.data.changed()
+
     def remove_old_pending_submissions(self, older_than):
         """ Removes all pending submissions older than the given date. The
         date is expected to be in UTC!

diff --git a/onegov/form/fields.py b/onegov/form/fields.py
@@ -46,11 +46,16 @@ def process_formdata(self, valuelist):
             self.data = {}
 
     def process_fieldstorage(self, fs):
-        if not hasattr(fs, 'file'):
+
+        # support webob and werkzeug multidicts
+        fp = getattr(fs, 'file', getattr(fs, 'stream', None))
+
+        if fp is None:
             return {}
+        else:
+            fp.seek(0)
 
-        fs.file.seek(0)
-        file_data = fs.file.read()
+        file_data = fp.read()
 
         mimetype_by_introspection = magic.from_buffer(file_data, mime=True)
         mimetype_by_introspection = mimetype_by_introspection.decode('utf-8')

diff --git a/onegov/form/models.py b/onegov/form/models.py
@@ -75,6 +75,13 @@ class FormSubmission(Base, TimestampMixin):
         nullable=False
     )
 
+    #: the files belonging to this submission
+    files = relationship(
+        "FormSubmissionFile",
+        backref='submission',
+        cascade="all, delete-orphan"
+    )
+
     __mapper_args__ = {
         "polymorphic_on": 'state'
     }
@@ -118,3 +125,37 @@ class PendingFormSubmission(FormSubmission):
 
 class CompleteFormSubmission(FormSubmission):
     __mapper_args__ = {'polymorphic_identity': 'complete'}
+
+
+class FormSubmissionFile(Base, TimestampMixin):
+    """ Holds files uploaded in form submissions.
+
+    This ensures that forms can be loaded without having to load the files
+    into memory. But it's still not super efficient. The thinking is that
+    most forms won't have file uploads and if they do it won't be large.
+
+    Don't store big files here, or files which need to be served often.
+    For that you *have* to use some kind of filesystem storage.
+
+    The basic use case for this table is the odd form which contains some
+    kind of file which is then viewed by backend personnel only.
+
+    In this case there won't be many file downloads and it's important
+    that the file stays with the form and is not accidentally lost.
+
+    So it fits for that case.
+    """
+
+    __tablename__ = 'submission_files'
+
+    #: id of the file
+    id = Column(UUID, primary_key=True, default=uuid4)
+
+    #: the id of the submission
+    submission_id = Column(UUID, ForeignKey(FormSubmission.id), nullable=False)
+
+    #: the id of the field in the submission
+    field_id = Column(Text, nullable=False)
+
+    #: the actual file data
+    filedata = deferred(Column(Text, nullable=False))
diff --git a/onegov/form/parser/core.py b/onegov/form/parser/core.py
@@ -315,12 +315,10 @@
 
 
 # increasing the default filesize is *strongly discouraged*, as we are not
-# storing those files efficently yet -> they need to fit in memory
+# storing those files outside the database, so they need to fit in memory
 #
-# if this value should be higher, we need to either:
-# * store the files outside the database
-# * store the files in a separate table where they are not read into memory
-#   as frequently as they are now
+# if this value must be higher, we need to store the files outside the
+# database
 #
 MEGABYTE = 1000 ** 2
 DEFAULT_UPLOAD_LIMIT = 5 * MEGABYTE

diff --git a/onegov/form/tests/test_collection.py b/onegov/form/tests/test_collection.py
@@ -1,12 +1,15 @@
 import pytest
 
 from datetime import datetime, timedelta
+from onegov.core.compat import BytesIO
 from onegov.form import FormCollection, PendingFormSubmission
+from onegov.form.models import FormSubmissionFile
 from onegov.form.errors import UnableToComplete
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm.exc import FlushError
 from textwrap import dedent
 from webob.multidict import MultiDict
+from werkzeug.datastructures import FileMultiDict
 from wtforms.csrf.core import CSRF
 
 
@@ -166,3 +169,73 @@ def test_delete_with_submissions(session):
 
     assert collection.submissions.query().count() == 0
     assert collection.definitions.query().count() == 0
+
+
+def test_file_submissions(session):
+    collection = FormCollection(session)
+
+    # upload a new file
+    definition = collection.definitions.add('File', definition="File = *.txt")
+
+    data = FileMultiDict()
+    data.add_file('file', BytesIO(b'foobar'), filename='foobar.txt')
+
+    submission = collection.submissions.add(
+        'file', definition.form_class(data), state='pending')
+
+    assert len(submission.files) == 1
+    assert len(submission.files[0].filedata) > 0
+
+    # replace the existing file
+    previous_content = submission.files[0].filedata
+    session.refresh(submission)
+
+    data = FileMultiDict()
+    data.add('file', 'replace')
+    data.add_file('file', BytesIO(b'barfoo'), filename='foobar.txt')
+
+    collection.submissions.update(submission, definition.form_class(data))
+
+    assert len(submission.files) == 1
+    assert previous_content != submission.files[0].filedata
+
+    # keep the file
+    previous_content = submission.files[0].filedata
+    session.refresh(submission)
+
+    data = FileMultiDict()
+    data.add('file', 'keep')
+    data.add_file('file', BytesIO(b''), filename='foobar.txt')
+
+    collection.submissions.update(submission, definition.form_class(data))
+
+    assert len(submission.files) == 1
+    assert previous_content == submission.files[0].filedata
+
+    # delete the file
+    session.refresh(submission)
+
+    data = FileMultiDict()
+    data.add('file', 'delete')
+    data.add_file('file', BytesIO(b'asdf'), filename='foobar.txt')
+
+    collection.submissions.update(submission, definition.form_class(data))
+    assert len(submission.files) == 0
+
+
+def test_file_submissions_cascade(session):
+
+    collection = FormCollection(session)
+
+    # upload a new file
+    definition = collection.definitions.add('File', definition="File = *.txt")
+
+    data = FileMultiDict()
+    data.add_file('file', BytesIO(b'foobar'), filename='foobar.txt')
+
+    submission = collection.submissions.add(
+        'file', definition.form_class(data), state='pending')
+
+    assert session.query(FormSubmissionFile).count() == 1
+    session.delete(submission)
+    assert session.query(FormSubmissionFile).count() == 0
diff --git a/setup.py b/setup.py
@@ -50,7 +50,8 @@ def get_long_description():
             'coverage',
             'onegov.testing',
             'pytest',
-            'webob'
+            'webob',
+            'werkzeug'
         ],
     ),
     classifiers=[