Skip to content
This repository has been archived by the owner on Sep 5, 2019. It is now read-only.

Commit

Permalink
Moves the uploaded files to their own table.
Browse files Browse the repository at this point in the history
  • Loading branch information
Denis Krienbühl committed Jun 3, 2015
1 parent 203792a commit 6b0cfd9
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 20 deletions.
3 changes: 3 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Changelog
Unreleased
~~~~~~~~~~

- Moves the uploaded files to their own table.
[href]

0.3.1 (2015-06-02)
~~~~~~~~~~~~~~~~~~~

Expand Down
87 changes: 76 additions & 11 deletions onegov/form/collection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from delorean import Delorean
from datetime import datetime, timedelta
from onegov.core.utils import normalize_for_url
from onegov.form.models import FormDefinition, FormSubmission
from sqlalchemy import inspect
from onegov.form.fields import UploadField
from onegov.form.models import (
FormDefinition,
FormSubmission,
FormSubmissionFile
)
from sqlalchemy import inspect, not_
from uuid import uuid4


class FormCollection(object):
Expand Down Expand Up @@ -94,19 +100,13 @@ def add(self, form_name, form, state):

# look up the right class depending on the type
_mapper = inspect(FormSubmission).polymorphic_map.get(state)

submission = (_mapper and _mapper.class_ or FormSubmission)()
submission.id = uuid4()
submission.name = form_name
submission.definition = form._source
submission.data = form.data
submission.state = state

# pending submissions are not necessarily valid, however we don't need
# to store invalid state as it is wiped out anyway
submission.prune(form)

# never include the csrf token
if form.meta.csrf and form.meta.csrf_field_name in submission.data:
del submission.data[form.meta.csrf_field_name]
self.update(submission, form)

self.session.add(submission)
self.session.flush()
Expand All @@ -120,6 +120,71 @@ def add(self, form_name, form, state):

return submission

def update(self, submission, form):
""" Takes a submission and a form and updates the submission data
as well as the files stored in a separate table.

The submission's definition and data are overwritten with the form's
source and data, the data is pruned through ``submission.prune`` and the
csrf token is dropped if the form uses one.

The data of each :class:`UploadField` decides what happens to its file:

* ``{}`` removes the stored file of that field.
* A value whose ``'data'`` entry does not start with ``@`` is stored as a
new :class:`FormSubmissionFile`; the entry is then replaced with an
``@<file id as hex>`` reference.
* Anything else (no value, or an existing ``@`` reference) leaves the
stored file of that field untouched.

:param submission: the submission to update, needs an id and a state
:param form: the form instance providing the new definition and data
"""
# the id is needed below to link and look up the file records
assert submission.id and submission.state

submission.definition = form._source
submission.data = form.data

# pending submissions are not necessarily valid, however we don't need
# to store invalid state as it is wiped out anyway
submission.prune(form)

# never include the csrf token
if form.meta.csrf and form.meta.csrf_field_name in submission.data:
del submission.data[form.meta.csrf_field_name]

# move uploaded files to a separate table
# (ids of all upload fields defined on the form)
files = set(
field_id for field_id, field in form._fields.items()
if isinstance(field, UploadField)
)

# an empty dict as field data means the file should be removed
files_to_remove = set(
id for id in files
if submission.data.get(id) == {}
)

# fields carrying actual file data; values starting with '@' are
# references to files stored earlier and are therefore not added again
files_to_add = set(
id for id in (files - files_to_remove)
if submission.data.get(id)
and not submission.data[id]['data'].startswith('@')
)

# whatever is neither removed nor added keeps its stored file
files_to_keep = files - files_to_remove - files_to_add

# delete all files which are not part of the updated form
# if no files are given, delete all files belonging to the submission
query = self.session.query(FormSubmissionFile)
query = query.filter(FormSubmissionFile.submission_id == submission.id)

if files_to_keep:
query = query.filter(not_(
FormSubmissionFile.field_id.in_(files_to_keep)))

# NOTE(review): 'fetch' is passed positionally, presumably as SQLAlchemy's
# synchronize_session strategy -- confirm against the SQLAlchemy version
query.delete('fetch')

# store the new fields in the separate table
for field_id in files_to_add:
f = FormSubmissionFile(
id=uuid4(),
field_id=field_id,
submission_id=submission.id,
filedata=submission.data[field_id]['data'],
)
self.session.add(f)

# replace the data in the submission with a reference
submission.data[field_id]['data'] = '@{}'.format(f.id.hex)

# we need to mark these changes as only top-level json changes
# are automatically propagated
# NOTE(review): indentation is lost in this view, confirm whether this call
# belongs inside the loop above or runs once after it
submission.data.changed()

def remove_old_pending_submissions(self, older_than):
""" Removes all pending submissions older than the given date. The
date is expected to be in UTC!
Expand Down
11 changes: 8 additions & 3 deletions onegov/form/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,16 @@ def process_formdata(self, valuelist):
self.data = {}

def process_fieldstorage(self, fs):
if not hasattr(fs, 'file'):

# support webob and werkzeug multidicts
fp = getattr(fs, 'file', getattr(fs, 'stream', None))

if fp is None:
return {}
else:
fp.seek(0)

fs.file.seek(0)
file_data = fs.file.read()
file_data = fp.read()

mimetype_by_introspection = magic.from_buffer(file_data, mime=True)
mimetype_by_introspection = mimetype_by_introspection.decode('utf-8')
Expand Down
41 changes: 41 additions & 0 deletions onegov/form/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ class FormSubmission(Base, TimestampMixin):
nullable=False
)

#: the files belonging to this submission
files = relationship(
"FormSubmissionFile",
backref='submission',
cascade="all, delete-orphan"
)

__mapper_args__ = {
"polymorphic_on": 'state'
}
Expand Down Expand Up @@ -118,3 +125,37 @@ class PendingFormSubmission(FormSubmission):

class CompleteFormSubmission(FormSubmission):
__mapper_args__ = {'polymorphic_identity': 'complete'}


class FormSubmissionFile(Base, TimestampMixin):
""" Holds files uploaded in form submissions.
This ensures that forms can be loaded without having to load the files
into memory. But it's still not super efficient. The thinking is that
most forms won't have file uploads and if they do, the files won't be
large.
Don't store big files here, or files which need to be served often.
For that you *have* to use some kind of filesystem storage.
The basic use case for this table is the odd form submission which
contains some kind of file which is then viewed by backend personnel only.
In this case there won't be many file downloads and it's important
that the file stays with the form and is not accidentally lost.
So it fits for that case.
"""

__tablename__ = 'submission_files'

#: id of the file
id = Column(UUID, primary_key=True, default=uuid4)

#: the id of the submission
submission_id = Column(UUID, ForeignKey(FormSubmission.id), nullable=False)

#: the id of the field in the submission
field_id = Column(Text, nullable=False)

#: the actual file data
# NOTE(review): wrapped in deferred(), presumably sqlalchemy.orm.deferred
# (import not visible here), so the column is only loaded when accessed
filedata = deferred(Column(Text, nullable=False))
8 changes: 3 additions & 5 deletions onegov/form/parser/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,10 @@


# increasing the default filesize is *strongly discouraged*, as we are not
# storing those files efficently yet -> they need to fit in memory
# storing those files in the database, so they need to fit in memory
#
# if this value should be higher, we need to either:
# * store the files outside the database
# * store the files in a separate table where they are not read into memory
# as frequently as they are now
# if this value must be higher, we need to store the files outside the
# database
#
MEGABYTE = 1000 ** 2
DEFAULT_UPLOAD_LIMIT = 5 * MEGABYTE
Expand Down
73 changes: 73 additions & 0 deletions onegov/form/tests/test_collection.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import pytest

from datetime import datetime, timedelta
from onegov.core.compat import BytesIO
from onegov.form import FormCollection, PendingFormSubmission
from onegov.form.models import FormSubmissionFile
from onegov.form.errors import UnableToComplete
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm.exc import FlushError
from textwrap import dedent
from webob.multidict import MultiDict
from werkzeug.datastructures import FileMultiDict
from wtforms.csrf.core import CSRF


Expand Down Expand Up @@ -166,3 +169,73 @@ def test_delete_with_submissions(session):

assert collection.submissions.query().count() == 0
assert collection.definitions.query().count() == 0


def test_file_submissions(session):
    """ Walks a single upload field through its lifecycle: the initial
    upload, a replacement, keeping the stored file and deleting it.

    """
    forms = FormCollection(session)
    definition = forms.definitions.add('File', definition="File = *.txt")

    # initial upload: exactly one non-empty file record is expected
    payload = FileMultiDict()
    payload.add_file('file', BytesIO(b'foobar'), filename='foobar.txt')
    form = definition.form_class(payload)

    submission = forms.submissions.add('file', form, state='pending')

    assert len(submission.files) == 1
    assert len(submission.files[0].filedata) > 0

    # 'replace': still one record, but with different content
    old_filedata = submission.files[0].filedata
    session.refresh(submission)

    payload = FileMultiDict()
    payload.add('file', 'replace')
    payload.add_file('file', BytesIO(b'barfoo'), filename='foobar.txt')
    form = definition.form_class(payload)

    forms.submissions.update(submission, form)

    assert len(submission.files) == 1
    assert submission.files[0].filedata != old_filedata

    # 'keep': the stored content is expected to stay the same
    old_filedata = submission.files[0].filedata
    session.refresh(submission)

    payload = FileMultiDict()
    payload.add('file', 'keep')
    payload.add_file('file', BytesIO(b''), filename='foobar.txt')
    form = definition.form_class(payload)

    forms.submissions.update(submission, form)

    assert len(submission.files) == 1
    assert submission.files[0].filedata == old_filedata

    # 'delete': no record is left, even though a file was sent along
    session.refresh(submission)

    payload = FileMultiDict()
    payload.add('file', 'delete')
    payload.add_file('file', BytesIO(b'asdf'), filename='foobar.txt')
    form = definition.form_class(payload)

    forms.submissions.update(submission, form)
    assert len(submission.files) == 0


def test_file_submissions_cascade(session):
    """ Deleting a submission is expected to delete its file records too. """

    forms = FormCollection(session)
    definition = forms.definitions.add('File', definition="File = *.txt")

    # create a submission with a single uploaded file
    payload = FileMultiDict()
    payload.add_file('file', BytesIO(b'foobar'), filename='foobar.txt')
    form = definition.form_class(payload)

    submission = forms.submissions.add('file', form, state='pending')

    stored_files = session.query(FormSubmissionFile)
    assert stored_files.count() == 1

    session.delete(submission)

    # a fresh query is built here, as in the first count
    stored_files = session.query(FormSubmissionFile)
    assert stored_files.count() == 0
assert session.query(FormSubmissionFile).count() == 0
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def get_long_description():
'coverage',
'onegov.testing',
'pytest',
'webob'
'webob',
'werkzeug'
],
),
classifiers=[
Expand Down

0 comments on commit 6b0cfd9

Please sign in to comment.