Commit

OpenConceptLab/ocl_issues#910 | Export queries | limit/offset on lookup table only
snyaggarwal committed Aug 13, 2021
1 parent de1e009 commit 9c12c81
Showing 1 changed file with 53 additions and 53 deletions.
106 changes: 53 additions & 53 deletions core/common/utils.py
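In outline, the commit replaces LIMIT/OFFSET slicing of the full Concept/Mapping querysets with slicing of the narrow source-to-concept lookup (through) table, resolving each batch of ids back to full rows afterwards. A minimal sketch of that pattern, using the model and field names from the diff below (the import path, helper name, and batch size are illustrative assumptions, not part of the commit):

# Sketch of the batching pattern this commit adopts; assumes the Django models
# referenced in the diff below. Import path and helper name are illustrative only.
from core.concepts.models import Concept  # assumed location of the Concept model


def iter_concept_batches(version, filters, batch_size=1000):
    # LIMIT/OFFSET is applied to the narrow lookup (through) table only...
    concepts_qs = Concept.sources.through.objects.filter(source_id=version.id)
    start, end = 0, batch_size
    batch = concepts_qs.order_by('-concept_id')[start:end]
    while batch.exists():
        # ...and each batch of ids is then resolved to full Concept rows,
        # with the export filters applied at this point.
        yield Concept.objects.filter(
            id__in=batch.values_list('concept_id')
        ).filter(**filters)
        start += batch_size
        end += batch_size
        batch = concepts_qs.order_by('-concept_id')[start:end]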
@@ -206,7 +206,7 @@ def write_export_file(
resource_string = json.dumps(data, cls=encoders.JSONEncoder)
logger.info('Done serializing attributes.')

batch_size = 100
batch_size = 2

@rkorytkowski (Contributor) commented on Aug 13, 2021:
Is this a typo? Did you mean 200?

@snyaggarwal (Author, Contributor) commented on Aug 13, 2021:
bad typo!

is_collection = resource_type == 'collection'

if is_collection:
@@ -216,51 +216,44 @@ def write_export_file(
concepts_qs = Concept.sources.through.objects.filter(source_id=version.id)
mappings_qs = Mapping.sources.through.objects.filter(source_id=version.id)

all_concepts = Concept.objects.filter(id__in=concepts_qs.values('concept_id'))
all_mappings = Mapping.objects.filter(id__in=mappings_qs.values('mapping_id'))

filters = dict()

if not is_collection:
filters['is_active'] = True
if version.is_head:
filters['is_latest_version'] = True

if filters:
all_concepts = all_concepts.filter(**filters)
all_mappings = all_mappings.filter(**filters)

total_concepts = all_concepts.count()
total_mappings = all_mappings.count()

def get_batch(queryset, offset, limit):
_queryset = queryset.order_by('-id')
if filters:
_queryset = _queryset.filter(**filters)
return _queryset[offset:limit]

with open('export.json', 'w') as out:
out.write('%s, "concepts": [' % resource_string[:-1])

resource_name = resource_type.title()

if total_concepts:
logger.info(
'%s has %d concepts. Getting them in batches of %d...' % (resource_name, total_concepts, batch_size)
)
if concepts_qs.exists():
logger.info('%s has concepts. Getting them in batches of %d...' % (resource_name, batch_size))
concept_serializer_class = get_class('core.concepts.serializers.ConceptVersionExportSerializer')
for start in range(0, total_concepts, batch_size):
end = min(start + batch_size, total_concepts)
logger.info('Serializing concepts %d - %d...' % (start+1, end))
concept_versions = get_batch(all_concepts, start, end).prefetch_related('names', 'descriptions')
concept_serializer = concept_serializer_class(concept_versions, many=True)
concept_data = concept_serializer.data
concept_string = json.dumps(concept_data, cls=encoders.JSONEncoder)
concept_string = concept_string[1:-1]
with open('export.json', 'a') as out:
out.write(concept_string)
if end != total_concepts:
out.write(', ')
start = 0
end = batch_size
batch_queryset = concepts_qs.order_by('-concept_id')[start:end]

while batch_queryset.exists():
logger.info('Serializing concepts %d - %d...' % (start + 1, end))
queryset = Concept.objects.filter(id__in=batch_queryset.values_list('concept_id')).filter(**filters)

@rkorytkowski (Contributor) commented on Aug 13, 2021:
Do we care about order in the output file? It's not expensive to order it here as well...

@snyaggarwal (Author, Contributor) commented on Aug 13, 2021:
I don't think we care about the order; I can still add order_by('-id').
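For illustration, applying the suggested ordering would be a one-line change to the id-resolution query above (a hypothetical variant, not part of this commit):

# Hypothetical variant of the line above with the suggested ordering applied.
queryset = Concept.objects.filter(
    id__in=batch_queryset.values_list('concept_id')
).filter(**filters).order_by('-id')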

if queryset.exists():
if start > 0:
with open('export.json', 'a') as out:
out.write(', ')
concept_versions = queryset.prefetch_related('names', 'descriptions')
data = concept_serializer_class(concept_versions, many=True).data
concept_string = json.dumps(data, cls=encoders.JSONEncoder)
concept_string = concept_string[1:-1]

with open('export.json', 'a') as out:
out.write(concept_string)

start += batch_size
end += batch_size
batch_queryset = concepts_qs.order_by('-concept_id')[start:end]

logger.info('Done serializing concepts.')
else:
logger.info('%s has no concepts to serialize.' % resource_name)
@@ -295,27 +288,34 @@ def get_batch(queryset, offset, limit):
with open('export.json', 'a') as out:
out.write('], "mappings": [')

if total_mappings:
logger.info(
'%s has %d mappings. Getting them in batches of %d...' % (resource_name, total_mappings, batch_size)
)
if mappings_qs.exists():
logger.info('%s has mappings. Getting them in batches of %d...' % (resource_name, batch_size))
mapping_serializer_class = get_class('core.mappings.serializers.MappingDetailSerializer')
for start in range(0, total_mappings, batch_size):
end = min(start + batch_size, total_mappings)
logger.info('Serializing mappings %d - %d...' % (start+1, end))
mappings = get_batch(all_mappings, start, end).select_related(
'from_concept', 'to_concept',
'from_source__organization', 'from_source__user',
'to_source__organization', 'to_source__user'
)
reference_serializer = mapping_serializer_class(mappings, many=True)
reference_data = reference_serializer.data
reference_string = json.dumps(reference_data, cls=encoders.JSONEncoder)
reference_string = reference_string[1:-1]
with open('export.json', 'a') as out:
out.write(reference_string)
if end != total_mappings:
out.write(', ')
start = 0
end = batch_size
batch_queryset = mappings_qs.order_by('-mapping_id')[start:end]

while batch_queryset.exists():
logger.info('Serializing mappings %d - %d...' % (start + 1, start + batch_size))
queryset = Mapping.objects.filter(id__in=batch_queryset.values_list('mapping_id')).filter(**filters)

@rkorytkowski (Contributor) commented on Aug 13, 2021:
"order by" here as well?

if queryset.exists():
if start > 0:
with open('export.json', 'a') as out:
out.write(', ')

mapping_versions = queryset.select_related(
'from_concept', 'to_concept', 'from_source__organization', 'from_source__user',
'to_source__organization', 'to_source__user')
data = mapping_serializer_class(mapping_versions, many=True).data
mapping_string = json.dumps(data, cls=encoders.JSONEncoder)
mapping_string = mapping_string[1:-1]
with open('export.json', 'a') as out:
out.write(mapping_string)

start += batch_size
end += batch_size
batch_queryset = mappings_qs.order_by('-mapping_id')[start:end]

@rkorytkowski (Contributor) commented on Aug 13, 2021:
One more point here. I don't know the exact inner workings, but we may want to recreate both querysets on each batch fetch in order to free up the cache. See https://docs.djangoproject.com/en/1.8/topics/db/queries/#caching-and-querysets

I assume that slicing creates a new queryset, but it clones the cache along. Is that right? Could you please verify?

@rkorytkowski (Contributor) commented on Aug 13, 2021:
Actually, I found this sentence: "Specifically, this means that limiting the queryset using an array slice or an index will not populate the cache." So we are safe here.
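For reference, a small sketch of the behaviour being relied on (Django ORM; names taken from the diff, the batch size and assert are illustrative): slicing an unevaluated queryset builds a fresh QuerySet each time, so no result cache accumulates on the parent:

# Illustrative sketch of the queryset-caching point above (not part of the commit).
# `mappings_qs` is the unevaluated lookup queryset from the diff; each slice is a
# brand-new QuerySet with its own LIMIT/OFFSET, so the parent never caches results.
base = mappings_qs.order_by('-mapping_id')
first_batch = base[0:1000]      # new QuerySet, evaluated only when iterated
second_batch = base[1000:2000]  # another new QuerySet; `base` itself is never evaluated
assert first_batch is not base and second_batch is not base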


logger.info('Done serializing mappings.')
else:
logger.info('%s has no mappings to serialize.' % resource_name)
