Skip to content

Commit

Permalink
OpenConceptLab/ocl_issues#412 | script to import v1 collection versio…
Browse files Browse the repository at this point in the history
…ns and ids
  • Loading branch information
snyaggarwal committed Mar 15, 2021
1 parent ccfcfc2 commit 9a872bd
Show file tree
Hide file tree
Showing 6 changed files with 362 additions and 4 deletions.
76 changes: 76 additions & 0 deletions core/importers/management/commands/import_v1_collection_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import json
import time
from pprint import pprint

from django.core.management import BaseCommand
from pydash import get

from core.collections.models import Collection


class Command(BaseCommand):
    """Set ``internal_reference_id`` on collections from the v1 id export files.

    Every line of the export files is parsed as a JSON document carrying an
    ``_id`` (whose ``$oid`` is used) and a ``uri``. The ``Collection`` rows
    matching that ``uri`` get their ``internal_reference_id`` set to the ``$oid``.
    """
    help = 'import v1 collection/version ids'

    # Export files consumed by handle(), in the order they are read.
    FILE_PATHS = (
        '/code/core/importers/v1_dump/data/exported_collection_ids.json',
        '/code/core/importers/v1_dump/data/exported_collectionversion_ids.json',
    )

    total = 0
    processed = 0
    created = []
    existed = []
    failed = []
    not_found = []
    start_time = None
    elapsed_seconds = 0

    @staticmethod
    def log(msg):
        """Print ``msg`` wrapped in asterisks so it stands out in the console."""
        print("*******{}*******".format(msg))

    def _reset_state(self):
        """Rebind per-run counters and result lists on the instance.

        The class-level lists are shared by every instance; without this,
        repeated runs in one process would keep appending to the same lists.
        """
        self.total = 0
        self.processed = 0
        self.created = []
        self.existed = []
        self.failed = []
        self.not_found = []

    def _read_lines(self):
        """Return the lines of all export files, closing each file after reading."""
        lines = []
        for path in self.FILE_PATHS:
            with open(path, 'r') as export_file:
                lines += export_file.readlines()
        return lines

    def handle(self, *args, **options):
        """Read the export files, update matching collections and print a report."""
        self.start_time = time.time()
        self._reset_state()
        lines = self._read_lines()

        self.log('STARTING COLLECTION/VERSION IDS IMPORT')
        self.total = len(lines)
        self.log('TOTAL: {}'.format(self.total))

        for line in lines:
            data = json.loads(line)
            original_data = data.copy()
            # Count the line up front so that lines failing below still show up
            # in the (processed/total) progress numbers.
            self.processed += 1
            try:
                _id = get(data.pop('_id'), '$oid')
                uri = data.pop('uri')
                # update() returns the number of matched rows; 0 means no
                # collection with this uri exists.
                updated = Collection.objects.filter(uri=uri).update(internal_reference_id=_id)
                if updated:
                    self.created.append(original_data)
                    self.log("Updated: {} ({}/{})".format(uri, self.processed, self.total))
                else:
                    self.not_found.append(original_data)
                    self.log("Not Found: {} ({}/{})".format(uri, self.processed, self.total))
            except Exception as ex:
                # Per-line boundary: record the failure and carry on with the
                # remaining lines instead of aborting the whole import.
                self.log("Failed: ")
                self.log(ex.args)
                self.failed.append({**original_data, 'errors': ex.args})

        self.elapsed_seconds = time.time() - self.start_time

        # NOTE: "Created" in this summary is the number of updated records.
        self.log(
            "Result (in {} secs) : Total: {} | Created: {} | NotFound: {} | Failed: {}".format(
                self.elapsed_seconds, self.total, len(self.created), len(self.not_found), len(self.failed)
            )
        )

        for title, records in (
                ("Existed", self.existed), ("Failed", self.failed), ("Not Found", self.not_found)
        ):
            if records:
                self.log(title)
                pprint(records)
134 changes: 134 additions & 0 deletions core/importers/management/commands/import_v1_collection_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import json
from pprint import pprint

from django.core.management import BaseCommand
from pydash import get

from core.collections.models import Collection, CollectionReference
from core.collections.utils import is_concept
from core.concepts.documents import ConceptDocument
from core.concepts.models import Concept
from core.mappings.documents import MappingDocument
from core.mappings.models import Mapping
from core.users.models import UserProfile


class Command(BaseCommand):
    """Import v1 collection versions from a line-delimited JSON export.

    Each line is parsed as one JSON document. A ``Collection`` row is created
    for it unless one with the same ``uri`` already exists, and the concepts
    and mappings named by its reference expressions are attached.
    """
    help = 'import v1 collection versions'

    total = 0
    processed = 0
    created = []
    existed = []
    failed = []
    not_found_expressions = dict()

    # v1 attributes that are dropped before creating the Collection row.
    IGNORED_ATTRS = (
        'active_concepts', 'active_mappings', 'last_child_update', 'last_concept_update', 'last_mapping_update',
        'parent_version_id', 'previous_version_id', 'versioned_object_type_id', 'concepts', 'mappings'
    )

    @staticmethod
    def log(msg):
        """Print ``msg`` wrapped in asterisks so it stands out in the console."""
        print("*******{}*******".format(msg))

    def add_in_not_found_expression(self, collection_uri, expression):
        """Remember ``expression`` as unresolved for the collection at ``collection_uri``."""
        self.not_found_expressions.setdefault(collection_uri, []).append(expression)

    def _reset_state(self):
        """Rebind per-run counters and result containers on the instance.

        The class-level containers are shared by every instance; without this,
        repeated runs in one process would keep accumulating into them.
        """
        self.total = 0
        self.processed = 0
        self.created = []
        self.existed = []
        self.failed = []
        self.not_found_expressions = dict()

    @staticmethod
    def _resolve_users(data):
        """Replace ``created_by``/``updated_by`` usernames in ``data`` with user objects.

        A value is left untouched when no UserProfile has that username.
        """
        for field in ('created_by', 'updated_by'):
            user = UserProfile.objects.filter(username=data.get(field)).first()
            if user:
                data[field] = user

    def _attach_references(self, collection, uri, references):
        """Link resolvable reference expressions to ``collection`` and index them.

        Expressions whose concept/mapping cannot be found are recorded under
        ``uri`` in ``not_found_expressions`` and no reference is saved for them.
        """
        saved_references = []
        concepts = []
        mappings = []
        for ref in references:
            expression = ref.get('expression')
            if is_concept(expression):
                resource = Concept.objects.filter(uri=expression).first()
                bucket = concepts
            else:
                resource = Mapping.objects.filter(uri=expression).first()
                bucket = mappings

            if not resource:
                self.add_in_not_found_expression(uri, expression)
                continue

            bucket.append(resource)
            reference = CollectionReference(expression=expression)
            reference.save()
            saved_references.append(reference)

        collection.references.set(saved_references)
        collection.concepts.set(concepts)
        collection.mappings.set(mappings)
        collection.batch_index(collection.concepts, ConceptDocument)
        collection.batch_index(collection.mappings, MappingDocument)

    def handle(self, *args, **options):
        """Read the export file, create missing collection versions and print a report."""
        FILE_PATH = '/code/core/importers/v1_dump/data/exported_collectionversions.json'
        with open(FILE_PATH, 'r') as export_file:
            lines = export_file.readlines()

        self._reset_state()
        self.log('STARTING COLLECTION VERSIONS IMPORT')
        self.total = len(lines)
        self.log('TOTAL: {}'.format(self.total))

        for line in lines:
            data = json.loads(line)
            original_data = data.copy()
            self.processed += 1
            _id = data.pop('_id')
            data['internal_reference_id'] = get(_id, '$oid')
            for attr in self.IGNORED_ATTRS:
                data.pop(attr, None)

            data['snapshot'] = data.pop('collection_snapshot', None)
            data['external_id'] = data.pop('version_external_id', None)

            versioned_object_id = data.pop('versioned_object_id')
            versioned_object = Collection.objects.filter(internal_reference_id=versioned_object_id).first()
            version = data.pop('mnemonic')
            if versioned_object is None:
                # Without the versioned collection the owner/type fields cannot be
                # filled in; record the failure instead of aborting the whole
                # import with an AttributeError.
                self.log(
                    "Failed: versioned object not found for {} ({}/{})".format(version, self.processed, self.total)
                )
                self.failed.append(original_data)
                continue

            created_at = data.pop('created_at')
            updated_at = data.pop('updated_at')
            self._resolve_users(data)
            data['created_at'] = get(created_at, '$date')
            data['updated_at'] = get(updated_at, '$date')
            data['organization_id'] = versioned_object.organization_id
            data['user_id'] = versioned_object.user_id
            data['collection_type'] = versioned_object.collection_type
            # A missing or null ``references`` entry is treated as "no references".
            references = data.pop('references', None) or []

            self.log("Processing: {} ({}/{})".format(version, self.processed, self.total))
            uri = data['uri']
            if Collection.objects.filter(uri=uri).exists():
                self.existed.append(original_data)
                continue

            collection = Collection.objects.create(**data, version=version, mnemonic=versioned_object.mnemonic)
            if not collection.id:
                self.failed.append(original_data)
                continue

            self.created.append(original_data)
            self._attach_references(collection, uri, references)

        self.log(
            "Result: Created: {} | Existed: {} | Failed: {}".format(
                len(self.created), len(self.existed), len(self.failed)
            )
        )
        if self.existed:
            self.log("Existed")
            pprint(self.existed)
        if self.failed:
            self.log("Failed")
            pprint(self.failed)
        if self.not_found_expressions:
            self.log('Expressions Not Added')
            pprint(self.not_found_expressions)
141 changes: 141 additions & 0 deletions core/importers/management/commands/import_v1_collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import json
from pprint import pprint

from django.core.management import BaseCommand
from pydash import get

from core.collections.models import Collection, CollectionReference
from core.collections.utils import is_concept
from core.common.constants import HEAD
from core.concepts.documents import ConceptDocument
from core.concepts.models import Concept
from core.mappings.documents import MappingDocument
from core.mappings.models import Mapping
from core.orgs.models import Organization
from core.users.models import UserProfile


class Command(BaseCommand):
    """Import v1 collections (as HEAD versions) from a line-delimited JSON export.

    Each line is parsed as one JSON document. A ``Collection`` row with
    ``version=HEAD`` is created for it unless one with the same ``uri`` already
    exists, and the concepts and mappings named by its reference expressions
    are attached.
    """
    help = 'import v1 collections'

    total = 0
    processed = 0
    created = []
    existed = []
    failed = []
    parents = dict()
    users = dict()
    references = dict()
    not_found_expressions = dict()

    @staticmethod
    def log(msg):
        """Print ``msg`` wrapped in asterisks so it stands out in the console."""
        print("*******{}*******".format(msg))

    def get_parent(self, parent_id, uri):
        """Return the owner kwargs for a collection, cached by ``parent_id``.

        The result is ``{'organization': ...}`` when ``uri`` contains ``/orgs/``
        and ``{'user': ...}`` otherwise; the value is ``None`` when no row has
        ``internal_reference_id == parent_id``.
        """
        if parent_id not in self.parents:
            if '/orgs/' in uri:
                result = dict(organization=Organization.objects.filter(internal_reference_id=parent_id).first())
            else:
                result = dict(user=UserProfile.objects.filter(internal_reference_id=parent_id).first())
            self.parents[parent_id] = result

        return self.parents[parent_id]

    def add_in_not_found_expression(self, collection_uri, expression):
        """Remember ``expression`` as unresolved for the collection at ``collection_uri``."""
        self.not_found_expressions.setdefault(collection_uri, []).append(expression)

    def _reset_state(self):
        """Rebind per-run counters, caches and result containers on the instance.

        The class-level containers are shared by every instance; without this,
        repeated runs in one process would keep accumulating into them.
        """
        self.total = 0
        self.processed = 0
        self.created = []
        self.existed = []
        self.failed = []
        self.parents = dict()
        self.not_found_expressions = dict()

    @staticmethod
    def _resolve_users(data):
        """Replace ``created_by``/``updated_by`` usernames in ``data`` with user objects.

        A value is left untouched when no UserProfile has that username.
        """
        for field in ('created_by', 'updated_by'):
            user = UserProfile.objects.filter(username=data.get(field)).first()
            if user:
                data[field] = user

    def _attach_references(self, collection, uri, references):
        """Link resolvable reference expressions to ``collection`` and index them.

        Expressions whose concept/mapping cannot be found are recorded under
        ``uri`` in ``not_found_expressions`` and no reference is saved for them.
        """
        saved_references = []
        concepts = []
        mappings = []
        for ref in references:
            expression = ref.get('expression')
            if is_concept(expression):
                resource = Concept.objects.filter(uri=expression).first()
                bucket = concepts
            else:
                resource = Mapping.objects.filter(uri=expression).first()
                bucket = mappings

            if not resource:
                self.add_in_not_found_expression(uri, expression)
                continue

            bucket.append(resource)
            reference = CollectionReference(expression=expression)
            reference.save()
            saved_references.append(reference)

        collection.references.set(saved_references)
        collection.concepts.set(concepts)
        collection.mappings.set(mappings)
        collection.batch_index(collection.concepts, ConceptDocument)
        collection.batch_index(collection.mappings, MappingDocument)

    def handle(self, *args, **options):
        """Read the export file, create missing collections and print a report."""
        FILE_PATH = '/code/core/importers/v1_dump/data/exported_collections.json'
        with open(FILE_PATH, 'r') as export_file:
            lines = export_file.readlines()

        self._reset_state()
        self.log('STARTING COLLECTION IMPORT')
        self.total = len(lines)
        self.log('TOTAL: {}'.format(self.total))

        for line in lines:
            data = json.loads(line)
            original_data = data.copy()
            self.processed += 1
            _id = data.pop('_id')
            for attr in ['parent_type_id', 'concepts', 'mappings']:
                data.pop(attr, None)

            parent_id = data.pop('parent_id')
            created_at = data.pop('created_at')
            updated_at = data.pop('updated_at')
            # A missing or null ``references`` entry is treated as "no references".
            references = data.pop('references', None) or []
            self._resolve_users(data)
            data['internal_reference_id'] = get(_id, '$oid')
            data['created_at'] = get(created_at, '$date')
            data['updated_at'] = get(updated_at, '$date')
            mnemonic = data.get('mnemonic')
            data = {**data, **self.get_parent(parent_id, data['uri'])}

            self.log("Processing: {} ({}/{})".format(mnemonic, self.processed, self.total))
            uri = data['uri']
            if Collection.objects.filter(uri=uri).exists():
                self.existed.append(original_data)
                continue

            collection = Collection.objects.create(**data, version=HEAD)
            if not collection.id:
                self.failed.append(original_data)
                continue

            self.created.append(original_data)
            self._attach_references(collection, uri, references)

        self.log(
            "Result: Created: {} | Existed: {} | Failed: {}".format(
                len(self.created), len(self.existed), len(self.failed)
            )
        )
        if self.existed:
            self.log("Existed")
            pprint(self.existed)
        if self.failed:
            self.log("Failed")
            pprint(self.failed)
        if self.not_found_expressions:
            self.log('Expressions Not Added')
            pprint(self.not_found_expressions)

4 changes: 4 additions & 0 deletions core/importers/management/commands/import_v1_concept_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,7 @@ def handle(self, *args, **options):
if self.failed:
self.log("Failed")
pprint(self.failed)

if self.not_found:
self.log("Not Found")
pprint(self.not_found)
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ def handle(self, *args, **options):
data['updated_by'] = qs.first()
data['created_at'] = get(created_at, '$date')
data['updated_at'] = get(updated_at, '$date')
data['organization'] = versioned_object.organization
data['user'] = versioned_object.user
data['organization_id'] = versioned_object.organization_id
data['user_id'] = versioned_object.user_id
data['source_type'] = versioned_object.source_type

self.log("Processing: {} ({}/{})".format(version, self.processed, self.total))
if Source.objects.filter(uri=data['uri']).exists():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
mongo "localhost:27017/ocl" ./export_collections_and_versions.js
mongoexport --db ocl --collection export.collections -o ../data/exported_collections.json
mongoexport --db ocl --collection export.collectionversions -o ../data/exported_collectionversions.json
mongoexport --db ocl --collection export.collections -o exported_collections.json
mongoexport --db ocl --collection export.collection_ids -o exported_collection_ids.json
mongoexport --db ocl --collection export.collectionversion_ids -o exported_collectionversion_ids.json
mongoexport --db ocl --collection export.collectionversions -o exported_collectionversions.json

0 comments on commit 9a872bd

Please sign in to comment.