
Commit

Merge pull request #3107 from SEED-platform/3029-media-volume-data
management task to clean up deleted uploaded files
macintoshpie committed Feb 2, 2022
2 parents 2643198 + 0b70716 commit 10ead69
Showing 4 changed files with 140 additions and 152 deletions.
5 changes: 5 additions & 0 deletions seed/data_importer/managers.py
@@ -12,3 +12,8 @@ class NotDeletedManager(models.Manager):

def get_queryset(self, *args, **kwargs):
return super().get_queryset(*args, **kwargs).exclude(deleted=True)

def get_all(self, *args, **kwargs):
"""Method to return ALL ImportFiles, including the ones where `deleted == True` which are normally excluded.
This is used for database/filesystem cleanup."""
return super().get_queryset(*args, **kwargs)
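
A minimal usage sketch of the new manager method, assuming ImportFile.objects is an instance of this NotDeletedManager (which the management command added below relies on):

from seed.data_importer.models import ImportFile

# Default queryset: rows soft-deleted with deleted=True are excluded
visible = ImportFile.objects.all()

# get_all(): every ImportFile, including soft-deleted ones, for cleanup work
everything = ImportFile.objects.get_all()
flagged = ImportFile.objects.get_all().filter(deleted=True)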
29 changes: 29 additions & 0 deletions seed/data_importer/migrations/0016_auto_20220119_1347.py
@@ -0,0 +1,29 @@
# Generated by Django 3.2.7 on 2022-01-19 21:47

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('data_importer', '0015_auto_20210712_2134'),
]

operations = [
migrations.RemoveField(
model_name='importfile',
name='export_file',
),
migrations.RemoveField(
model_name='importrecord',
name='import_completed_at',
),
migrations.RemoveField(
model_name='importrecord',
name='mcm_version',
),
migrations.RemoveField(
model_name='importrecord',
name='merge_completed_at',
),
]
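
A hedged sketch for previewing and applying this migration with Django's standard commands (the app label and migration prefix come from the file above; the exact SQL depends on the database backend):

from django.core.management import call_command

# Print the SQL this migration would run (typically one ALTER TABLE ... DROP COLUMN per RemoveField)
call_command('sqlmigrate', 'data_importer', '0016')

# Apply it as part of a normal deploy
call_command('migrate', 'data_importer')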
155 changes: 3 additions & 152 deletions seed/data_importer/models.py
@@ -13,12 +13,9 @@

from urllib.parse import unquote

from django.contrib.auth.models import User
from django.urls import reverse
from django.db import models
from django.db.models import Q
from django.utils import timezone
from django.utils.timesince import timesince
from django_extensions.db.models import TimeStampedModel

from config.utils import de_camel_case
@@ -111,16 +108,10 @@ class ImportRecord(NotDeletableModel):
is_imported_live = models.BooleanField(default=False)
keep_missing_buildings = models.BooleanField(default=True)
status = models.IntegerField(default=0, choices=IMPORT_STATUSES)
import_completed_at = models.DateTimeField(blank=True, null=True)
merge_completed_at = models.DateTimeField(blank=True, null=True)
mcm_version = models.IntegerField(blank=True, null=True)
super_organization = models.ForeignKey(
SuperOrganization, on_delete=models.CASCADE, blank=True, null=True, related_name='import_records'
)

# destination_taxonomy = models.ForeignKey('lin.Taxonomy', blank=True, null=True)
# source_taxonomy = models.ForeignKey('lin.Taxonomy', blank=True, null=True)

def __str__(self):
return 'ImportRecord %s: %s, started at %s' % (self.pk, self.name, self.start_time)

@@ -506,7 +497,6 @@ def mark_merged(self):
self.merge_analysis_done = True
self.merge_analysis_active = False
self.is_imported_live = True
self.import_completed_at = timezone.now()
self.save()

def mark_merge_started(self):
@@ -534,112 +524,6 @@ def SUMMARY_ANALYSIS_ACTIVE_KEY(cls, pk):
def SUMMARY_ANALYSIS_QUEUED_KEY(cls, pk):
return 'SUMMARY_ANALYSIS_QUEUED%s' % pk

def prefixed_pk(self, pk, max_len_before_prefix=(SOURCE_FACILITY_ID_MAX_LEN - len('IMP1234-'))):
"""This is a total hack to support prefixing until source_facility_id
is turned into a proper pk. Prefixes a given pk with the import_record"""
if len('%s' % pk) > max_len_before_prefix:
m = hashlib.md5()
m.update(pk)
digest = m.hexdigest()
# TODO: precompute this if condition based on md5 alg and SFID MAX LEN
if len(digest) > max_len_before_prefix:
digest = digest[:max_len_before_prefix]
transformed_pk = digest
else:
transformed_pk = pk
return 'IMP%s-%s' % (self.pk, transformed_pk)

@property
def to_json(self):
try:
last_modified_by = ''
try:
if self.last_modified_by:
last_modified_by = self.last_modified_by.email or ''
except User.DoesNotExist:
pass
return json.dumps({
'name': self.name,
'app': self.app,
'last_modified_time_ago': timesince(self.updated_at).split(',')[0],
'last_modified_seconds_ago': -1 * (
self.updated_at - timezone.now()).total_seconds(),
'last_modified_by': last_modified_by,
'notes': self.notes,
'merge_analysis_done': self.merge_analysis_done,
'merge_analysis_active': self.merge_analysis_active,
'merge_analysis_queued': self.merge_analysis_queued,
'premerge_analysis_done': self.premerge_analysis_done,
'premerge_analysis_active': self.premerge_analysis_active,
'premerge_analysis_queued': self.premerge_analysis_queued,
'matching_active': self.matching_active,
'matching_done': self.matching_done,
'is_imported_live': self.is_imported_live,
'num_files': self.num_files,
'keep_missing_buildings': self.keep_missing_buildings,
'dashboard_url': self.dashboard_url,
'delete_url': self.delete_url,
'search_url': self.search_url,
'status_url': self.status_url,
'display_as_in_progress': self.display_as_in_progress,
'worksheet_url': self.worksheet_url,
'is_not_in_progress': self.is_not_in_progress,
'save_import_meta_url': self.save_import_meta_url,
'percent_files_ready_to_merge': self.percent_files_ready_to_merge,
'status': self.status,
'status_text': self.IMPORT_STATUSES[self.status][1],
'status_percent': round(self.status_percent, 0),
'status_numerator': self.status_numerator,
'status_denominator': self.status_denominator,
'status_is_live': self.status_is_live,
'is_mapping_or_cleaning': self.is_mapping_or_cleaning,
'num_buildings_imported_total': self.num_buildings_imported_total,
})
except BaseException:
from traceback import print_exc
print_exc()
return {}

# TODO #239: This is not used. Should we enable it again, why?
@property
def worksheet_progress_json(self):
progresses = []
some_file_has_mapping_active = not get_cache_state(self.MAPPING_ACTIVE_KEY, False)
try:
for f in self.files:
progresses.append({
'pk': f.pk,
'filename': f.filename_only,
'delete_url': reverse('%s:delete_file' % self.app_namespace, args=(f.pk,)),
'mapping_url': reverse('%s:mapping' % self.app_namespace, args=(f.pk,)),
'cleaning_url': reverse('%s:cleaning' % self.app_namespace, args=(f.pk,)),
'matching_url': reverse('%s:matching' % self.app_namespace, args=(f.pk,)),
'num_columns': f.num_columns,
'num_rows': f.num_rows,
'num_mapping_complete': f.num_mapping_complete,
'num_mapping_total': f.num_mapping_total,
'num_mapping_remaining': f.num_mapping_remaining,
'mapping_active': f.mapping_active,
'some_file_has_mapping_active': some_file_has_mapping_active,
'coercion_mapping_active': f.coercion_mapping_active,
'cleaning_progress_pct': round(f.cleaning_progress_pct, 1),
'num_cleaning_remaining': f.num_cleaning_remaining,
'num_cleaning_complete': f.num_cleaning_complete,
'num_cleaning_total': f.num_cleaning_total,
'export_ready': f.export_ready,
'export_generation_pct_complete': int(round(f.export_generation_pct_complete)),
'export_url': f.export_url,
'worksheet_url': self.worksheet_url,
'generate_url': f.generate_url,
'premerge_progress_url': f.premerge_progress_url,
'merge_progress_url': f.merge_progress_url,
'force_restart_cleaning_url': f.force_restart_cleaning_url,
})
except BaseException:
from traceback import print_exc
print_exc()
return json.dumps(progresses)


class ImportFile(NotDeletableModel, TimeStampedModel):
import_record = models.ForeignKey(ImportRecord, on_delete=models.CASCADE)
@@ -651,9 +535,7 @@ class ImportFile(NotDeletableModel, TimeStampedModel):
# extension.
uploaded_filename = models.CharField(blank=True, max_length=255)
file_size_in_bytes = models.IntegerField(blank=True, null=True)
export_file = models.FileField(
upload_to='data_imports/exports', blank=True, null=True
)

cached_first_row = models.TextField(blank=True, null=True)
# Save a list of the final column mapping names that were used for this file.
# This should really be a many-to-many with the column/ColumnMapping table.
@@ -681,9 +563,9 @@ class ImportFile(NotDeletableModel, TimeStampedModel):
raw_save_completion = models.IntegerField(blank=True, null=True)
source_type = models.CharField(null=True, blank=True, max_length=63)
# program names should match a value in common.mapper.Programs
source_program = models.CharField(blank=True, max_length=80) # don't think that this is used
source_program = models.CharField(blank=True, max_length=80)
# program version is in format 'x.y[.z]'
source_program_version = models.CharField(blank=True, max_length=40) # don't think this is used
source_program_version = models.CharField(blank=True, max_length=40)
# Used by the BuildingSync import flow to link property states to file names (necessary for zip files)
raw_property_state_to_filename = models.JSONField(default=dict, blank=True)

@@ -1010,37 +892,6 @@ def coercion_mapping_queued(self):
def SAVE_COUNTER_CACHE_KEY(self):
return 'SAVE_COUNTER_KEY%s' % self.pk

@property
def EXPORT_READY_CACHE_KEY(self):
return 'EXPORT_READY%s' % self.pk

@property
def EXPORT_PCT_COMPLETE_CACHE_KEY(self):
return 'EXPORT_PCT_COMPLETE%s' % self.pk

@property
def EXPORT_QUEUED_CACHE_KEY(self):
return 'EXPORT_QUEUED%s' % self.pk

@property
def export_ready(self):
return get_cache_state(self.EXPORT_READY_CACHE_KEY,
True) and self.export_file is not None and self.export_file != ''

@property
def export_generation_pct_complete(self):
return get_cache_state(self.EXPORT_PCT_COMPLETE_CACHE_KEY, False)

@property
def export_url(self):
ns = self.import_record.app_namespace
return reverse('%s:download_export' % ns, args=(self.pk,))

@property
def generate_url(self):
ns = self.import_record.app_namespace
return reverse('%s:prepare_export' % ns, args=(self.pk,))

@property
def merge_progress_url(self):
return reverse('data_importer:merge_progress', args=(self.pk,))
103 changes: 103 additions & 0 deletions seed/management/commands/delete_unused_uploaded_files.py
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
:copyright (c) 2014 - 2022, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Department of Energy) and contributors. All rights reserved. # NOQA
:author
"""
from __future__ import unicode_literals

from django.core.management.base import BaseCommand

from seed.data_importer.models import ImportFile

import os


class Command(BaseCommand):
help = 'Cleans up uploaded files for ImportFiles that have been flagged as deleted'

def add_arguments(self, parser):
parser.add_argument('--org_id',
default=None,
help='ID of a specific organization to operate on',
action='store',
dest='org_id')

def handle(self, *args, **options):
# In local dev and production the files are in the media/uploads folder.
#
# Steps to process
# 1. The database has various paths of all the files as we have been
# moving the files around and the database paths have not been updated
# The first task is to update the file paths to /seed/media/uploads for the files
# 2. There is a deleted column in SEED that is updated when a user removes the
# file from the frontend; however, SEED persists the file. This task
# deletes the files from disk (if they exist), but it does not delete the
# ImportFile record itself

org_id = options['org_id']
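# Note: get_all() is the NotDeletedManager method added above; unlike the default
# queryset it also returns ImportFiles that have been flagged deleted=True.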
if org_id:
files = ImportFile.objects.get_all().filter(import_record__super_organization=org_id) # this actually returns a queryset
else:
files = ImportFile.objects.get_all()

# fix the file path if it is not /seed/media/uploads
# populate the list of files - this may need to be broken up into multiple tasks on the real data
# based on how large the directory / table is.
rename_files = []
for f in files:
filename = f.file.name
if os.path.exists(filename):
# don't do anything if the path exists
continue

# the filename/path is not correct and needs to be updated
# put it in a tuple of db object, oldname, new name
new_base_path = "/seed/media/uploads"
if filename.startswith(new_base_path):
# don't do anything, file name is in the right format for
# docker mounted media
continue
elif filename == "":
# no file attached
continue
else:
if 'pm_imports/' in filename:
# this is a special folder that needs to persist in the uploads directory
rename_files.append((f, filename, f"{new_base_path}/pm_imports/{os.path.basename(filename)}"))
else:
rename_files.append((f, filename, f"{new_base_path}/{os.path.basename(filename)}"))

self.stdout.write('******** LIST OF IMPORT FILE PATHS TO RENAME *********')
for f in rename_files:
print(f"Will rename {f[1]} to {f[2]}")
self.stdout.write('******** END OF LIST (list may be blank) *********')
f = input("Are you sure you want to rename all of the files above? Use with caution! [Y/y]? ")
if f.lower() == 'y':
for f in rename_files:
f_db = f[0]
print(f"Renaming {f[1]} to {f[2]}")
f_db.file.name = f[2]
f_db.save()
self.stdout.write('Done renaming', ending='\n')
else:
self.stdout.write('Not renaming, will not continue, exiting')
exit()

# now go through and find the deleted=True and remove the records
if org_id:
files = ImportFile.objects.get_all().filter(deleted=True, import_record__super_organization=org_id).exclude(file__exact='')
else:
files = ImportFile.objects.get_all().filter(deleted=True).exclude(file__exact='')

f = input(f"Are you sure you want to delete {len(files)} InputFiles that have been marked with 'deleted'? Use with caution! [Y/y]? ")
if f.lower() == 'y':
for fil in files:
filename = fil.file.name
self.stdout.write(f"Deleting file {filename}")
# regardless of whether the file exists on disk or not,
fil.file.delete(save=True)

self.stdout.write('Done deleting flagged record files; the records themselves still exist', ending='\n')
else:
self.stdout.write('Not deleting, exiting')
exit()
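
A minimal sketch of invoking the new command from Python (the command name comes from the file path above; the organization id is illustrative only):

from django.core.management import call_command

# Clean up files for a single organization; the command still prompts for confirmation
call_command('delete_unused_uploaded_files', org_id=1)

# Equivalent CLI: ./manage.py delete_unused_uploaded_files --org_id 1

# Or run across all organizations
call_command('delete_unused_uploaded_files')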
