
Commit

Merge pull request #3107 from SEED-platform/3029-media-volume-data
management task to clean up deleted uploaded files
macintoshpie committed Feb 2, 2022
2 parents 2643198 + 0b70716 commit 10ead69
Showing 4 changed files with 140 additions and 152 deletions.
5 changes: 5 additions & 0 deletions seed/data_importer/managers.py
@@ -12,3 +12,8 @@ class NotDeletedManager(models.Manager):

def get_queryset(self, *args, **kwargs):
return super().get_queryset(*args, **kwargs).exclude(deleted=True)

def get_all(self, *args, **kwargs):
"""Method to return ALL ImportFiles, including the ones where `deleted == True` which are normally excluded.
This is used for database/filesystem cleanup."""
return super().get_queryset(*args, **kwargs)
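
A minimal usage sketch of the new manager method, assuming ImportFile.objects is an instance of this NotDeletedManager (which the management command added below relies on):

from seed.data_importer.models import ImportFile

# Default queryset: rows soft-deleted with deleted=True are excluded
visible = ImportFile.objects.all()

# get_all(): every ImportFile, including soft-deleted ones, for cleanup work
everything = ImportFile.objects.get_all()
flagged = ImportFile.objects.get_all().filter(deleted=True)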
29 changes: 29 additions & 0 deletions seed/data_importer/migrations/0016_auto_20220119_1347.py
@@ -0,0 +1,29 @@
# Generated by Django 3.2.7 on 2022-01-19 21:47

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('data_importer', '0015_auto_20210712_2134'),
]

operations = [
migrations.RemoveField(
model_name='importfile',
name='export_file',
),
migrations.RemoveField(
model_name='importrecord',
name='import_completed_at',
),
migrations.RemoveField(
model_name='importrecord',
name='mcm_version',
),
migrations.RemoveField(
model_name='importrecord',
name='merge_completed_at',
),
]
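
A hedged sketch for previewing and applying this migration with Django's standard commands (the app label and migration prefix come from the file above; the exact SQL depends on the database backend):

from django.core.management import call_command

# Print the SQL this migration would run (typically one ALTER TABLE ... DROP COLUMN per RemoveField)
call_command('sqlmigrate', 'data_importer', '0016')

# Apply it as part of a normal deploy
call_command('migrate', 'data_importer')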
155 changes: 3 additions & 152 deletions seed/data_importer/models.py
@@ -13,12 +13,9 @@

from urllib.parse import unquote

from django.contrib.auth.models import User
from django.urls import reverse
from django.db import models
from django.db.models import Q
from django.utils import timezone
from django.utils.timesince import timesince
from django_extensions.db.models import TimeStampedModel

from config.utils import de_camel_case
@@ -111,16 +108,10 @@ class ImportRecord(NotDeletableModel):
is_imported_live = models.BooleanField(default=False)
keep_missing_buildings = models.BooleanField(default=True)
status = models.IntegerField(default=0, choices=IMPORT_STATUSES)
import_completed_at = models.DateTimeField(blank=True, null=True)
merge_completed_at = models.DateTimeField(blank=True, null=True)
mcm_version = models.IntegerField(blank=True, null=True)
super_organization = models.ForeignKey(
SuperOrganization, on_delete=models.CASCADE, blank=True, null=True, related_name='import_records'
)

# destination_taxonomy = models.ForeignKey('lin.Taxonomy', blank=True, null=True)
# source_taxonomy = models.ForeignKey('lin.Taxonomy', blank=True, null=True)

def __str__(self):
return 'ImportRecord %s: %s, started at %s' % (self.pk, self.name, self.start_time)

@@ -506,7 +497,6 @@ def mark_merged(self):
self.merge_analysis_done = True
self.merge_analysis_active = False
self.is_imported_live = True
self.import_completed_at = timezone.now()
self.save()

def mark_merge_started(self):
@@ -534,112 +524,6 @@ def SUMMARY_ANALYSIS_ACTIVE_KEY(cls, pk):
def SUMMARY_ANALYSIS_QUEUED_KEY(cls, pk):
return 'SUMMARY_ANALYSIS_QUEUED%s' % pk

def prefixed_pk(self, pk, max_len_before_prefix=(SOURCE_FACILITY_ID_MAX_LEN - len('IMP1234-'))):
"""This is a total hack to support prefixing until source_facility_id
is turned into a proper pk. Prefixes a given pk with the import_record"""
if len('%s' % pk) > max_len_before_prefix:
m = hashlib.md5()
m.update(pk)
digest = m.hexdigest()
# TODO: precompute this if condition based on md5 alg and SFID MAX LEN
if len(digest) > max_len_before_prefix:
digest = digest[:max_len_before_prefix]
transformed_pk = digest
else:
transformed_pk = pk
return 'IMP%s-%s' % (self.pk, transformed_pk)

@property
def to_json(self):
try:
last_modified_by = ''
try:
if self.last_modified_by:
last_modified_by = self.last_modified_by.email or ''
except User.DoesNotExist:
pass
return json.dumps({
'name': self.name,
'app': self.app,
'last_modified_time_ago': timesince(self.updated_at).split(',')[0],
'last_modified_seconds_ago': -1 * (
self.updated_at - timezone.now()).total_seconds(),
'last_modified_by': last_modified_by,
'notes': self.notes,
'merge_analysis_done': self.merge_analysis_done,
'merge_analysis_active': self.merge_analysis_active,
'merge_analysis_queued': self.merge_analysis_queued,
'premerge_analysis_done': self.premerge_analysis_done,
'premerge_analysis_active': self.premerge_analysis_active,
'premerge_analysis_queued': self.premerge_analysis_queued,
'matching_active': self.matching_active,
'matching_done': self.matching_done,
'is_imported_live': self.is_imported_live,
'num_files': self.num_files,
'keep_missing_buildings': self.keep_missing_buildings,
'dashboard_url': self.dashboard_url,
'delete_url': self.delete_url,
'search_url': self.search_url,
'status_url': self.status_url,
'display_as_in_progress': self.display_as_in_progress,
'worksheet_url': self.worksheet_url,
'is_not_in_progress': self.is_not_in_progress,
'save_import_meta_url': self.save_import_meta_url,
'percent_files_ready_to_merge': self.percent_files_ready_to_merge,
'status': self.status,
'status_text': self.IMPORT_STATUSES[self.status][1],
'status_percent': round(self.status_percent, 0),
'status_numerator': self.status_numerator,
'status_denominator': self.status_denominator,
'status_is_live': self.status_is_live,
'is_mapping_or_cleaning': self.is_mapping_or_cleaning,
'num_buildings_imported_total': self.num_buildings_imported_total,
})
except BaseException:
from traceback import print_exc
print_exc()
return {}

# TODO #239: This is not used. Should we enable it again, why?
@property
def worksheet_progress_json(self):
progresses = []
some_file_has_mapping_active = not get_cache_state(self.MAPPING_ACTIVE_KEY, False)
try:
for f in self.files:
progresses.append({
'pk': f.pk,
'filename': f.filename_only,
'delete_url': reverse('%s:delete_file' % self.app_namespace, args=(f.pk,)),
'mapping_url': reverse('%s:mapping' % self.app_namespace, args=(f.pk,)),
'cleaning_url': reverse('%s:cleaning' % self.app_namespace, args=(f.pk,)),
'matching_url': reverse('%s:matching' % self.app_namespace, args=(f.pk,)),
'num_columns': f.num_columns,
'num_rows': f.num_rows,
'num_mapping_complete': f.num_mapping_complete,
'num_mapping_total': f.num_mapping_total,
'num_mapping_remaining': f.num_mapping_remaining,
'mapping_active': f.mapping_active,
'some_file_has_mapping_active': some_file_has_mapping_active,
'coercion_mapping_active': f.coercion_mapping_active,
'cleaning_progress_pct': round(f.cleaning_progress_pct, 1),
'num_cleaning_remaining': f.num_cleaning_remaining,
'num_cleaning_complete': f.num_cleaning_complete,
'num_cleaning_total': f.num_cleaning_total,
'export_ready': f.export_ready,
'export_generation_pct_complete': int(round(f.export_generation_pct_complete)),
'export_url': f.export_url,
'worksheet_url': self.worksheet_url,
'generate_url': f.generate_url,
'premerge_progress_url': f.premerge_progress_url,
'merge_progress_url': f.merge_progress_url,
'force_restart_cleaning_url': f.force_restart_cleaning_url,
})
except BaseException:
from traceback import print_exc
print_exc()
return json.dumps(progresses)


class ImportFile(NotDeletableModel, TimeStampedModel):
import_record = models.ForeignKey(ImportRecord, on_delete=models.CASCADE)
@@ -651,9 +535,7 @@ class ImportFile(NotDeletableModel, TimeStampedModel):
# extension.
uploaded_filename = models.CharField(blank=True, max_length=255)
file_size_in_bytes = models.IntegerField(blank=True, null=True)
export_file = models.FileField(
upload_to='data_imports/exports', blank=True, null=True
)

cached_first_row = models.TextField(blank=True, null=True)
# Save a list of the final column mapping names that were used for this file.
# This should really be a many-to-many with the column/ColumnMapping table.
@@ -681,9 +563,9 @@ class ImportFile(NotDeletableModel, TimeStampedModel):
raw_save_completion = models.IntegerField(blank=True, null=True)
source_type = models.CharField(null=True, blank=True, max_length=63)
# program names should match a value in common.mapper.Programs
source_program = models.CharField(blank=True, max_length=80) # don't think that this is used
source_program = models.CharField(blank=True, max_length=80)
# program version is in format 'x.y[.z]'
source_program_version = models.CharField(blank=True, max_length=40) # don't think this is used
source_program_version = models.CharField(blank=True, max_length=40)
# Used by the BuildingSync import flow to link property states to file names (necessary for zip files)
raw_property_state_to_filename = models.JSONField(default=dict, blank=True)

@@ -1010,37 +892,6 @@ def coercion_mapping_queued(self):
def SAVE_COUNTER_CACHE_KEY(self):
return 'SAVE_COUNTER_KEY%s' % self.pk

@property
def EXPORT_READY_CACHE_KEY(self):
return 'EXPORT_READY%s' % self.pk

@property
def EXPORT_PCT_COMPLETE_CACHE_KEY(self):
return 'EXPORT_PCT_COMPLETE%s' % self.pk

@property
def EXPORT_QUEUED_CACHE_KEY(self):
return 'EXPORT_QUEUED%s' % self.pk

@property
def export_ready(self):
return get_cache_state(self.EXPORT_READY_CACHE_KEY,
True) and self.export_file is not None and self.export_file != ''

@property
def export_generation_pct_complete(self):
return get_cache_state(self.EXPORT_PCT_COMPLETE_CACHE_KEY, False)

@property
def export_url(self):
ns = self.import_record.app_namespace
return reverse('%s:download_export' % ns, args=(self.pk,))

@property
def generate_url(self):
ns = self.import_record.app_namespace
return reverse('%s:prepare_export' % ns, args=(self.pk,))

@property
def merge_progress_url(self):
return reverse('data_importer:merge_progress', args=(self.pk,))
103 changes: 103 additions & 0 deletions seed/management/commands/delete_unused_uploaded_files.py
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
:copyright (c) 2014 - 2022, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Department of Energy) and contributors. All rights reserved. # NOQA
:author
"""
from __future__ import unicode_literals

from django.core.management.base import BaseCommand

from seed.data_importer.models import ImportFile

import os


class Command(BaseCommand):
help = 'Cleans up uploaded files for ImportFiles that have been flagged as deleted'

def add_arguments(self, parser):
parser.add_argument('--org_id',
default=None,
help='ID of a specific organization to operate on',
action='store',
dest='org_id')

def handle(self, *args, **options):
# In local dev and production the files are in the media/uploads folder.
#
# Steps to process
# 1. The database has various paths of all the files as we have been
# moving the files around and the database paths have not been updated
# The first task is to update the file paths to /seed/media/uploads for the files
# 2. There is a deleted column in SEED that is updated when a user removes the
# file from the frontend; however, SEED persists the file. This task
# deletes the files from disk (if they exist), but it does not delete the
# ImportFile record itself

org_id = options['org_id']
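# Note: get_all() is the NotDeletedManager method added above; unlike the default
# queryset it also returns ImportFiles that have been flagged deleted=True.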
if org_id:
files = ImportFile.objects.get_all().filter(import_record__super_organization=org_id) # this actually returns a queryset
else:
files = ImportFile.objects.get_all()

# fix the file path if it is not /seed/media/uploads
# populate the list of files - this may need to be broken up into multiple tasks on the real data
# based on how large the directory / table is.
rename_files = []
for f in files:
filename = f.file.name
if os.path.exists(filename):
# don't do anything if the path exists
continue

# the filename/path is not correct and needs to be updated
# put it in a tuple of db object, oldname, new name
new_base_path = "/seed/media/uploads"
if filename.startswith(new_base_path):
# don't do anything, file name is in the right format for
# docker mounted media
continue
elif filename == "":
# no file attached
continue
else:
if 'pm_imports/' in filename:
# this is a special folder that needs to persist in the uploads directory
rename_files.append((f, filename, f"{new_base_path}/pm_imports/{os.path.basename(filename)}"))
else:
rename_files.append((f, filename, f"{new_base_path}/{os.path.basename(filename)}"))

self.stdout.write('******** LIST OF IMPORT FILE PATHS TO RENAME *********')
for f in rename_files:
print(f"Will rename {f[1]} to {f[2]}")
self.stdout.write('******** END OF LIST (list may be blank) *********')
f = input("Are you sure you want to rename all of the files above? Use with caution! [Y/y]? ")
if f.lower() == 'y':
for f in rename_files:
f_db = f[0]
print(f"Renaming {f[1]} to {f[2]}")
f_db.file.name = f[2]
f_db.save()
self.stdout.write('Done renaming', ending='\n')
else:
self.stdout.write('Not renaming, will not continue, exiting')
exit()

# now go through and find the deleted=True and remove the records
if org_id:
files = ImportFile.objects.get_all().filter(deleted=True, import_record__super_organization=org_id).exclude(file__exact='')
else:
files = ImportFile.objects.get_all().filter(deleted=True).exclude(file__exact='')

f = input(f"Are you sure you want to delete {len(files)} InputFiles that have been marked with 'deleted'? Use with caution! [Y/y]? ")
if f.lower() == 'y':
for fil in files:
filename = fil.file.name
self.stdout.write(f"Deleting file {filename}")
# regardless of whether the file exists on disk or not,
fil.file.delete(save=True)

self.stdout.write('Done deleting flagged record files; the records themselves still exist', ending='\n')
else:
self.stdout.write('Not deleting, exiting')
exit()
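
A minimal sketch of invoking the new command from Python (the command name comes from the file path above; the organization id is illustrative only):

from django.core.management import call_command

# Clean up files for a single organization; the command still prompts for confirmation
call_command('delete_unused_uploaded_files', org_id=1)

# Equivalent CLI: ./manage.py delete_unused_uploaded_files --org_id 1

# Or run across all organizations
call_command('delete_unused_uploaded_files')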
