diff --git a/nightly_db_backup/db_backup.py b/nightly_db_backup/db_backup.py index 5cc8538..dffdb81 100644 --- a/nightly_db_backup/db_backup.py +++ b/nightly_db_backup/db_backup.py @@ -5,7 +5,7 @@ :file: db_backup.py :language: python3 :author: Peter Bailie (Systems Programmer, Dept. of Computer Science, RPI) -:date: August 22 2018 +:date: August 28 2018 This script will take backup dumps of each individual Submitty course database. This should be set up by a sysadmin to be run on the Submitty @@ -30,6 +30,7 @@ """ import argparse +import calendar import datetime import json import os @@ -43,14 +44,16 @@ # WHERE DUMP FILES ARE WRITTEN DUMP_PATH = '/var/local/submitty/submitty-dumps' -def delete_obsolete_dumps(working_path, expiration_stamp): +def delete_obsolete_dumps(working_path, monthly_retention, expiration_date): """ Recurse through folders/files and delete any obsolete dump files - :param working_path: path to recurse through - :param expiration_stamp: date to begin purging old dump files - :type working_path: string - :type expiration_stamp: string + :param working_path: path to recurse through + :param monthly_retention: day of month that dump is always preserved (val < 1 when disabled) + :param expiration_date: date to begin purging old dump files + :type working_path: string + :type monthly_retention: integer + :type expiration_date: datetime.date object """ # Filter out '.', '..', and any "hidden" files/directories. @@ -62,24 +65,26 @@ def delete_obsolete_dumps(working_path, expiration_stamp): for file in files_list: if os.path.isdir(file): # If the file is a folder, recurse - delete_obsolete_dumps(file, expiration_stamp) + delete_obsolete_dumps(file, monthly_retention, expiration_date) else: - # File date was concat'ed into the file's name. Use regex to isolate date from full path. - # e.g. 
"/var/local/submitty-dumps/s18/cs1000/180424_s18_cs1000.dbdump" - # The date substring can be located with high confidence by looking for: - # - final token of the full path (the actual file name) - # - file name consists of three tokens delimited by '_' chars - # - first token is exactly 6 digits, the date stamp. - # - second token is the semester code, at least one 'word' char - # - third token is the course code, at least one 'word' char - # - filename always ends in ".dbdump" - # - then take substring [0:6] to get "180424". - match = re.search('(\d{6}_\w+_\w+\.dbdump)$', file) - if match is not None: - file_date_stamp = match.group(0)[0:6] - if file_date_stamp <= expiration_stamp: - os.remove(file) - + # Determine file's date from its filename + # Note: datetime.date.fromisoformat() doesn't exist in Python 3.6 or earlier. + filename = file.split('/')[-1] + datestamp = filename.split('_')[0] + year, month, day = map(int, datestamp.split('-')) + file_date = datetime.date(year, month, day) + + # Conditions to NOT delete old file: + if file_date > expiration_date: + pass + elif file_date.day == monthly_retention: + pass + # A month can be as few as 28 days, but we NEVER skip months even when "-m" is 28, 29, 30, or 31. 
+ elif monthly_retention > 28 and (file_date.day == calendar.monthrange(file_date.year, file_date.month)[1] and file_date.day <= monthly_retention): + pass + else: +# os.remove(file) + print("remove " + file) def main(): """ Main """ @@ -89,18 +94,19 @@ def main(): # READ COMMAND LINE ARGUMENTS # Note that -t and -g are different args and mutually exclusive - parser = argparse.ArgumentParser(description='Dump all Submitty databases for a particular academic term.') - parser.add_argument('-e', action='store', nargs='?', type=int, default=0, help='Set number of days expiration of older dumps (default: no expiration).', metavar='days') + parser = argparse.ArgumentParser(description='Dump all Submitty databases for a particular academic term.', prefix_chars='-', add_help=True) + parser.add_argument('-e', action='store', type=int, default=0, help='Set number of days expiration of older dumps (default: no expiration).', metavar='days') + parser.add_argument('-m', action='store', type=int, default=0, choices=range(0,32), help='Day of month to ALWAYS retain a dumpfile (default: no monthly retention).', metavar='day of month') group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-t', action='store', nargs='?', type=str, help='Set the term code.', metavar='term code') + group.add_argument('-t', action='store', type=str, help='Set the term code.', metavar='term code') group.add_argument('-g', action='store_true', help='Guess term code based on calender month and year.') + args = parser.parse_args() # Get current date -- needed throughout the script, but also used when guessing default term code. - # (today.year % 100) determines the two digit year. e.g. 
'2017' -> '17' today = datetime.date.today() - year = str(today.year % 100) - today_stamp = '{:0>2}{:0>2}{:0>2}'.format(year, today.month, today.day) + year = today.strftime("%y") + today_stamp = today.isoformat() # PARSE COMMAND LINE ARGUMENTS expiration = args.e @@ -112,6 +118,9 @@ def main(): else: semester = args.t + # MONTHLY RETENTION DATE + monthly_retention = args.m + # GET DATABASE CONFIG FROM SUBMITTY fh = open(DB_CONFIG_PATH, "r") db_config = json.load(fh) @@ -170,12 +179,11 @@ def main(): # DETERMINE EXPIRATION DATE (to delete obsolete dump files) # (do this BEFORE recursion so it is not calculated recursively n times) if expiration > 0: - expiration_date = datetime.date.fromordinal(today.toordinal() - expiration) - expiration_stamp = '{:0>2}{:0>2}{:0>2}'.format(expiration_date.year % 100, expiration_date.month, expiration_date.day) + expiration_date = datetime.date.fromordinal(today.toordinal() - expiration) working_path = "{}/{}".format(DUMP_PATH, semester) # RECURSIVELY CULL OBSOLETE DUMPS - delete_obsolete_dumps(working_path, expiration_stamp) + delete_obsolete_dumps(working_path, monthly_retention, expiration_date) if __name__ == "__main__": main() diff --git a/nightly_db_backup/readme.md b/nightly_db_backup/readme.md index e6c6a0c..fa21849 100644 --- a/nightly_db_backup/readme.md +++ b/nightly_db_backup/readme.md @@ -1,7 +1,7 @@ # Nightly Database Backup Python Script -Readme June 26, 2018 +Readme August 31, 2018 -### db_backup.py +## db_backup.py This script will read a course list, corresponding to a specific term, from the 'master' Submitty database. With a course list, the script will use @@ -9,9 +9,19 @@ Postgresql's "pg_dump" tool to retrieve a SQL dump of the submitty 'master' database and each registered course's Submitty database of a specific semester. The script also has cleanup functionality to automatically remove older dumps. 
-*db_backup.py is written in Python 3, and tested with Python 3.4.* +*db_backup.py is written in Python 3, and tested with Python 3.6.* ---- +NOTE: Some modification of code may be necessary to work with your school's +information systems. + +### FERPA Warning + +WARNING: Database dumps can contain student information that is protected by +[FERPA (20 U.S.C. § 1232g)](https://www2.ed.gov/policy/gen/guid/fpco/ferpa/index.html). +Please consult with your school's IT dept. for advice on data security policies +and practices. + +### Term Code The term code can be specified as a command line argument as option `-t`. @@ -28,70 +38,68 @@ current month and year of the server's date. The term code will follow the pattern of TYY, where - T is the term - - **s** is for Spring (Jan - May) - - **u** is for Summer (Jun - Jul) - - **f** is for Fall (Aug-Dec) -- YY is the two digit year + - `s` is for Spring (Jan - May) + - `u` is for Summer (Jun - Jul) + - `f` is for Fall (Aug-Dec) +- `YY` is the two digit year - e.g. April 15, 2018 will correspond to "s18" (Spring 2018). -`-t` and `-g` are mutually exclusive. +`-t` and `-g` are mutually exclusive, but one is required. ---- +### Date Stamp -Each dump has a date stamp in its name following the format of "YYMMD", +Each dump has a date stamp in its name following the format of `YYYY-MM-DD`, followed by the semester code, then the course code. -e.g. '180423_s18_cs100.dbdump' is a dump taken on April 23, 2018 of the Spring -2018 semester for course CS-100. +e.g. `2018-04-23_s18_cs100.dbdump` is a dump taken on April 23, 2018 of the +Spring 2018 semester for course CS-100. + +### Cleanup Schedule -Older dumps can be automatically purged with the command line option "-e". +Older dumps can be automatically purged with the command line option `-e`. For example: `python3 ./db_backup.py -t f17 -e 7` will purge any dumps with a stamp seven days or older. Only dumps of the -term being processed will be purged, in this example, 'f17'. 
+term being processed will be purged, in this example, `f17`. The default expiration value is 0 (no expiration -- no files are purged) should this argument be ommitted. ---- - -Submitty databases can be restored from a dump using the pg_restore tool. -q.v. [https://www.postgresql.org/docs/9.5/static/app-pgrestore.html](https://www.postgresql.org/docs/9.5/static/app-pgrestore.html) - -This is script intended to be run as a cronjob by 'root' on the same server -machine as the Submitty system. *Running this script on another server other -than Submitty has not been tested.* +### Monthly Retention ---- +Command line option `-m` will set a monthly retention date. Dumps taken on that +date will not be purged. In the case the retention date is past the 28th, end +of month dumps will still be retained. -Please configure options near the top of the code. +e.g. `-m 30` will retain any dump on the 30th of the month. In the case of +February, dumps on the 28th, or 29th on a leap year, are also retained. Dumps +on the 31st of another month are not retained (as they were retained on the +30th). -DB_HOST: Hostname of the Submitty databases. You may use 'localhost' if -Postgresql is on the same machine as the Submitty system. +For clarification: `-m 31` will retain dumps taken on February 28/29; +April, June, September, November 30; and January, March, May, July, August, +October, December 31. -DB_USER: The username that interacts with Submitty databases. Typically -'hsdbu'. +No monthly retention occurs if `-m` is omitted or set to 0 (`-m 0`). -DB_PASS: The password for Submitty's database account (e.g. account 'hsdbu'). -**Do NOT use the placeholder value of 'DB.p4ssw0rd'** +### Restore a Dump -DUMP_PATH: The folder path to store the database dumps. Course folders will -be created from this path, and the dumps stored in their respective course -folders, grouped by semester. +Submitty databases can be restored from a dump using the pg_restore tool. +q.v. 
[https://www.postgresql.org/docs/10/static/app-pgrestore.html](https://www.postgresql.org/docs/10/static/app-pgrestore.html) ---- +### Cron -WARNING: Database dumps can contain student information that is protected by -[FERPA (20 U.S.C. § 1232g)](https://www2.ed.gov/policy/gen/guid/fpco/ferpa/index.html). -Please consult with your school's IT dept. for advice on data security policies -and practices. +This script is intended to be run as a cronjob by 'root' on the same server +machine as the Submitty system. *Running this script on another server other +than Submitty has not been tested.* ---- +### Options At The Top Of The Code -db_backup.py is tested to run on Python 3.4 or higher. +`DB_CONFIG_PATH` looks for Submitty's `database.json` file that contains +database authentication information. Leaving this at the default is usually OK. -NOTE: Some modification of code may be necessary to work with your school's -information systems. +`DUMP_PATH` indicates where dump files are stored. Only change this if the +default location is undesirable for your server.