Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 40 additions & 32 deletions nightly_db_backup/db_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
:file: db_backup.py
:language: python3
:author: Peter Bailie (Systems Programmer, Dept. of Computer Science, RPI)
:date: August 22 2018
:date: August 28 2018

This script will take backup dumps of each individual Submitty course
database. This should be set up by a sysadmin to be run on the Submitty
Expand All @@ -30,6 +30,7 @@
"""

import argparse
import calendar
import datetime
import json
import os
Expand All @@ -43,14 +44,16 @@
# WHERE DUMP FILES ARE WRITTEN
DUMP_PATH = '/var/local/submitty/submitty-dumps'

def delete_obsolete_dumps(working_path, expiration_stamp):
def delete_obsolete_dumps(working_path, monthly_retention, expiration_date):
"""
Recurse through folders/files and delete any obsolete dump files

:param working_path: path to recurse through
:param expiration_stamp: date to begin purging old dump files
:type working_path: string
:type expiration_stamp: string
:param working_path: path to recurse through
:param monthly_retention: day of month that dump is always preserved (val < 1 when disabled)
:param expiration_date: date to begin purging old dump files
:type working_path: string
:type monthly_retention: integer
:type expiration_date: datetime.date object
"""

# Filter out '.', '..', and any "hidden" files/directories.
Expand All @@ -62,24 +65,26 @@ def delete_obsolete_dumps(working_path, expiration_stamp):
for file in files_list:
if os.path.isdir(file):
# If the file is a folder, recurse
delete_obsolete_dumps(file, expiration_stamp)
delete_obsolete_dumps(file, monthly_retention, expiration_date)
else:
# File date was concat'ed into the file's name. Use regex to isolate date from full path.
# e.g. "/var/local/submitty-dumps/s18/cs1000/180424_s18_cs1000.dbdump"
# The date substring can be located with high confidence by looking for:
# - final token of the full path (the actual file name)
# - file name consists of three tokens delimited by '_' chars
# - first token is exactly 6 digits, the date stamp.
# - second token is the semester code, at least one 'word' char
# - third token is the course code, at least one 'word' char
# - filename always ends in ".dbdump"
# - then take substring [0:6] to get "180424".
match = re.search('(\d{6}_\w+_\w+\.dbdump)$', file)
if match is not None:
file_date_stamp = match.group(0)[0:6]
if file_date_stamp <= expiration_stamp:
os.remove(file)

# Determine file's date from its filename
# Note: datetime.date.fromisoformat() doesn't exist in Python 3.6 or earlier.
filename = file.split('/')[-1]
datestamp = filename.split('_')[0]
year, month, day = map(int, datestamp.split('-'))
file_date = datetime.date(year, month, day)

# Conditions to NOT delete old file:
if file_date > expiration_date:
pass
elif file_date.day == monthly_retention:
pass
# A month can be as few as 28 days, but we NEVER skip months even when "-m" is 28, 29, 30, or 31.
elif monthly_retention > 28 and (file_date.day == calendar.monthrange(file_date.year, file_date.month)[1] and file_date.day <= monthly_retention):
pass
else:
# os.remove(file)
print("remove " + file)
def main():
""" Main """

Expand All @@ -89,18 +94,19 @@ def main():

# READ COMMAND LINE ARGUMENTS
# Note that -t and -g are different args and mutually exclusive
parser = argparse.ArgumentParser(description='Dump all Submitty databases for a particular academic term.')
parser.add_argument('-e', action='store', nargs='?', type=int, default=0, help='Set number of days expiration of older dumps (default: no expiration).', metavar='days')
parser = argparse.ArgumentParser(description='Dump all Submitty databases for a particular academic term.', prefix_chars='-', add_help=True)
parser.add_argument('-e', action='store', type=int, default=0, help='Set number of days expiration of older dumps (default: no expiration).', metavar='days')
parser.add_argument('-m', action='store', type=int, default=0, choices=range(0,32), help='Day of month to ALWAYS retain a dumpfile (default: no monthly retention).', metavar='day of month')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-t', action='store', nargs='?', type=str, help='Set the term code.', metavar='term code')
group.add_argument('-t', action='store', type=str, help='Set the term code.', metavar='term code')
group.add_argument('-g', action='store_true', help='Guess term code based on calender month and year.')

args = parser.parse_args()

# Get current date -- needed throughout the script, but also used when guessing default term code.
# (today.year % 100) determines the two digit year. e.g. '2017' -> '17'
today = datetime.date.today()
year = str(today.year % 100)
today_stamp = '{:0>2}{:0>2}{:0>2}'.format(year, today.month, today.day)
year = today.strftime("%y")
today_stamp = today.isoformat()

# PARSE COMMAND LINE ARGUMENTS
expiration = args.e
Expand All @@ -112,6 +118,9 @@ def main():
else:
semester = args.t

# MONTHLY RETENTION DATE
monthly_retention = args.m

# GET DATABASE CONFIG FROM SUBMITTY
fh = open(DB_CONFIG_PATH, "r")
db_config = json.load(fh)
Expand Down Expand Up @@ -170,12 +179,11 @@ def main():
# DETERMINE EXPIRATION DATE (to delete obsolete dump files)
# (do this BEFORE recursion so it is not calculated recursively n times)
if expiration > 0:
expiration_date = datetime.date.fromordinal(today.toordinal() - expiration)
expiration_stamp = '{:0>2}{:0>2}{:0>2}'.format(expiration_date.year % 100, expiration_date.month, expiration_date.day)
expiration_date = datetime.date.fromordinal(today.toordinal() - expiration)
working_path = "{}/{}".format(DUMP_PATH, semester)

# RECURSIVELY CULL OBSOLETE DUMPS
delete_obsolete_dumps(working_path, expiration_stamp)
delete_obsolete_dumps(working_path, monthly_retention, expiration_date)

if __name__ == "__main__":
main()
94 changes: 51 additions & 43 deletions nightly_db_backup/readme.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
# Nightly Database Backup Python Script
Readme June 26, 2018
Readme August 31, 2018

### db_backup.py
## db_backup.py

This script will read a course list, corresponding to a specific term, from
the 'master' Submitty database. With a course list, the script will use
Postgresql's "pg_dump" tool to retrieve a SQL dump of the submitty 'master'
database and each registered course's Submitty database of a specific semester.
The script also has cleanup functionality to automatically remove older dumps.

*db_backup.py is written in Python 3, and tested with Python 3.4.*
*db_backup.py is written in Python 3, and tested with Python 3.6.*

---
NOTE: Some modification of code may be necessary to work with your school's
information systems.

### FERPA Warning

WARNING: Database dumps can contain student information that is protected by
[FERPA (20 U.S.C. § 1232g)](https://www2.ed.gov/policy/gen/guid/fpco/ferpa/index.html).
Please consult with your school's IT dept. for advice on data security policies
and practices.

### Term Code

The term code can be specified as a command line argument as option `-t`.

Expand All @@ -28,70 +38,68 @@ current month and year of the server's date.

The term code will follow the pattern of TYY, where
- T is the term
- **s** is for Spring (Jan - May)
- **u** is for Summer (Jun - Jul)
- **f** is for Fall (Aug-Dec)
- YY is the two digit year
- `s` is for Spring (Jan - May)
- `u` is for Summer (Jun - Jul)
- `f` is for Fall (Aug-Dec)
- `YY` is the two digit year
- e.g. April 15, 2018 will correspond to "s18" (Spring 2018).

`-t` and `-g` are mutually exclusive.
`-t` and `-g` are mutually exclusive, but one is required.

---
### Date Stamp

Each dump has a date stamp in its name following the format of "YYMMDD",
Each dump has a date stamp in its name following the format of `YYYY-MM-DD`,
followed by the semester code, then the course code.

e.g. '180423_s18_cs100.dbdump' is a dump taken on April 23, 2018 of the Spring
2018 semester for course CS-100.
e.g. `2018-04-23_s18_cs100.dbdump` is a dump taken on April 23, 2018 of the
Spring 2018 semester for course CS-100.

### Cleanup Schedule

Older dumps can be automatically purged with the command line option "-e".
Older dumps can be automatically purged with the command line option `-e`.

For example:

`python3 ./db_backup.py -t f17 -e 7`

will purge any dumps with a stamp seven days old or older. Only dumps of the
term being processed will be purged, in this example, 'f17'.
term being processed will be purged, in this example, `f17`.

The default expiration value is 0 (no expiration -- no files are purged) should
this argument be omitted.

---

Submitty databases can be restored from a dump using the pg_restore tool.
q.v. [https://www.postgresql.org/docs/9.5/static/app-pgrestore.html](https://www.postgresql.org/docs/9.5/static/app-pgrestore.html)

This script is intended to be run as a cronjob by 'root' on the same server
machine as the Submitty system. *Running this script on a server other than
the one hosting Submitty has not been tested.*
### Monthly Retention

---
Command line option `-m` sets a monthly retention day (1 through 31). Dumps
taken on that day of the month are never purged. If the retention day is later
than the 28th, the end-of-month dump is retained instead in any month that is
too short to contain that day.

Please configure options near the top of the code.
e.g. `-m 30` will retain any dump on the 30th of the month. In the case of
February, dumps on the 28th, or 29th on a leap year, are also retained. Dumps
on the 31st of another month are not retained (as they were retained on the
30th).

DB_HOST: Hostname of the Submitty databases. You may use 'localhost' if
Postgresql is on the same machine as the Submitty system.
For clarification: `-m 31` will retain dumps taken on February 28/29;
April, June, September, November 30; and January, March, May, July, August,
October, December 31.

DB_USER: The username that interacts with Submitty databases. Typically
'hsdbu'.
No monthly retention occurs if `-m` is omitted or is set to zero (`-m 0`).

DB_PASS: The password for Submitty's database account (e.g. account 'hsdbu').
**Do NOT use the placeholder value of 'DB.p4ssw0rd'**
### Restore a Dump

DUMP_PATH: The folder path to store the database dumps. Course folders will
be created from this path, and the dumps stored in their respective course
folders, grouped by semester.
Submitty databases can be restored from a dump using the pg_restore tool.
q.v. [https://www.postgresql.org/docs/10/static/app-pgrestore.html](https://www.postgresql.org/docs/10/static/app-pgrestore.html)

---
### Cron

WARNING: Database dumps can contain student information that is protected by
[FERPA (20 U.S.C. § 1232g)](https://www2.ed.gov/policy/gen/guid/fpco/ferpa/index.html).
Please consult with your school's IT dept. for advice on data security policies
and practices.
This script is intended to be run as a cronjob by 'root' on the same server
machine as the Submitty system. *Running this script on a server other than
the one hosting Submitty has not been tested.*

---
### Options At The Top Of The Code

db_backup.py is tested to run on Python 3.4 or higher.
`DB_CONFIG_PATH` is the path to Submitty's `database.json` file, which contains
database authentication information. Leaving this at the default is usually OK.

NOTE: Some modification of code may be necessary to work with your school's
information systems.
`DUMP_PATH` indicates where dump files are stored. Only change this if the
default location is undesirable for your server.