From f300f6a982184a0616e16de882ca408d4decf8d8 Mon Sep 17 00:00:00 2001 From: pbailie Date: Fri, 17 Aug 2018 19:18:22 -0400 Subject: [PATCH 1/2] Auto Feed Update student_auto_feed/submitty_student_auto_feed.php Refactor/Simplify/Improve deduplication code. Perhaps we don't need a stable sort in this particular project. --- .../submitty_student_auto_feed.php | 133 +++++------------- 1 file changed, 38 insertions(+), 95 deletions(-) diff --git a/student_auto_feed/submitty_student_auto_feed.php b/student_auto_feed/submitty_student_auto_feed.php index 7d2e449..4e41024 100644 --- a/student_auto_feed/submitty_student_auto_feed.php +++ b/student_auto_feed/submitty_student_auto_feed.php @@ -245,7 +245,12 @@ private function validate_csv($csv_data) { * be deduplicated. * ------------------------------------------------------------------ */ - deduplicate::deduplicate_data(self::$data['users'], 'user_id'); + if ($this->deduplicate('users', 'user_id') === false) { + + //Deduplication didn't work. We can't proceed. + $this->log_it("Users data deduplication encountered a problem. Aborting."); + $validation_flag = false; + } //TRUE: Data validation passed and validated data set will have at least 1 row per table. //FALSE: Either data validation failed or at least one table is an empty set. @@ -394,6 +399,38 @@ private function load_csv(&$csv_data) { return true; } + + /** + * deduplicate data set by a specific column + * + * Users table in "Submitty" database must have a unique student per row. + * Students in multiple courses may have multiple entries where + * deduplication is necessary. + * + * @access private + * @param string $subset name of the data subset (key of self::$data) to be deduplicated + * @param mixed $key column by which rows are deduplicated + * @return boolean TRUE when deduplication is completed. FALSE when sorting fails. + */ + private function deduplicate($subset = 'users', $key = 'user_id') { + + // First, sort data subset. 
On success, remove duplicate rows identified by $key. + if (usort(self::$data[$subset], function($a, $b) use ($key) { return strcmp($a[$key], $b[$key]); }; )) { + $count = count(self::$data[$subset]); + for ($i = 1; $i < $count; $i++) { + if (self::$data[$subset][$i][$key] === self::$data[$subset][$i-1][$key]) { + unset(self::$data[$subset][$i-1]); + } + } + + //Indicate that deduplication is done. + return true; + } + + //Something went wrong during sort. Abort and indicate failure. + return false; + } + /** * "Update/Insert" data into the database. Code works via "batch" upserts. * @@ -684,100 +721,6 @@ private function log_it($msg) { } } -/** static class for deduplicating data */ -class deduplicate { - - /** - * deduplicate data by a specific column - * - * Users table in "Submitty" database must have a unique student per row. - * per row. Students in multiple courses may have multiple entries where - * where deduplication is necessary. - * - * @access public - * @param array $arr array to be deduplicated, passed by reference - * @param mixed $key column by which rows are deduplicated - */ - public static function deduplicate_data(&$arr, $key='user_id') { - - self::merge_sort($arr, $key); - self::dedup($arr, $key); - } - - /** - * merge sort - * - * PHP's built in sort is quicksort. It is not stable and cannot sort rows - * by column, and therefore is not sufficient. Data will be sorted to be - * deduplicated. - * - * @access private - * @param array $arr array of data rows to be sorted - * @param mixed $key column by which rows are sorted - */ - private static function merge_sort(&$arr, $key) { - - //Arrays of size < 2 require no action. 
- if (count($arr) < 2) { - return; - } - - //Split the array in half - $halfway = count($arr) / 2; - $arr1 = array_slice($arr, 0, $halfway); - $arr2 = array_slice($arr, $halfway); - - //Recurse to sort the two halves - self::merge_sort($arr1, $key); - self::merge_sort($arr2, $key); - - //If all of $array1 is <= all of $array2, just append them. - if (strcasecmp(end($arr1)[$key], $arr2[0][$key]) < 1) { - $arr = array_merge($arr1, $arr2); - return; - } - - //Merge the two sorted arrays into a single sorted array - $arr = array(); - $i = 0; - $j = 0; - while ($i < count($arr1) && $j < count($arr2)) { - if (strcasecmp($arr1[$i][$key], $arr2[$j][$key]) < 1) { - $arr[] = $arr1[$i]; - $i++; - } else { - $arr[] = $arr2[$j]; - $j++; - } - } - - //Merge the remainder - for (/* no var init */; $i < count($arr1); $i++) { - $arr[] = $arr1[$i]; - } - - for (/* no var init */; $j < count($arr2); $j++) { - $arr[] = $arr2[$j]; - } - } - - /** - * remove duplicated student rows - * - * @access private - * @param array $arr array of data rows to be deduplicated - * @param mixed $key column by which rows are deduplicated - */ - private static function dedup(&$arr, $key) { - - $count = count($arr); - for ($i = 1; $i < $count; $i++) { - if ($arr[$i][$key] === $arr[$i-1][$key]) { - unset($arr[$i-1]); - } - } - } -} /** @static class to parse command line arguments */ class cli_args { From 09907350e2714eee9c75c373846acfa70e307fe2 Mon Sep 17 00:00:00 2001 From: pbailie Date: Tue, 21 Aug 2018 20:03:01 -0400 Subject: [PATCH 2/2] auto_feed_update Deduplication now relies on usort() instead of merge sort. Should be faster. 
--- student_auto_feed/submitty_student_auto_feed.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/student_auto_feed/submitty_student_auto_feed.php b/student_auto_feed/submitty_student_auto_feed.php index 4e41024..a47fd4f 100644 --- a/student_auto_feed/submitty_student_auto_feed.php +++ b/student_auto_feed/submitty_student_auto_feed.php @@ -41,7 +41,7 @@ class submitty_student_auto_feed { public function __construct() { //Important: Make sure we are running from CLI - if (PHP_SAPI != "cli") { + if (PHP_SAPI !== "cli") { die("This is a command line tool."); } @@ -247,7 +247,7 @@ private function validate_csv($csv_data) { if ($this->deduplicate('users', 'user_id') === false) { - //Deduplication didn't work. We can't proceed. + //Deduplication didn't work. We can't proceed (set validation flag to false). $this->log_it("Users data deduplication encountered a problem. Aborting."); $validation_flag = false; } @@ -415,7 +415,7 @@ private function load_csv(&$csv_data) { private function deduplicate($subset = 'users', $key = 'user_id') { // First, sort data subset. On success, remove duplicate rows identified by $key. - if (usort(self::$data[$subset], function($a, $b) use ($key) { return strcmp($a[$key], $b[$key]); }; )) { + if (usort(self::$data[$subset], function($a, $b) use ($key) { return strcmp($a[$key], $b[$key]); })) { $count = count(self::$data[$subset]); for ($i = 1; $i < $count; $i++) { if (self::$data[$subset][$i][$key] === self::$data[$subset][$i-1][$key]) {