From 56c9c205f0e391eb29a8decfac433fedc55aa5f6 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Fri, 25 Jun 2021 08:19:25 -0400
Subject: [PATCH 01/52] Initial rewrite of process_all.sh

---
 bin/process_all.sh | 62 +++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 42 deletions(-)

diff --git a/bin/process_all.sh b/bin/process_all.sh
index 832ada9..c01a3fc 100644
--- a/bin/process_all.sh
+++ b/bin/process_all.sh
@@ -1,47 +1,25 @@
-#!/bin/bash
+# This script is the startup script for Lichen. It accepts a single path to a
+# directory containing a config file and creates the necessary output directories
+# as appropriate, relative to the provided path. It is possible to run this script
+# from the command line but it is meant to be run via the Plagiarism Detection UI.
 
-semester=$1
-course=$2
-gradeable=$3
+# TODO: Assert permissions, as necessary
 
-prev_argument=""
-prior_term_gradeables=()
-ignore_submissions=()
-for argument in "$@"
-do
-    if [[ $argument == --* ]]
-    then
-        prev_argument=$argument
-    else
-        case $prev_argument in
-            "--language")
-                language=$argument
-                ;;
-            "--window")
-                window=$argument
-                ;;
-            "--threshold")
-                threshold=$argument
-                ;;
-            "--regrex")
-                regrex=$argument
-                ;;
-            "--provided_code_path")
-                provided_code_path=$argument
-                ;;
-            "--prior_term_gradeables")
-                prior_term_gradeables+=("$argument")
-                ;;
-            "--ignore_submissions")
-                ignore_submissions+=("$argument")
-                ;;
-        esac
-    fi
-done
+basepath=$1 # holds the path to a directory containing a config for this gradeable
 
-/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
-/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --${language}
-/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --${language}
+# kill the script if there is no config file
+if [! -f "${basepath}/config.json" ]; then
+  echo "Unable to find config.json in provided directory"
+  exit 1
+fi
 
-/usr/local/submitty/Lichen/bin/compare_hashes.out $semester $course $gradeable --window $window
+# provided_code should already exist if the user wishes to run with provided code
+mkdir -p "${basepath}/logs"
+mkdir -p "${basepath}/other_gradeables"
+mkdir -p "${basepath}/users"
+
+/usr/local/submitty/Lichen/bin/concatenate_all.py $basepath
+#/usr/local/submitty/Lichen/bin/tokenize_all.py $basepath
+#/usr/local/submitty/Lichen/bin/hash_all.py $basepath
+#/usr/local/submitty/Lichen/bin/compare_hashes.out $basepath
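Patch 01 establishes the on-disk contract the rest of the series builds on: one gradeable directory holding the config and all intermediate artifacts. A sketch of that layout, using only the names the script creates or checks (the tree itself is illustrative):

    <basepath>/
        config.json          # input; the script exits if it is missing
        logs/
        other_gradeables/
        users/
        provided_code/       # expected to already exist when running with provided code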
-f "${basepath}/config.json" ]; then echo "Unable to find config.json in provided directory" exit 1 fi -# provided_code should already exist if the user wishes to run with provided code +# create these directories if they don't already exist mkdir -p "${basepath}/logs" +mkdir -p "${basepath}/provided_code" +mkdir -p "${basepath}/provided_code/files" mkdir -p "${basepath}/other_gradeables" mkdir -p "${basepath}/users" - -/usr/local/submitty/Lichen/bin/concatenate_all.py $basepath -#/usr/local/submitty/Lichen/bin/tokenize_all.py $basepath -#/usr/local/submitty/Lichen/bin/hash_all.py $basepath -#/usr/local/submitty/Lichen/bin/compare_hashes.out $basepath +# run all of the modules and exit if an error occurs +/usr/local/submitty/Lichen/bin/concatenate_all.py $basepath $datapath || exit 1 +#/usr/local/submitty/Lichen/bin/tokenize_all.py $basepath || exit 1 +#/usr/local/submitty/Lichen/bin/hash_all.py $basepath || exit 1 +#/usr/local/submitty/Lichen/bin/compare_hashes.out $basepath || exit 1 From 8737f557ba968d32b384f5080efa9667760a4320 Mon Sep 17 00:00:00 2001 From: sbelsk Date: Fri, 25 Jun 2021 10:54:45 -0400 Subject: [PATCH 03/52] Make modifications to file paths and add timers --- compare_hashes/compare_hashes.cpp | 98 ++++++++++++++++++------------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index e15ef0c..b283e88 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "boost/filesystem/operations.hpp" #include "boost/filesystem/path.hpp" @@ -157,17 +158,20 @@ bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) { // =================================================================================== // =================================================================================== int main(int argc, char* argv[]) { - std::cout << "COMPARE HASHES..."; fflush(stdout); + time_t overall_start, overall_end; + time(&overall_start); // --------------------------------------------------------------------------- // deal with command line arguments assert (argc == 2); - std::string config_file = argv[1]; + std::string lichen_gradeable_path_str = argv[1]; + boost::filesystem::path lichen_gradeable_path = boost::filesystem::system_complete(lichen_gradeable_path_str); + boost::filesystem::path config_file_json_path = lichen_gradeable_path / "config.json"; - std::ifstream istr(config_file.c_str()); + std::ifstream istr(config_file_json_path.string()); assert (istr.good()); nlohmann::json config_file_json = nlohmann::json::parse(istr); @@ -181,17 +185,15 @@ int main(int argc, char* argv[]) { assert (threshold >= 2); // error checking, confirm there are hashes to work with - std::string tmp = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable; - boost::filesystem::path hashes_root_directory = boost::filesystem::system_complete(tmp); - if (!boost::filesystem::exists(hashes_root_directory) || - !boost::filesystem::is_directory(hashes_root_directory)) { - std::cerr << "ERROR with directory " << hashes_root_directory << std::endl; + boost::filesystem::path users_root_directory = lichen_gradeable_path / "users"; + if (!boost::filesystem::exists(users_root_directory) || + !boost::filesystem::is_directory(users_root_directory)) { + std::cerr << "ERROR with directory " << users_root_directory << std::endl; exit(0); } // the file path where we expect to find the hashed instructor 
provided code file - std::string tmp2 = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/hashes/"+gradeable+"/provided_code/provided_code/hashes.txt"; - boost::filesystem::path provided_code_file = boost::filesystem::system_complete(tmp2); + boost::filesystem::path provided_code_file = lichen_gradeable_path / "provided_code" / "hashes.txt"; // if file exists in that location, the provided code mode is enabled. bool provided_code_enabled = boost::filesystem::exists(provided_code_file); @@ -205,25 +207,26 @@ int main(int argc, char* argv[]) { // Stores all hashes from the instructor provided code std::unordered_set provided_code; + time_t start, end; + time(&start); + + if (provided_code_enabled) { + // load the instructor provided code's hashes + std::ifstream istr(provided_code_file.string()); + assert(istr.good()); + hash instructor_hash; + while (istr >> instructor_hash) { + provided_code.insert(instructor_hash); + } + } + // loop over all users boost::filesystem::directory_iterator end_iter; - for (boost::filesystem::directory_iterator dir_itr( hashes_root_directory ); dir_itr != end_iter; ++dir_itr) { + for (boost::filesystem::directory_iterator dir_itr( users_root_directory ); dir_itr != end_iter; ++dir_itr) { boost::filesystem::path username_path = dir_itr->path(); assert (is_directory(username_path)); std::string username = dir_itr->path().filename().string(); - if (username == "provided_code") { - assert(provided_code_enabled); - - // load the instructor provided code's hashes - std::ifstream istr(provided_code_file.string()); - hash instructor_hash; - while (istr >> instructor_hash) { - provided_code.insert(instructor_hash); - } - continue; - } - // loop over all versions for (boost::filesystem::directory_iterator username_itr( username_path ); username_itr != end_iter; ++username_itr) { boost::filesystem::path version_path = username_itr->path(); @@ -239,6 +242,7 @@ int main(int argc, char* argv[]) { boost::filesystem::path hash_file = version_path; hash_file /= "hashes.txt"; std::ifstream istr(hash_file.string()); + assert(istr.good()); hash input_hash; int location = 0; while (istr >> input_hash) { @@ -251,8 +255,9 @@ int main(int argc, char* argv[]) { } } - - std::cout << "finished loading" << std::endl; + time(&end); + double diff = difftime(end, start); + std::cout << "finished loading in " << diff << "s" << std::endl; // --------------------------------------------------------------------------- // THIS IS THE MAIN PLAGIARISM DETECTION ALGORITHM @@ -260,6 +265,7 @@ int main(int argc, char* argv[]) { // Used to calculate current progress (printed to the log) int my_counter = 0; int my_percent = 0; + time(&start); // walk over every Submission for (std::vector::iterator submission_itr = all_submissions.begin(); @@ -317,7 +323,9 @@ int main(int argc, char* argv[]) { } } - std::cout << "finished walking" << std::endl; + time(&end); + diff = difftime(end, start); + std::cout << "finished walking in " << diff << "s" << std::endl; // --------------------------------------------------------------------------- // Writing the output files and merging the results @@ -325,6 +333,7 @@ int main(int argc, char* argv[]) { my_counter = 0; my_percent = 0; std::cout << "writing matches files and merging regions..." 
<< std::endl; + time(&start); // Loop over all of the submissions, writing a JSON file for each one if it has suspicious matches for (std::vector::iterator submission_itr = all_submissions.begin(); @@ -487,11 +496,10 @@ int main(int argc, char* argv[]) { // save the file with matches per user nlohmann::json match_data = result; - std::string matches_dir = "/var/local/submitty/courses/"+semester+"/"+course - +"/lichen/matches/"+gradeable+"/"+submission_itr->student()+"/"+std::to_string(submission_itr->version()); - boost::filesystem::create_directories(matches_dir); - std::string matches_file = matches_dir+"/matches.json"; - std::ofstream ostr(matches_file); + boost::filesystem::path submission_dir = users_root_directory / submission_itr->student() / std::to_string(submission_itr->version()); + boost::filesystem::create_directories(submission_dir); + boost::filesystem::path matches_file = submission_dir / "matches.json"; + std::ofstream ostr(matches_file.string()); assert(ostr.good()); ostr << match_data.dump(4) << std::endl; @@ -503,16 +511,19 @@ int main(int argc, char* argv[]) { } } - std::cout << "done merging and writing matches files" << std::endl; + + time(&end); + diff = difftime(end, start); + std::cout << "done merging and writing matches files in " << diff << "s" << std::endl; // --------------------------------------------------------------------------- // Create a general summary of rankings of users by percentage match + std::cout << "writing rakings files..." << std::endl; + time(&start); // create a single file of students ranked by highest percentage of code plagiarised - std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/"+gradeable+"/"; - std::string ranking_file = ranking_dir+"overall_ranking.txt"; - boost::filesystem::create_directories(ranking_dir); - std::ofstream ranking_ostr(ranking_file); + boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt"; + std::ofstream ranking_ostr(ranking_file.string()); // a map of students to a pair of the version and highest percent match for each student std::unordered_map > highest_matches; @@ -596,11 +607,10 @@ int main(int argc, char* argv[]) { std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter); // create the directory and a file to write into - std::string ranking_student_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/" - +gradeable+"/"+submission_itr->student()+"/"+std::to_string(submission_itr->version())+"/"; - std::string ranking_student_file = ranking_student_dir+submission_itr->student()+"_"+std::to_string(submission_itr->version())+".txt"; + boost::filesystem::path ranking_student_dir = users_root_directory / submission_itr->student() / std::to_string(submission_itr->version()); + boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt"; boost::filesystem::create_directories(ranking_student_dir); - std::ofstream ranking_student_ostr(ranking_student_file); + std::ofstream ranking_student_ostr(ranking_student_file.string()); // finally, write the file of ranking for this submission for (unsigned int i = 0; i < student_ranking.size(); i++) { @@ -610,10 +620,14 @@ int main(int argc, char* argv[]) { << std::setw(3) << std::right << student_ranking[i].version << std::endl; } } - + time(&end); + diff = difftime(end, start); + std::cout << "finished writing rankings in " << diff << "s" << std::endl; // --------------------------------------------------------------------------- - 
std::cout << "done" << std::endl; + time(&overall_end); + double overall_diff = difftime(overall_end, overall_start); + std::cout << "DONE in " << overall_diff << "s" << std::endl; } From acf48422f0d02276f454eef33541a7533b2761ab Mon Sep 17 00:00:00 2001 From: williamjallen Date: Fri, 25 Jun 2021 10:57:26 -0400 Subject: [PATCH 04/52] Overhaul concatenate_all.py --- bin/concatenate_all.py | 183 ++++++++++++++++++----------------------- 1 file changed, 81 insertions(+), 102 deletions(-) diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py index 3a1e556..eab7aa5 100644 --- a/bin/concatenate_all.py +++ b/bin/concatenate_all.py @@ -8,52 +8,73 @@ import os import json import sys -import shutil +import time import fnmatch -CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config') -with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file: - OPEN_JSON = json.load(open_file) -SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir'] -SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir'] - IGNORED_FILES = [ ".submit.timestamp" ] +# returns a string containing the contents of the files which match the regex in the specified dir +def getConcatFilesInDir(input_dir, regex_patterns): + result = "" + for my_dir, _dirs, my_files in os.walk(input_dir): + # Determine if regex should be used (blank regex is equivalent to selecting all files) + files = sorted(my_files) + if regex_expressions[0] != "": + files_filtered = [] + for e in regex_patterns: + files_filtered.extend(fnmatch.filter(files, e.strip())) + files = files_filtered + + for my_file in files: + # exclude any files we have ignored for all submissions + if my_file in IGNORED_FILES: + continue + absolute_path = os.path.join(my_dir, my_file) + # print a separator & filename + with open(absolute_path, encoding='ISO-8859-1') as tmp: + result += f"=============== {my_file} ===============\n" + # append the contents of the file + result += tmp.read() + "\n" + return result + + def parse_args(): parser = argparse.ArgumentParser(description="") - parser.add_argument("config_path") + parser.add_argument("basepath") + parser.add_argument("datapath") return parser.parse_args() def main(): + start_time = time.time() args = parse_args() - sys.stdout.write("CONCATENATE ALL...") + sys.stdout.write("CONCATENATE ALL...") # don't want a newline here so can't use print sys.stdout.flush() - with open(args.config_path) as lichen_config: - lichen_config_data = json.load(lichen_config) - semester = lichen_config_data["semester"] - course = lichen_config_data["course"] - gradeable = lichen_config_data["gradeable"] - users_to_ignore = lichen_config_data["ignore_submissions"] + config_path = args.basepath + '/config.json' + if not os.path.isfile(config_path): + print(f"Error: invalid config path provided ({config_path})") + exit(1) - # this assumes regex is seperated by a ',' - regex_expressions = lichen_config_data["regex"].split(',') - regex_dirs = lichen_config_data["regex_dirs"] + with open(config_path) as config_file: + config = json.load(config_file) + + semester = config["semester"] + course = config["course"] + gradeable = config["gradeable"] + users_to_ignore = config["ignore_submissions"] + regex_patterns = config["regex"].split(',') + regex_dirs = config["regex_dirs"] # ========================================================================== - # error checking - course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", semester, course) - if not os.path.isdir(course_dir): - print("ERROR! 
", course_dir, " is not a valid course directory") - exit(1) + # Error checking - for e in regex_expressions: - # Check for backwards crawling + # Check for backwards crawling + for e in regex_patterns: if ".." in e: print('ERROR! Invalid path component ".." in regex') exit(1) @@ -64,99 +85,57 @@ def main(): exit(1) # ========================================================================== - # create the directory - concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", gradeable) - if not os.path.isdir(concatenated_dir): - os.makedirs(concatenated_dir) - - # ========================================================================== - count_total_files = 0 + # loop through and concatenate the selected files for each user in this gradeable for dir in regex_dirs: - submission_dir = os.path.join(course_dir, dir, gradeable) - - # more error checking - if not os.path.isdir(submission_dir): - print("ERROR! ", submission_dir, " is not a valid gradeable ", dir, " directory") - exit(1) - - # ========================================================================= - # walk the subdirectories - for user in sorted(os.listdir(submission_dir)): - if not os.path.isdir(os.path.join(submission_dir, user)): + gradeable_path = os.path.join(datapath, semester, course, dir, gradeable) + # loop over each user + for user in sorted(os.listdir(gradeable_path)): + user_path = os.path.join(gradeable_path, user) + if not os.path.isdir(user_path): continue elif user in users_to_ignore: continue - for version in sorted(os.listdir(os.path.join(submission_dir, user))): - if not os.path.isdir(os.path.join(submission_dir, user, version)): + + # loop over each version + for version in sorted(os.listdir(user_path)): + version_path = os.path.join(user_path, version) + if not os.path.isdir(version_path): continue - # ----------------------------------------------------------------- - # concatenate all files for this submissison into a single file - my_concatenated_dir = os.path.join(concatenated_dir, user, version) - if not os.path.isdir(my_concatenated_dir): - os.makedirs(my_concatenated_dir) - my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated") - - with open(my_concatenated_file, 'a') as my_cf: - # loop over all files in all subdirectories - base_path = os.path.join(submission_dir, user, version) - for my_dir, _dirs, my_files in os.walk(base_path): - # Determine if regex should be used (no regex provided - # is equivalent to selecting all files) - files = sorted(my_files) - if regex_expressions[0] != "": - files_filtered = [] - for e in regex_expressions: - files_filtered.extend(fnmatch.filter(files, e.strip())) - files = files_filtered - - for my_file in files: - # exclude any files we have ignored for all submissions - if my_file in IGNORED_FILES: - continue - absolute_path = os.path.join(my_dir, my_file) - # print a separator & filename - my_cf.write(f"=============== {my_file} ===============\n") - with open(absolute_path, encoding='ISO-8859-1') as tmp: - # append the contents of the file - my_cf.write(tmp.read()) - my_cf.write("\n") - count_total_files += 1 + output_file_path = os.path.join(args.basepath, user, + version, "submission.concatenated") + + if not os.path.exists(os.path.dirname(output_file_path)): + os.makedirs(os.path.dirname(output_file_path)) + + # append to concatenated file + with open(output_file_path, "a") as output_file: + concatenated_contents = getConcatFilesInDir(version_path, regex_patterns) + output_file.write(concatenated_contents) + + # 
========================================================================== - # iterate over all of the created submissions, checking to see if they are + # iterate over all of the created submissions, checking to see if they are empty # and adding a message to the top if so (to differentiate empty files from errors in the UI) - for user in os.listdir(concatenated_dir): - for version in os.listdir(os.path.join(concatenated_dir, user)): - my_concatenated_file = os.path.join(concatenated_dir, - user, version, "submission.concatenated") + for user in os.listdir(os.path.join(args.basepath, "users")): + user_path = os.path.join(args.basepath, "users", user) + for version in os.listdir(user_path): + version_path = user_path = os.path.join(user_path, version) + my_concatenated_file = os.path.join(version_path, "submission.concatenated") with open(my_concatenated_file, "r+") as my_cf: if my_cf.read() == "": my_cf.write("Error: No files matched provided regex in selected directories") # ========================================================================== - # concatenate any files in the provided_code directory - provided_code_path = os.path.join(course_dir, "lichen", "provided_code", gradeable) - output_dir = os.path.join(course_dir, "lichen", "concatenated", - gradeable, "provided_code", "provided_code") - output_file = os.path.join(output_dir, "submission.concatenated") - - if os.path.isdir(provided_code_path) and len(os.listdir(provided_code_path)) != 0: - # If the directory already exists, delete it and make a new one - if os.path.isdir(output_dir): - shutil.rmtree(output_dir) - os.makedirs(output_dir) - - with open(output_file, 'w') as of: - # Loop over all of the provided files and concatenate them - for file in sorted(os.listdir(provided_code_path)): - with open(os.path.join(provided_code_path, file), encoding='ISO-8859-1') as tmp: - # append the contents of the file - of.write(tmp.read()) + # concatenate provided code + with open(os.path.join(args.basepath, "provided_code", + "submission.concatenated"), "w") as file: + file.write(getConcatFilesInDir(os.path.join(args.basepath, "provided_code", "files")), []) # ========================================================================== - print("done") - print(f"{count_total_files} files concatenated") + end_time = time.time() + print("done in " + str(end_time - start_time) + " seconds") if __name__ == "__main__": From 388ff4ebe7a77bda88c9959352fe76327f5f97eb Mon Sep 17 00:00:00 2001 From: sbelsk Date: Mon, 28 Jun 2021 13:02:33 -0400 Subject: [PATCH 05/52] Fix python errors --- bin/concatenate_all.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py index eab7aa5..783e1ef 100644 --- a/bin/concatenate_all.py +++ b/bin/concatenate_all.py @@ -22,7 +22,7 @@ def getConcatFilesInDir(input_dir, regex_patterns): for my_dir, _dirs, my_files in os.walk(input_dir): # Determine if regex should be used (blank regex is equivalent to selecting all files) files = sorted(my_files) - if regex_expressions[0] != "": + if regex_patterns[0] != "": files_filtered = [] for e in regex_patterns: files_filtered.extend(fnmatch.filter(files, e.strip())) @@ -88,7 +88,7 @@ def main(): # loop through and concatenate the selected files for each user in this gradeable for dir in regex_dirs: - gradeable_path = os.path.join(datapath, semester, course, dir, gradeable) + gradeable_path = os.path.join(args.datapath, semester, course, dir, gradeable) # loop over each user for user in 
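The keys read from config.json are now all visible in the series: semester, course, gradeable, ignore_submissions, regex, and regex_dirs above, plus language, sequence_length, and threshold in the tokenizer, hasher, and comparator. A plausible config for a run — every value below is illustrative, not taken from a real course:

    {
        "semester": "s21",
        "course": "sample_course",
        "gradeable": "hw01",
        "language": "plaintext",
        "sequence_length": 4,
        "threshold": 5,
        "regex": "*.py, *.txt",
        "regex_dirs": ["submissions"],
        "ignore_submissions": []
    }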
From 053f0f6a503161a73867cc36835246c795bd8b0c Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Tue, 29 Jun 2021 10:29:26 -0400
Subject: [PATCH 06/52] Progress: everything through tokenization finished

---
 bin/concatenate_all.py            |  4 +-
 bin/process_all.sh                | 18 +++++----
 bin/tokenize_all.py               | 67 ++++++++++---------------------
 compare_hashes/compare_hashes.cpp |  2 +-
 4 files changed, 35 insertions(+), 56 deletions(-)

diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py
index 783e1ef..b289d21 100644
--- a/bin/concatenate_all.py
+++ b/bin/concatenate_all.py
@@ -103,7 +103,7 @@ def main():
                 if not os.path.isdir(version_path):
                     continue
 
-                output_file_path = os.path.join(args.basepath, user,
+                output_file_path = os.path.join(args.basepath, "users", user,
                                                 version, "submission.concatenated")
 
                 if not os.path.exists(os.path.dirname(output_file_path)):
@@ -130,7 +130,7 @@ def main():
     # concatenate provided code
     with open(os.path.join(args.basepath, "provided_code",
                            "submission.concatenated"), "w") as file:
-        file.write(getConcatFilesInDir(os.path.join(args.basepath, "provided_code", "files")), [])
+        file.write(getConcatFilesInDir(os.path.join(args.basepath, "provided_code", "files"), regex_patterns))

diff --git a/bin/process_all.sh b/bin/process_all.sh
index 24629ae..d523d99 100644
--- a/bin/process_all.sh
+++ b/bin/process_all.sh
@@ -6,14 +6,16 @@
 # TODO: Assert permissions, as necessary
 
 basepath=$1 # holds the path to a directory containing a config for this gradeable
+            # (probably .../lichen/gradeable// on Submitty)
+
 datapath=$2 # holds the path to a directory containing courses and their data
             # (probably /var/local/submitty/courses on Submitty)
 
 # kill the script if there is no config file
-if [ ! -f "${basepath}/config.json" ]; then
-  echo "Unable to find config.json in provided directory"
-  exit 1
-fi
+# if [ ! -f "${basepath}/config.json" ]; then
+#   echo "Unable to find config.json in provided directory"
+#   exit 1
+# fi
 
 # create these directories if they don't already exist
 mkdir -p "${basepath}/logs"
 mkdir -p "${basepath}/provided_code"
 mkdir -p "${basepath}/provided_code/files"
 mkdir -p "${basepath}/other_gradeables"
 mkdir -p "${basepath}/users"
 
 # run all of the modules and exit if an error occurs
-/usr/local/submitty/Lichen/bin/concatenate_all.py $basepath $datapath || exit 1
-#/usr/local/submitty/Lichen/bin/tokenize_all.py $basepath || exit 1
-#/usr/local/submitty/Lichen/bin/hash_all.py $basepath || exit 1
-#/usr/local/submitty/Lichen/bin/compare_hashes.out $basepath || exit 1
+./concatenate_all.py "${basepath}" "${datapath}" || exit 1
+./tokenize_all.py $basepath || exit 1
+#hash_all.py $basepath || exit 1
+#compare_hashes.out $basepath || exit 1

diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 632d4ef..8a74591 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -9,28 +9,18 @@
 import sys
 
 
-CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
-with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
-    OPEN_JSON = json.load(open_file)
-SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
-SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
-
-
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("config_path")
+    parser.add_argument("basepath")
     return parser.parse_args()
 
 
-def tokenize(args, my_concatenated_file, my_tokenized_file):
-
-    with open(args.config_path) as lichen_config:
-        lichen_config_data = json.load(lichen_config)
-        language = lichen_config_data["language"]
+def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
+    language = lichen_config_data["language"]
 
     language_token_data = dict()
 
-    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    data_json_path = "./data.json"  # data.json is in the Lichen/bin directory after install
     with open(data_json_path, 'r') as token_data_file:
         token_data = json.load(token_data_file)
         if language not in token_data:
@@ -39,8 +29,7 @@
         else:
             language_token_data = token_data[language]
 
-    tokenizer = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin",
-                             language_token_data["tokenizer"])
+    tokenizer = f"./{language_token_data['tokenizer']}"
     if not language_token_data.get("input_as_argument"):
         my_concatenated_file = f'< {my_concatenated_file}'
@@ -58,39 +47,27 @@
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
 
-    with open(args.config_path) as lichen_config:
+    with open(os.path.join(args.basepath, "config.json")) as lichen_config:
         lichen_config_data = json.load(lichen_config)
-        semester = lichen_config_data["semester"]
-        course = lichen_config_data["course"]
-        gradeable = lichen_config_data["gradeable"]
-
-    # ===========================================================================
-    # error checking
-    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", semester, course)
-    if not os.path.isdir(course_dir):
-        print("ERROR! ", course_dir, " is not a valid course directory")
-        exit(1)
-    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", gradeable)
-    if not os.path.isdir(concatenated_dir):
-        print("ERROR! ", concatenated_dir, " is not a valid gradeable concatenated directory")
-        exit(1)
-
-    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", gradeable)
 
     # ===========================================================================
     # walk the subdirectories
-    for user in sorted(os.listdir(concatenated_dir)):
-        for version in sorted(os.listdir(os.path.join(concatenated_dir, user))):
-            my_concatenated_file = os.path.join(concatenated_dir, user, version,
-                                                "submission.concatenated")
-
-            # ==================================================================
-            # create the directory
-            my_tokenized_dir = os.path.join(tokenized_dir, user, version)
-            if not os.path.isdir(my_tokenized_dir):
-                os.makedirs(my_tokenized_dir)
-            my_tokenized_file = os.path.join(my_tokenized_dir, "tokens.json")
-            tokenize(args, my_concatenated_file, my_tokenized_file)
+    users_dir = os.path.join(args.basepath, "users")
+    for user in sorted(os.listdir(users_dir)):
+        user_dir = os.path.join(users_dir, user)
+        if not os.path.isdir(user_dir):
+            continue
+
+        for version in sorted(os.listdir(user_dir)):
+            my_dir = os.path.join(user_dir, version)
+            if not os.path.isdir(my_dir):
+                continue
+
+            print(my_dir)
+
+            my_concatenated_file = os.path.join(my_dir, "submission.concatenated")
+            my_tokenized_file = os.path.join(my_dir, "tokens.json")
+            tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file)
 
     print("done")

diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
index b283e88..bf7b813 100644
--- a/compare_hashes/compare_hashes.cpp
+++ b/compare_hashes/compare_hashes.cpp
@@ -628,6 +628,6 @@ int main(int argc, char* argv[]) {
   // ---------------------------------------------------------------------------
   time(&overall_end);
   double overall_diff = difftime(overall_end, overall_start);
-  std::cout << "DONE in " << overall_diff << "s" << std::endl;
+  std::cout << "done in " << overall_diff << "s" << std::endl;
 }

From 9480b4759b5c5e68f71ed8241920b10f1b316313 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Tue, 29 Jun 2021 10:52:08 -0400
Subject: [PATCH 07/52] Everything works

---
 bin/hash_all.py     | 65 ++++++++++++++++-----------------------------
 bin/process_all.sh  |  4 +--
 bin/tokenize_all.py |  6 +++--
 3 files changed, 29 insertions(+), 46 deletions(-)

diff --git a/bin/hash_all.py b/bin/hash_all.py
index 3128cd8..1c5dac5 100644
--- a/bin/hash_all.py
+++ b/bin/hash_all.py
@@ -11,27 +11,18 @@
 import sys
 import hashlib
 
-CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
-with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
-    OPEN_JSON = json.load(open_file)
-SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
-SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
-
 
 def parse_args():
     parser = argparse.ArgumentParser(description="")
-    parser.add_argument("config_path")
-    args = parser.parse_args()
-    return args
+    parser.add_argument("basepath")
+    return parser.parse_args()
 
 
-def hasher(args, my_tokenized_file, my_hashes_file):
-    with open(args.config_path) as lichen_config:
-        lichen_config_data = json.load(lichen_config)
-        language = lichen_config_data["language"]
-        sequence_length = int(lichen_config_data["sequence_length"])
+def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
+    language = lichen_config_data["language"]
+    sequence_length = int(lichen_config_data["sequence_length"])
 
-    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    data_json_path = "./data.json"  # data.json is in the Lichen/bin directory after install
     with open(data_json_path) as token_data_file:
         token_data = json.load(token_data_file)
         if language not in token_data:
@@ -59,42 +50,32 @@
 def main():
     args = parse_args()
 
-    with open(args.config_path) as lichen_config:
+    with open(os.path.join(args.basepath, "config.json")) as lichen_config:
         lichen_config_data = json.load(lichen_config)
-        semester = lichen_config_data["semester"]
-        course = lichen_config_data["course"]
-        gradeable = lichen_config_data["gradeable"]
 
     sys.stdout.write("HASH ALL...")
     sys.stdout.flush()
 
     # =========================================================================
-    # error checking
-    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", semester, course)
-    if not os.path.isdir(course_dir):
-        print("ERROR! ", course_dir, " is not a valid course directory")
-        exit(1)
-    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", gradeable)
-    if not os.path.isdir(tokenized_dir):
-        print("ERROR! ", tokenized_dir, " is not a valid gradeable tokenized directory")
+    # walk the subdirectories
+    users_dir = os.path.join(args.basepath, "users")
+    if not os.path.isdir(users_dir):
+        print("Error: Unable to find users directory")
         exit(1)
 
-    hashes_dir = os.path.join(course_dir, "lichen", "hashes", gradeable)
+    for user in sorted(os.listdir(users_dir)):
+        user_dir = os.path.join(users_dir, user)
+        if not os.path.isdir(user_dir):
+            continue
 
-    # =========================================================================
-    # walk the subdirectories
-    for user in sorted(os.listdir(tokenized_dir)):
-        for version in sorted(os.listdir(os.path.join(tokenized_dir, user))):
-            my_tokenized_file = os.path.join(tokenized_dir, user, version, "tokens.json")
-
-            # =================================================================
-            # create the directory
-            my_hashes_dir = os.path.join(hashes_dir, user, version)
-            if not os.path.isdir(my_hashes_dir):
-                os.makedirs(my_hashes_dir)
-
-            my_hashes_file = os.path.join(my_hashes_dir, "hashes.txt")
-            hasher(args, my_tokenized_file, my_hashes_file)
+        for version in sorted(os.listdir(user_dir)):
+            my_dir = os.path.join(user_dir, version)
+            if not os.path.isdir(my_dir):
+                continue
+
+            my_tokenized_file = os.path.join(my_dir, "tokens.json")
+            my_hashes_file = os.path.join(my_dir, "hashes.txt")
+            hasher(lichen_config_data, my_tokenized_file, my_hashes_file)
 
     print("done")

diff --git a/bin/process_all.sh b/bin/process_all.sh
index d523d99..d7cbb7e 100644
--- a/bin/process_all.sh
+++ b/bin/process_all.sh
@@ -27,5 +27,5 @@
 # run all of the modules and exit if an error occurs
 ./concatenate_all.py "${basepath}" "${datapath}" || exit 1
 ./tokenize_all.py $basepath || exit 1
-#hash_all.py $basepath || exit 1
-#compare_hashes.out $basepath || exit 1
+./hash_all.py $basepath || exit 1
+./compare_hashes.out $basepath || exit 1

diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py
index 8a74591..00eb100 100644
--- a/bin/tokenize_all.py
+++ b/bin/tokenize_all.py
@@ -53,6 +53,10 @@ def main():
     # ===========================================================================
     # walk the subdirectories
     users_dir = os.path.join(args.basepath, "users")
+    if not os.path.isdir(users_dir):
+        print("Error: Unable to find users directory")
+        exit(1)
+
     for user in sorted(os.listdir(users_dir)):
         user_dir = os.path.join(users_dir, user)
         if not os.path.isdir(user_dir):
@@ -63,8 +67,6 @@ def main():
             if not os.path.isdir(my_dir):
                 continue
 
-            print(my_dir)
-
             my_concatenated_file = os.path.join(my_dir, "submission.concatenated")
             my_tokenized_file = os.path.join(my_dir, "tokens.json")
             tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file)
 
     print("done")
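The hasher rewritten above is the core fingerprinting step: a submission's token values are reduced to one truncated MD5 digest per sliding window of sequence_length tokens. A minimal self-contained restatement of that windowing logic (the function name is illustrative; the truncation to 8 hex digits mirrors the FIXME-marked line in hash_all.py):

    import hashlib

    def hash_token_windows(token_values, sequence_length):
        # one hash per window of sequence_length consecutive token values
        return [
            hashlib.md5(''.join(token_values[i:i + sequence_length]).encode()).hexdigest()[0:8]
            for i in range(0, len(token_values) - sequence_length + 1)
        ]

    # e.g. four tokens with a window of two yield three hashes
    print(hash_token_windows(['if', '(', 'x', ')'], 2))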
= os.path.join(my_dir, "submission.concatenated") my_tokenized_file = os.path.join(my_dir, "tokens.json") tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file) From 0938e76b26f53f516e6f7a2a6c4a871ffbeabf09 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Tue, 29 Jun 2021 11:19:27 -0400 Subject: [PATCH 08/52] Add timers --- bin/concatenate_all.py | 2 +- bin/hash_all.py | 6 +++++- bin/process_all.sh | 14 ++++++++++---- bin/tokenize_all.py | 6 +++++- compare_hashes/compare_hashes.cpp | 10 +++++----- 5 files changed, 26 insertions(+), 12 deletions(-) diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py index b289d21..d140d58 100644 --- a/bin/concatenate_all.py +++ b/bin/concatenate_all.py @@ -134,7 +134,7 @@ def main(): # ========================================================================== end_time = time.time() - print("done in " + str(end_time - start_time) + " seconds") + print("done in " + "%.0f" % (end_time - start_time) + " seconds") if __name__ == "__main__": diff --git a/bin/hash_all.py b/bin/hash_all.py index 1c5dac5..24012d4 100644 --- a/bin/hash_all.py +++ b/bin/hash_all.py @@ -8,6 +8,7 @@ import argparse import os import json +import time import sys import hashlib @@ -48,6 +49,7 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file): def main(): + start_time = time.time() args = parse_args() with open(os.path.join(args.basepath, "config.json")) as lichen_config: @@ -77,7 +79,9 @@ def main(): my_hashes_file = os.path.join(my_dir, "hashes.txt") hasher(lichen_config_data, my_tokenized_file, my_hashes_file) - print("done") + # ========================================================================== + end_time = time.time() + print("done in " + "%.0f" % (end_time - start_time) + " seconds") if __name__ == "__main__": diff --git a/bin/process_all.sh b/bin/process_all.sh index d7cbb7e..ce74cf9 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -12,10 +12,16 @@ datapath=$2 # holds the path to a directory conatining courses and their data # (probably /var/local/submitty/courses on Submitty) # kill the script if there is no config file -# if [ ! -f "${basepath}/config.json" ]; then -# echo "Unable to find config.json in provided directory" -# exit 1 -# fi +if [ ! 
-f "${basepath}/config.json" ]; then + echo "Unable to find config.json in provided directory" + exit 1 +fi + +# delete any previous run results +# TODO: determine if any caching should occur +rm -rf "${basepath}/logs" +rm -rf "${basepath}/other_gradeables" +rm -rf "${basepath}/users" # create these directories if they don't already exist mkdir -p "${basepath}/logs" diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py index 00eb100..d00b3cf 100644 --- a/bin/tokenize_all.py +++ b/bin/tokenize_all.py @@ -6,6 +6,7 @@ import argparse import os import json +import time import sys @@ -42,6 +43,7 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file): def main(): + start_time = time.time() args = parse_args() sys.stdout.write("TOKENIZE ALL...") @@ -71,7 +73,9 @@ def main(): my_tokenized_file = os.path.join(my_dir, "tokens.json") tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file) - print("done") + # ========================================================================== + end_time = time.time() + print("done in " + "%.0f" % (end_time - start_time) + " seconds") if __name__ == "__main__": diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index bf7b813..b362069 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -257,7 +257,7 @@ int main(int argc, char* argv[]) { time(&end); double diff = difftime(end, start); - std::cout << "finished loading in " << diff << "s" << std::endl; + std::cout << "finished loading in " << diff << " seconds" << std::endl; // --------------------------------------------------------------------------- // THIS IS THE MAIN PLAGIARISM DETECTION ALGORITHM @@ -325,7 +325,7 @@ int main(int argc, char* argv[]) { time(&end); diff = difftime(end, start); - std::cout << "finished walking in " << diff << "s" << std::endl; + std::cout << "finished walking in " << diff << " seconds" << std::endl; // --------------------------------------------------------------------------- // Writing the output files and merging the results @@ -514,7 +514,7 @@ int main(int argc, char* argv[]) { time(&end); diff = difftime(end, start); - std::cout << "done merging and writing matches files in " << diff << "s" << std::endl; + std::cout << "done merging and writing matches files in " << diff << " seconds" << std::endl; // --------------------------------------------------------------------------- // Create a general summary of rankings of users by percentage match @@ -623,11 +623,11 @@ int main(int argc, char* argv[]) { time(&end); diff = difftime(end, start); - std::cout << "finished writing rankings in " << diff << "s" << std::endl; + std::cout << "finished writing rankings in " << diff << " seconds" << std::endl; // --------------------------------------------------------------------------- time(&overall_end); double overall_diff = difftime(overall_end, overall_start); - std::cout << "done in " << overall_diff << "s" << std::endl; + std::cout << "done in " << overall_diff << " seconds" << std::endl; } From 563642a857a49ec200c59cccc15e955ffea7dbb5 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Tue, 29 Jun 2021 12:22:37 -0400 Subject: [PATCH 09/52] remove unnecessary code --- bin/process_all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/process_all.sh b/bin/process_all.sh index ce74cf9..a6479a7 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -31,7 +31,7 @@ mkdir -p "${basepath}/other_gradeables" mkdir -p "${basepath}/users" # run all of the modules 
and exit if an error occurs -./concatenate_all.py "${basepath}" "${datapath}" || exit 1 +./concatenate_all.py $basepath $datapath || exit 1 ./tokenize_all.py $basepath || exit 1 ./hash_all.py $basepath || exit 1 ./compare_hashes.out $basepath || exit 1 From 210a778b03086916c8a37df30b6d7aa163f46797 Mon Sep 17 00:00:00 2001 From: sbelsk Date: Wed, 30 Jun 2021 08:23:54 -0400 Subject: [PATCH 10/52] little python changes --- bin/concatenate_all.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/concatenate_all.py b/bin/concatenate_all.py index d140d58..52403da 100644 --- a/bin/concatenate_all.py +++ b/bin/concatenate_all.py @@ -55,7 +55,7 @@ def main(): sys.stdout.write("CONCATENATE ALL...") # don't want a newline here so can't use print sys.stdout.flush() - config_path = args.basepath + '/config.json' + config_path = os.path.join(args.basepath, "config.json") if not os.path.isfile(config_path): print(f"Error: invalid config path provided ({config_path})") exit(1) @@ -130,7 +130,8 @@ def main(): # concatenate provided code with open(os.path.join(args.basepath, "provided_code", "submission.concatenated"), "w") as file: - file.write(getConcatFilesInDir(os.path.join(args.basepath, "provided_code", "files"), regex_patterns)) + provided_code_files = os.path.join(args.basepath, "provided_code", "files") + file.write(getConcatFilesInDir(provided_code_files, regex_patterns)) # ========================================================================== end_time = time.time() From 473ff7bd215f91c11ffeb1ac1e30bea3c8b71605 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Wed, 30 Jun 2021 08:31:43 -0400 Subject: [PATCH 11/52] William made an oopsie (forgot to deal with provided code) --- bin/hash_all.py | 6 ++++++ bin/process_all.sh | 4 ++++ bin/tokenize_all.py | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/bin/hash_all.py b/bin/hash_all.py index 24012d4..2e6e544 100644 --- a/bin/hash_all.py +++ b/bin/hash_all.py @@ -79,6 +79,12 @@ def main(): my_hashes_file = os.path.join(my_dir, "hashes.txt") hasher(lichen_config_data, my_tokenized_file, my_hashes_file) + # =========================================================================== + # hash the provided code + provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json") + provided_code_hashed = os.path.join(args.basepath, "provided_code", "hashes.txt") + hasher(lichen_config_data, provided_code_tokenized, provided_code_hashed) + # ========================================================================== end_time = time.time() print("done in " + "%.0f" % (end_time - start_time) + " seconds") diff --git a/bin/process_all.sh b/bin/process_all.sh index a6479a7..2e01f5b 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -22,6 +22,10 @@ fi rm -rf "${basepath}/logs" rm -rf "${basepath}/other_gradeables" rm -rf "${basepath}/users" +rm "${basepath}/overall_ranking.txt" +rm "${basepath}/provided_code/submission.concatenated" +rm "${basepath}/provided_code/tokens.json" +rm "${basepath}/provided_code/hashes.txt" # create these directories if they don't already exist mkdir -p "${basepath}/logs" diff --git a/bin/tokenize_all.py b/bin/tokenize_all.py index d00b3cf..6dac19f 100644 --- a/bin/tokenize_all.py +++ b/bin/tokenize_all.py @@ -73,6 +73,12 @@ def main(): my_tokenized_file = os.path.join(my_dir, "tokens.json") tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file) + # =========================================================================== + # tokenize the provided code + 
provided_code_concat = os.path.join(args.basepath, "provided_code", "submission.concatenated") + provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json") + tokenize(lichen_config_data, provided_code_concat, provided_code_tokenized) + # ========================================================================== end_time = time.time() print("done in " + "%.0f" % (end_time - start_time) + " seconds") From 00675a3917226263a9a27629a86a8e4e1181fffb Mon Sep 17 00:00:00 2001 From: williamjallen Date: Thu, 1 Jul 2021 16:17:55 -0400 Subject: [PATCH 12/52] Fix minor bugs Fix process_all.sh script plus fix spelling issue and prevent hash_all.py from breaking when empty tokenized files are written --- bin/hash_all.py | 19 +++++++++++-------- bin/process_all.sh | 23 +++++++++++++++-------- compare_hashes/compare_hashes.cpp | 2 +- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/bin/hash_all.py b/bin/hash_all.py index 2e6e544..571cdf4 100644 --- a/bin/hash_all.py +++ b/bin/hash_all.py @@ -37,15 +37,17 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file): with open(my_tokenized_file, 'r', encoding='ISO-8859-1') as my_tf: with open(my_hashes_file, 'w') as my_hf: tokens = json.load(my_tf) - token_values = [str(x.get(token_data[language]["token_value"])) - for x in tokens] - num = len(tokens) - # FIXME: this truncation should be adjusted after testing - token_hashed_values = [(hashlib.md5(''.join( - token_values[x:x+sequence_length]).encode()) - .hexdigest())[0:8] for x in range(0, num-sequence_length+1)] + # write empty hashes file if the tokens file was empty (such as + # when there is no provided code) + if tokens is not None: + token_values = [str(x[token_data[language]["token_value"]]) for x in tokens] + num = len(tokens) + # FIXME: this truncation should be adjusted after testing + token_hashed_values = [(hashlib.md5(''.join( + token_values[x:x+sequence_length]).encode()) + .hexdigest())[0:8] for x in range(0, num-sequence_length+1)] - my_hf.write('\n'.join(token_hashed_values)) + my_hf.write('\n'.join(token_hashed_values)) def main(): @@ -75,6 +77,7 @@ def main(): if not os.path.isdir(my_dir): continue + print(my_dir) my_tokenized_file = os.path.join(my_dir, "tokens.json") my_hashes_file = os.path.join(my_dir, "hashes.txt") hasher(lichen_config_data, my_tokenized_file, my_hashes_file) diff --git a/bin/process_all.sh b/bin/process_all.sh index 2e01f5b..1a08c4d 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -1,3 +1,5 @@ +#!/bin/sh + # This script is the startup script for Lichen. It accepts a single path to a # directory containing a config file and creates the necessary output directories # as appropriate, relative to the provided path. 
It is possible to run this script @@ -22,10 +24,10 @@ fi rm -rf "${basepath}/logs" rm -rf "${basepath}/other_gradeables" rm -rf "${basepath}/users" -rm "${basepath}/overall_ranking.txt" -rm "${basepath}/provided_code/submission.concatenated" -rm "${basepath}/provided_code/tokens.json" -rm "${basepath}/provided_code/hashes.txt" +rm -f "${basepath}/overall_ranking.txt" +rm -f "${basepath}/provided_code/submission.concatenated" +rm -f "${basepath}/provided_code/tokens.json" +rm -f "${basepath}/provided_code/hashes.txt" # create these directories if they don't already exist mkdir -p "${basepath}/logs" @@ -34,8 +36,13 @@ mkdir -p "${basepath}/provided_code/files" mkdir -p "${basepath}/other_gradeables" mkdir -p "${basepath}/users" +log_file="${basepath}/logs/lichen_job_output.txt" + +cd $(dirname "${0}") + # run all of the modules and exit if an error occurs -./concatenate_all.py $basepath $datapath || exit 1 -./tokenize_all.py $basepath || exit 1 -./hash_all.py $basepath || exit 1 -./compare_hashes.out $basepath || exit 1 +echo "Beginning Lichen run: $(date +"%Y-%m-%d %H:%M:%S")" >> $log_file 2>&1 +./concatenate_all.py $basepath $datapath >> $log_file 2>&1 || exit 1 +./tokenize_all.py $basepath >> $log_file 2>&1 || exit 1 +./hash_all.py $basepath >> $log_file 2>&1 || exit 1 +./compare_hashes.out $basepath >> $log_file 2>&1 || exit 1 diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp index b362069..ab9fa2c 100644 --- a/compare_hashes/compare_hashes.cpp +++ b/compare_hashes/compare_hashes.cpp @@ -518,7 +518,7 @@ int main(int argc, char* argv[]) { // --------------------------------------------------------------------------- // Create a general summary of rankings of users by percentage match - std::cout << "writing rakings files..." << std::endl; + std::cout << "writing rankings files..." 
<< std::endl; time(&start); // create a single file of students ranked by highest percentage of code plagiarised From 8a5db9d21c72d7c9578dcd14131491c2740a5e3a Mon Sep 17 00:00:00 2001 From: williamjallen Date: Fri, 2 Jul 2021 16:13:40 -0400 Subject: [PATCH 13/52] Fix permissions issue with provided code editing --- bin/process_all.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/process_all.sh b/bin/process_all.sh index 1a08c4d..6b958bf 100644 --- a/bin/process_all.sh +++ b/bin/process_all.sh @@ -36,6 +36,9 @@ mkdir -p "${basepath}/provided_code/files" mkdir -p "${basepath}/other_gradeables" mkdir -p "${basepath}/users" +# the default is r-x and we need PHP to be able to write if edits are made to the provided code +chmod g=rwxs "${basepath}/provided_code/files" + log_file="${basepath}/logs/lichen_job_output.txt" cd $(dirname "${0}") From f7abb099c6ad81c62c145ebdb86995c5bd09beff Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:44:29 -0400 Subject: [PATCH 14/52] Add initial script --- .github/workflows/lichen_run.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/lichen_run.yml diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml new file mode 100644 index 0000000..6084780 --- /dev/null +++ b/.github/workflows/lichen_run.yml @@ -0,0 +1,17 @@ +name: Test Lichen + +on: [push] + +jobs: + python-lint: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.6' + - name: Create Directory Structure + run: | + mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ + mkdir -p /usr/local/submitty/Lichen/ + ls From 3ba16d23a0b7389e54c97a10ef76b260fbf86828 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:46:39 -0400 Subject: [PATCH 15/52] Update lichen_run.yml --- .github/workflows/lichen_run.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 6084780..2459f2b 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -3,7 +3,7 @@ name: Test Lichen on: [push] jobs: - python-lint: + Test Lichen: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 @@ -12,6 +12,6 @@ jobs: python-version: '3.6' - name: Create Directory Structure run: | - mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ - mkdir -p /usr/local/submitty/Lichen/ + sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ + sudo mkdir -p /usr/local/submitty/Lichen/ ls From e0ac0dac9bcbe03c6acd80c54b227091034043e9 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:47:30 -0400 Subject: [PATCH 16/52] Update lichen_run.yml --- .github/workflows/lichen_run.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 2459f2b..8086f63 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -3,7 +3,7 @@ name: Test Lichen on: [push] jobs: - Test Lichen: + test-lichen: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 From ddbd29f7b66397ee11389c23b4a2dd573dcf27a6 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:51:12 -0400 Subject: [PATCH 17/52] Update lichen_run.yml --- .github/workflows/lichen_run.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 8086f63..22ffe60 100644 --- 
a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -14,4 +14,6 @@ jobs: run: | sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ sudo mkdir -p /usr/local/submitty/Lichen/ - ls + cp * /usr/local/submitty/GIT_CHECKOUT/Lichen/ + bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh + ls /usr/local/submitty/Lichen/ From 6c3aadd1f2f49c2d49c029643aaff76f9513bb9f Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:52:22 -0400 Subject: [PATCH 18/52] Update lichen_run.yml --- .github/workflows/lichen_run.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 22ffe60..2fd50f7 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -14,6 +14,6 @@ jobs: run: | sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ sudo mkdir -p /usr/local/submitty/Lichen/ - cp * /usr/local/submitty/GIT_CHECKOUT/Lichen/ - bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh - ls /usr/local/submitty/Lichen/ + sudo cp * /usr/local/submitty/GIT_CHECKOUT/Lichen/ + sudo bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh + sudo ls /usr/local/submitty/Lichen/ From d644fde6aa7bbe742c81d89375d2809a3cd108ab Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:53:21 -0400 Subject: [PATCH 19/52] Update lichen_run.yml --- .github/workflows/lichen_run.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 2fd50f7..c79a432 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -14,6 +14,6 @@ jobs: run: | sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ sudo mkdir -p /usr/local/submitty/Lichen/ - sudo cp * /usr/local/submitty/GIT_CHECKOUT/Lichen/ + sudo cp -r * /usr/local/submitty/GIT_CHECKOUT/Lichen/ sudo bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh sudo ls /usr/local/submitty/Lichen/ From 67566b0020135da1604ba2f7c85daf33fdba5dfe Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 09:56:02 -0400 Subject: [PATCH 20/52] add boost --- .github/workflows/lichen_run.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index c79a432..7e2ea2b 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -10,6 +10,9 @@ jobs: - uses: actions/setup-python@v2 with: python-version: '3.6' + - name: Install Dependencies + run: | + sudo apt install libboost-all-dev - name: Create Directory Structure run: | sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/ From 32ff9869af4217bdf0da69aec840dfa366586e90 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 10:03:49 -0400 Subject: [PATCH 21/52] add testing file --- .github/workflows/lichen_run.yml | 4 +++- tests/tests.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 tests/tests.py diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml index 7e2ea2b..f2e48d6 100644 --- a/.github/workflows/lichen_run.yml +++ b/.github/workflows/lichen_run.yml @@ -19,4 +19,6 @@ jobs: sudo mkdir -p /usr/local/submitty/Lichen/ sudo cp -r * /usr/local/submitty/GIT_CHECKOUT/Lichen/ sudo bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh - sudo ls /usr/local/submitty/Lichen/ + - name: Run Tests + run: | + python3 ./tests.py diff --git a/tests/tests.py 
From be1a57fa28b7252e127854227b380cee06e0c625 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 10:05:03 -0400
Subject: [PATCH 22/52] forgot that paths are important

---
 .github/workflows/lichen_run.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index f2e48d6..9182126 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -21,4 +21,4 @@ jobs:
         sudo bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh
     - name: Run Tests
       run: |
-        python3 ./tests.py
+        python3 ./tests/tests.py

From 4b01ead4c13d6790afedeadcc8b39e0d607d593a Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 10:16:39 -0400
Subject: [PATCH 23/52] Make separate setup.sh script

---
 .github/workflows/lichen_run.yml | 5 +----
 tests/setup.sh                   | 8 ++++++++
 tests/tests.py                   | 2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)
 create mode 100644 tests/setup.sh

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index 9182126..ca8bdf1 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -15,10 +15,7 @@ jobs:
         sudo apt install libboost-all-dev
     - name: Create Directory Structure
       run: |
-        sudo mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/
-        sudo mkdir -p /usr/local/submitty/Lichen/
-        sudo cp -r * /usr/local/submitty/GIT_CHECKOUT/Lichen/
-        sudo bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh
+        sudo ./tests/setup.sh
     - name: Run Tests
       run: |
         python3 ./tests/tests.py
diff --git a/tests/setup.sh b/tests/setup.sh
new file mode 100644
index 0000000..b226963
--- /dev/null
+++ b/tests/setup.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/
+cp -r * /usr/local/submitty/GIT_CHECKOUT/Lichen/
+
+mkdir -p /usr/local/submitty/Lichen/
+
+bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh
diff --git a/tests/tests.py b/tests/tests.py
index e615ac7..fa41488 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -3,7 +3,7 @@
 
 class TestTokenizers(unittest.TestCase):
 
-    def test(self):
+    def testPlaintextTokenizer(self):
         print('test!')

From 2b15816ec88633055e8b204d5c4829b899bfc601 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 10:18:17 -0400
Subject: [PATCH 24/52] Update lichen_run.yml

---
 .github/workflows/lichen_run.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index ca8bdf1..e8093ca 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -15,7 +15,7 @@ jobs:
         sudo apt install libboost-all-dev
     - name: Create Directory Structure
       run: |
-        sudo ./tests/setup.sh
+        sudo bash ./tests/setup.sh
     - name: Run Tests
       run: |
         python3 ./tests/tests.py
From 0595eb396d3dd415227abd9f2ff0063069aaba93 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:31:07 -0400
Subject: [PATCH 25/52] Adjust file structure, add setup script

---
 .flake8                                            |  1 +
 .../plaintext/expected_output/output.json          |  0
 .../output_ignore_everything.json                  |  0
 .../output_ignore_newlines.json                    |  0
 .../output_ignore_punctuation.json                 |  0
 .../expected_output/output_to_lower.json           |  0
 .../data/tokenizer}/plaintext/input.txt            |  0
 tests/setup.sh                                     | 23 +++++++++++++++----
 tests/tests.py                                     | 18 +++++++++++++--
 9 files changed, 36 insertions(+), 6 deletions(-)
 rename {tokenizer => tests/data/tokenizer}/plaintext/expected_output/output.json (100%)
 rename {tokenizer => tests/data/tokenizer}/plaintext/expected_output/output_ignore_everything.json (100%)
 rename {tokenizer => tests/data/tokenizer}/plaintext/expected_output/output_ignore_newlines.json (100%)
 rename {tokenizer => tests/data/tokenizer}/plaintext/expected_output/output_ignore_punctuation.json (100%)
 rename {tokenizer => tests/data/tokenizer}/plaintext/expected_output/output_to_lower.json (100%)
 rename {tokenizer => tests/data/tokenizer}/plaintext/input.txt (100%)

diff --git a/.flake8 b/.flake8
index 5d69a55..33a5336 100644
--- a/.flake8
+++ b/.flake8
@@ -5,3 +5,4 @@ exclude=
 per-file-ignores =
     tokenizer/mips/mips_tokenizer.py:W605
+    tests/tests.py:E501
diff --git a/tokenizer/plaintext/expected_output/output.json b/tests/data/tokenizer/plaintext/expected_output/output.json
similarity index 100%
rename from tokenizer/plaintext/expected_output/output.json
rename to tests/data/tokenizer/plaintext/expected_output/output.json
diff --git a/tokenizer/plaintext/expected_output/output_ignore_everything.json b/tests/data/tokenizer/plaintext/expected_output/output_ignore_everything.json
similarity index 100%
rename from tokenizer/plaintext/expected_output/output_ignore_everything.json
rename to tests/data/tokenizer/plaintext/expected_output/output_ignore_everything.json
diff --git a/tokenizer/plaintext/expected_output/output_ignore_newlines.json b/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json
similarity index 100%
rename from tokenizer/plaintext/expected_output/output_ignore_newlines.json
rename to tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json
diff --git a/tokenizer/plaintext/expected_output/output_ignore_punctuation.json b/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
similarity index 100%
rename from tokenizer/plaintext/expected_output/output_ignore_punctuation.json
rename to tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
diff --git a/tokenizer/plaintext/expected_output/output_to_lower.json b/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json
similarity index 100%
rename from tokenizer/plaintext/expected_output/output_to_lower.json
rename to tests/data/tokenizer/plaintext/expected_output/output_to_lower.json
diff --git a/tokenizer/plaintext/input.txt b/tests/data/tokenizer/plaintext/input.txt
similarity index 100%
rename from tokenizer/plaintext/input.txt
rename to tests/data/tokenizer/plaintext/input.txt
diff --git a/tests/setup.sh b/tests/setup.sh
index b226963..f18c0e3 100644
--- a/tests/setup.sh
+++ b/tests/setup.sh
@@ -1,8 +1,23 @@
 #!/usr/bin/env bash
 
-mkdir -p /usr/local/submitty/GIT_CHECKOUT/Lichen/
-cp -r * /usr/local/submitty/GIT_CHECKOUT/Lichen/
+lichen_repository_dir=/usr/local/submitty/GIT_CHECKOUT/Lichen/
+lichen_installation_dir=/usr/local/submitty/Lichen/
+lichen_data_dir=/var/local/submitty/courses/
 
-mkdir -p /usr/local/submitty/Lichen/
+# make a simulated GIT_CHECKOUT directory
+mkdir -p $lichen_repository_dir
+cp -r * $lichen_repository_dir
+cd $lichen_repository_dir
 
-bash /usr/local/submitty/GIT_CHECKOUT/Lichen/install_lichen.sh
+# install Lichen
+mkdir -p $lichen_installation_dir
+bash $lichen_repository_dir/install_lichen.sh
+
+# SETUP TOKENIZER TESTS ########################################################
+tokenizer_tests_course=$lichen_data_dir/f21/test_tokenizers/lichen/
+
+# make a simulated lichen path for the test_tokenizers course
+mkdir -p $tokenizer_tests_course
+
+# set up file structure for plaintext tokenizer tests
+# (doesn't need a full file structure, just a place to put files)
+mkdir -p $tokenizer_tests_course/plaintext_tokenizer_tests/
diff --git a/tests/tests.py b/tests/tests.py
index fa41488..be0077d 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,10 +1,24 @@
 import unittest
+import os
 
+lichen_repository_dir = "/usr/local/submitty/GIT_CHECKOUT/Lichen/"
+lichen_installation_dir = "/usr/local/submitty/Lichen/"
+lichen_data_dir = "/var/local/submitty/courses/"
 
-class TestTokenizers(unittest.TestCase):
+class TestPlaintextTokenizer(unittest.TestCase):
 
     def testPlaintextTokenizer(self):
-        print('test!')
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json"
+
+        command = f"{lichen_installation_dir}/plaintext_tokenizer.out {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            print(file.read())
+
+        os.remove(output_file)
 
 
 if __name__ == '__main__':
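One caveat with the approach above: `os.system` silently swallows non-zero exit codes, so a crashed tokenizer would surface only as a confusing output diff. A sketch of the same call via `subprocess.run`, which raises on failure (not what the series does; `input_file` and `output_file` are the variables defined in the test above):

    import subprocess

    # equivalent of `plaintext_tokenizer.out <input> > <output>` with error checking;
    # check=True raises CalledProcessError instead of silently returning a status
    with open(output_file, "w") as stdout:
        subprocess.run([f"{lichen_installation_dir}/plaintext_tokenizer.out", input_file],
                       stdout=stdout, check=True)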
From 10dbed2e8d93c53c5492f7f2cad25b15bbc9af2 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:32:31 -0400
Subject: [PATCH 26/52] need sudo for test

---
 .github/workflows/lichen_run.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index e8093ca..2b322e6 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -18,4 +18,4 @@ jobs:
         sudo bash ./tests/setup.sh
     - name: Run Tests
       run: |
-        python3 ./tests/tests.py
+        sudo python3 ./tests/tests.py

From b9b085754f6a9c85c633ab6e9f0cc5dbdb1ab217 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:39:27 -0400
Subject: [PATCH 27/52] Update tests.py

---
 tests/tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/tests.py b/tests/tests.py
index be0077d..286c8e8 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -8,6 +8,7 @@ class TestPlaintextTokenizer(unittest.TestCase):
 
     def testPlaintextTokenizer(self):
+        print("test starting")
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json"
@@ -19,6 +20,7 @@ def testPlaintextTokenizer(self):
             print(file.read())
 
         os.remove(output_file)
+        print("test complete")
 
 
 if __name__ == '__main__':

From c50e01711723c0a2dd12c9c3198fd3f6c8515dcc Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:43:45 -0400
Subject: [PATCH 28/52] fix path

---
 tests/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests.py b/tests/tests.py
index 286c8e8..08ea2ed 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -13,7 +13,7 @@ def testPlaintextTokenizer(self):
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json"
 
-        command = f"{lichen_installation_dir}/plaintext_tokenizer.out {input_file} > {output_file}"
+        command = f"{lichen_installation_dir}plaintext_tokenizer.out {input_file} > {output_file}"
         os.system(command)
 
         with open(output_file) as file:
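This fix and the next few are all missing- or doubled-slash bugs from gluing paths together with f-strings. `os.path.join` sidesteps that whole class of errors; for example:

    import os

    # builds ".../Lichen/bin/plaintext_tokenizer.out" correctly whether or not
    # lichen_installation_dir carries a trailing slash
    tokenizer = os.path.join(lichen_installation_dir, "bin", "plaintext_tokenizer.out")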
{output_file}" + command = f"{lichen_installation_dir}plaintext_tokenizer.out {input_file} > {output_file}" os.system(command) with open(output_file) as file: From 30df2b80146b35bedd474a56818edf686b96a93c Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 11:47:17 -0400 Subject: [PATCH 29/52] fix path --- tests/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests.py b/tests/tests.py index 08ea2ed..8200e1c 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -13,7 +13,7 @@ def testPlaintextTokenizer(self): output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json" # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json" - command = f"{lichen_installation_dir}plaintext_tokenizer.out {input_file} > {output_file}" + command = f"{lichen_installation_dir}bin/plaintext_tokenizer.out {input_file} > {output_file}" os.system(command) with open(output_file) as file: From 8554c60fbd971bb0d9a561ca84c1b4bb939c855e Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 11:50:36 -0400 Subject: [PATCH 30/52] Update tests.py --- tests/tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 8200e1c..296383a 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -8,19 +8,17 @@ class TestPlaintextTokenizer(unittest.TestCase): def testPlaintextTokenizer(self): - print("test starting") input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt" output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json" # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json" - command = f"{lichen_installation_dir}bin/plaintext_tokenizer.out {input_file} > {output_file}" + command = f"{lichen_installation_dir}bin/plaintext_tokenizer.out < {input_file} > {output_file}" os.system(command) with open(output_file) as file: print(file.read()) os.remove(output_file) - print("test complete") if __name__ == '__main__': From 32c7aa16cdad984e7d0461db998edafe8c67eb45 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 11:55:04 -0400 Subject: [PATCH 31/52] add assertion to implement test --- tests/tests.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 296383a..c573507 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -10,14 +10,20 @@ class TestPlaintextTokenizer(unittest.TestCase): def testPlaintextTokenizer(self): input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt" output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json" - # expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json" + expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json" command = f"{lichen_installation_dir}bin/plaintext_tokenizer.out < {input_file} > {output_file}" os.system(command) with open(output_file) as file: - print(file.read()) + actual_output = file.read() + with open(expected_output_file) as file: + expected_output = file.read() + + self.assertEqual(actual_output, expected_output) + + # clean up the files os.remove(output_file) From 5956ec4fedec06c7517f547e3d420fae43103669 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Sat, 3 Jul 2021 11:57:04 -0400 Subject: [PATCH 32/52] fix more paths --- tests/tests.py | 8 ++++---- 1 file changed, 4 
From 5956ec4fedec06c7517f547e3d420fae43103669 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:57:04 -0400
Subject: [PATCH 32/52] fix more paths

---
 tests/tests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index c573507..3dddb37 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,9 +1,9 @@
 import unittest
 import os
 
-lichen_repository_dir = "/usr/local/submitty/GIT_CHECKOUT/Lichen/"
-lichen_installation_dir = "/usr/local/submitty/Lichen/"
-lichen_data_dir = "/var/local/submitty/courses/"
+lichen_repository_dir = "/usr/local/submitty/GIT_CHECKOUT/Lichen"
+lichen_installation_dir = "/usr/local/submitty/Lichen"
+lichen_data_dir = "/var/local/submitty/courses"
 
 
 class TestPlaintextTokenizer(unittest.TestCase):
@@ -12,7 +12,7 @@ def testPlaintextTokenizer(self):
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json"
 
-        command = f"{lichen_installation_dir}bin/plaintext_tokenizer.out < {input_file} > {output_file}"
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out < {input_file} > {output_file}"
         os.system(command)

From 63df1904436b2e08e3f63491c9dc58abd9624035 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 11:58:57 -0400
Subject: [PATCH 33/52] fix another path issue

---
 tests/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests.py b/tests/tests.py
index 3dddb37..f9e5faa 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -10,7 +10,7 @@ class TestPlaintextTokenizer(unittest.TestCase):
     def testPlaintextTokenizer(self):
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output.json"
 
         command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out < {input_file} > {output_file}"
         os.system(command)

From 57db882fb9711c1fa4e36d9ab346cbdf393baa89 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 12:04:38 -0400
Subject: [PATCH 34/52] Add second test

---
 tests/tests.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/tests.py b/tests/tests.py
index f9e5faa..3f25942 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -26,6 +26,25 @@ def testPlaintextTokenizer(self):
         # clean up the files
         os.remove(output_file)
 
+    def testPlaintextTokenizerIgnoreNewlines(self):
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"
+
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out < {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
 
 if __name__ == '__main__':
     unittest.main()
From 250d998aeaae647cc8387d928c6c885845ef9899 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 13:00:49 -0400
Subject: [PATCH 35/52] Update tests.py

---
 tests/tests.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/tests.py b/tests/tests.py
index 3f25942..de5a531 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -8,6 +8,8 @@ class TestPlaintextTokenizer(unittest.TestCase):
 
     def testPlaintextTokenizer(self):
+        self.maxDiff = None
+
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output.json"
@@ -27,6 +29,8 @@ def testPlaintextTokenizer(self):
         os.remove(output_file)
 
     def testPlaintextTokenizerIgnoreNewlines(self):
+        self.maxDiff = None
+
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"

From f452fa4ddbb5f28511e3f546a63ce852848bcd70 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 13:05:22 -0400
Subject: [PATCH 36/52] it's important to run the right command to get the
 right results...

---
 tests/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests.py b/tests/tests.py
index de5a531..c7c2f5d 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -35,7 +35,7 @@ def testPlaintextTokenizerIgnoreNewlines(self):
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"
 
-        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out < {input_file} > {output_file}"
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_newlines < {input_file} > {output_file}"
         os.system(command)
 
         with open(output_file) as file:

From b8103feb450608f90745af13daede85fd18f68ba Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 13:12:47 -0400
Subject: [PATCH 37/52] Add third test

---
 tests/tests.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/tests.py b/tests/tests.py
index c7c2f5d..5973224 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -49,6 +49,27 @@ def testPlaintextTokenizerIgnoreNewlines(self):
         # clean up the files
         os.remove(output_file)
 
+    def testPlaintextTokenizerIgnoreEverything(self):
+        self.maxDiff = None
+
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_everything.json"
+
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines < {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
 
 if __name__ == '__main__':
     unittest.main()
From 0efa16e56de35270306c960dd266c71235f69c36 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Sat, 3 Jul 2021 13:22:13 -0400
Subject: [PATCH 38/52] Add remaining plaintext tokenizer tests

---
 tests/tests.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tests/tests.py b/tests/tests.py
index 5973224..4c155a1 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -28,6 +28,48 @@ def testPlaintextTokenizer(self):
         # clean up the files
         os.remove(output_file)
 
+    def testPlaintextTokenizerIgnorePunctuation(self):
+        self.maxDiff = None
+
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json"
+
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation < {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
+    def testPlaintextTokenizerToLower(self):
+        self.maxDiff = None
+
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json"
+
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation < {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
     def testPlaintextTokenizerIgnoreNewlines(self):
         self.maxDiff = None
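With all five plaintext variants in place, the five near-identical methods differ only in the flags passed and the expected-output filename. A hypothetical table-driven refactor using unittest's subTest (the filenames and flags below are exactly the ones exercised in this series; note the ToLower test above still passes --ignore_punctuation, a copy-paste slip a later patch corrects):

    import os
    import unittest


    class TestPlaintextTokenizerVariants(unittest.TestCase):
        # flags per expected-output file, mirroring the five tests above
        CASES = {
            "output.json": "",
            "output_ignore_punctuation.json": "--ignore_punctuation",
            "output_to_lower.json": "--to_lower",
            "output_ignore_newlines.json": "--ignore_newlines",
            "output_ignore_everything.json":
                "--ignore_punctuation --to_lower --ignore_numbers --ignore_newlines",
        }

        def testVariants(self):
            self.maxDiff = None
            input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
            output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
            for expected_name, flags in self.CASES.items():
                with self.subTest(flags=flags):
                    expected = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/{expected_name}"
                    os.system(f"{lichen_installation_dir}/bin/plaintext_tokenizer.out {flags} < {input_file} > {output_file}")
                    with open(output_file) as f_actual, open(expected) as f_expected:
                        self.assertEqual(f_actual.read(), f_expected.read())

(`lichen_repository_dir`, `lichen_installation_dir`, and `lichen_data_dir` are the module-level constants tests.py already defines.)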
From 24d97bd46968f97f401d9499210435e0ec654e48 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 09:47:47 -0400
Subject: [PATCH 39/52] Add C tokenizer tests

---
 .github/workflows/lichen_run.yml            |  1 +
 .../tokenizer}/c/expected_output/output.json |  0
 .../data/tokenizer}/c/input.cpp              |  0
 tests/setup.sh                               |  7 ++--
 tests/tests.py                               | 33 ++++++++++++++++---
 5 files changed, 31 insertions(+), 10 deletions(-)
 rename {tokenizer => tests/data/tokenizer}/c/expected_output/output.json (100%)
 rename {tokenizer => tests/data/tokenizer}/c/input.cpp (100%)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index 2b322e6..d0fc560 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -13,6 +13,7 @@ jobs:
     - name: Install Dependencies
       run: |
         sudo apt install libboost-all-dev
+        sudo apt-get install python-clang-3.8
     - name: Create Directory Structure
       run: |
         sudo bash ./tests/setup.sh
diff --git a/tokenizer/c/expected_output/output.json b/tests/data/tokenizer/c/expected_output/output.json
similarity index 100%
rename from tokenizer/c/expected_output/output.json
rename to tests/data/tokenizer/c/expected_output/output.json
diff --git a/tokenizer/c/input.cpp b/tests/data/tokenizer/c/input.cpp
similarity index 100%
rename from tokenizer/c/input.cpp
rename to tests/data/tokenizer/c/input.cpp
diff --git a/tests/setup.sh b/tests/setup.sh
index f18c0e3..fe0be66 100644
--- a/tests/setup.sh
+++ b/tests/setup.sh
@@ -15,9 +15,6 @@ bash $lichen_repository_dir/install_lichen.sh
 # SETUP TOKENIZER TESTS ########################################################
 tokenizer_tests_course=$lichen_data_dir/f21/test_tokenizers/lichen/
 
-# make a simulated lichen path for the test_tokenizers course
-mkdir -p $tokenizer_tests_course
-
-# set up file structure for plaintext tokenizer tests
+# set up file structure for tokenizer tests
 # (doesn't need a full file structure, just a place to put files)
-mkdir -p $tokenizer_tests_course/plaintext_tokenizer_tests/
+mkdir -p $tokenizer_tests_course
diff --git a/tests/tests.py b/tests/tests.py
index 4c155a1..842d448 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -11,7 +11,7 @@ def testPlaintextTokenizer(self):
         self.maxDiff = None
 
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output.json"
 
@@ -32,7 +32,7 @@ def testPlaintextTokenizerIgnorePunctuation(self):
         self.maxDiff = None
 
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json"
 
@@ -53,7 +53,7 @@ def testPlaintextTokenizerToLower(self):
         self.maxDiff = None
 
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json"
 
@@ -74,7 +74,7 @@ def testPlaintextTokenizerIgnoreNewlines(self):
         self.maxDiff = None
 
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"
 
@@ -95,7 +95,7 @@ def testPlaintextTokenizerIgnoreEverything(self):
         self.maxDiff = None
 
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/plaintext_tokenizer_tests/output.json"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_everything.json"
 
@@ -113,5 +113,28 @@ def testPlaintextTokenizerIgnoreEverything(self):
         os.remove(output_file)
 
 
+class TestCTokenizer(unittest.TestCase):
+    def testCTokenizer(self):
+        self.maxDiff = None
+
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/input.txt"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/expected_output/output.json"
+
+        command = f"python3 {lichen_installation_dir}/bin/c_tokenizer.py {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
+
 if __name__ == '__main__':
     unittest.main()
From a50ef676e2e15861fbb14e216545688d8a86c444 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 09:54:15 -0400
Subject: [PATCH 40/52] Update lichen_run.yml

---
 .github/workflows/lichen_run.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index d0fc560..aba242d 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -13,7 +13,7 @@ jobs:
     - name: Install Dependencies
       run: |
         sudo apt install libboost-all-dev
-        sudo apt-get install python-clang-3.8
+        sudo apt install python-clang-3.8
     - name: Create Directory Structure
       run: |
         sudo bash ./tests/setup.sh

From ed6415394df3bfe9c2453858b3ba4fbba1de85fc Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 09:55:10 -0400
Subject: [PATCH 41/52] Update lichen_run.yml

---
 .github/workflows/lichen_run.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index aba242d..2b322e6 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -13,7 +13,6 @@ jobs:
     - name: Install Dependencies
       run: |
         sudo apt install libboost-all-dev
-        sudo apt install python-clang-3.8
     - name: Create Directory Structure
       run: |
         sudo bash ./tests/setup.sh

From 7300b4ec949cd93f5cd8010c439831cf6e752479 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 09:58:04 -0400
Subject: [PATCH 42/52] Update lichen_run.yml

---
 .github/workflows/lichen_run.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index 2b322e6..9c5296a 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -13,6 +13,7 @@ jobs:
     - name: Install Dependencies
       run: |
         sudo apt install libboost-all-dev
+        pip install clang
     - name: Create Directory Structure
       run: |
         sudo bash ./tests/setup.sh
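The back-and-forth above is about getting libclang's Python bindings onto the runner; the C tokenizer presumably imports them. A minimal smoke test of the pip-installed `clang` package (this assumes a system libclang the bindings can locate, and is not taken from the series):

    import clang.cindex

    index = clang.cindex.Index.create()
    tu = index.parse("input.cpp")  # any small C/C++ file
    # walk the raw token stream, roughly what a tokenizer would consume
    for token in tu.get_tokens(extent=tu.cursor.extent):
        print(token.kind, token.spelling)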
From 2066fa3f05b9633dc185f88077f82eaf4ea1cba0 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 10:05:44 -0400
Subject: [PATCH 43/52] Add MIPS tokenizer

---
 .github/workflows/lichen_run.yml             |  3 +--
 .../mips/example_output/output.json          |  0
 .../data/tokenizer}/mips/input.s             |  0
 tests/tests.py                               | 25 ++++++++++++++++++-
 4 files changed, 25 insertions(+), 3 deletions(-)
 rename {tokenizer => tests/data/tokenizer}/mips/example_output/output.json (100%)
 rename {tokenizer => tests/data/tokenizer}/mips/input.s (100%)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index 9c5296a..20efcb0 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -4,7 +4,7 @@ on: [push]
 
 jobs:
   test-lichen:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@v2
     - uses: actions/setup-python@v2
@@ -13,7 +13,6 @@ jobs:
     - name: Install Dependencies
       run: |
         sudo apt install libboost-all-dev
-        pip install clang
     - name: Create Directory Structure
       run: |
         sudo bash ./tests/setup.sh
diff --git a/tokenizer/mips/example_output/output.json b/tests/data/tokenizer/mips/example_output/output.json
similarity index 100%
rename from tokenizer/mips/example_output/output.json
rename to tests/data/tokenizer/mips/example_output/output.json
diff --git a/tokenizer/mips/input.s b/tests/data/tokenizer/mips/input.s
similarity index 100%
rename from tokenizer/mips/input.s
rename to tests/data/tokenizer/mips/input.s
diff --git a/tests/tests.py b/tests/tests.py
index 842d448..3e8cd74 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -117,7 +117,7 @@ class TestCTokenizer(unittest.TestCase):
     def testCTokenizer(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/input.txt"
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/input.cpp"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/expected_output/output.json"
 
@@ -136,5 +136,28 @@ def testCTokenizer(self):
         os.remove(output_file)
 
 
+class TestMIPSTokenizer(unittest.TestCase):
+    def testMIPSTokenizer(self):
+        self.maxDiff = None
+
+        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/input.s"
+        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
+        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/expected_output/output.json"
+
+        command = f"python3 {lichen_installation_dir}/bin/mips_tokenizer.py {input_file} > {output_file}"
+        os.system(command)
+
+        with open(output_file) as file:
+            actual_output = file.read()
+
+        with open(expected_output_file) as file:
+            expected_output = file.read()
+
+        self.assertEqual(actual_output, expected_output)
+
+        # clean up the files
+        os.remove(output_file)
+
+
 if __name__ == '__main__':
     unittest.main()

From d26b144f4ce5596e0fe05ca5489c61ea001fd060 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 10:09:20 -0400
Subject: [PATCH 44/52] Update tests.py

---
 tests/tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/tests.py b/tests/tests.py
index 3e8cd74..c0c781a 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -140,6 +140,7 @@ class TestMIPSTokenizer(unittest.TestCase):
     def testMIPSTokenizer(self):
         self.maxDiff = None
 
+
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/input.s"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/expected_output/output.json"

From bb545d8d414e8b54a98cd45878881133a0b5a083 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 5 Jul 2021 10:23:09 -0400
Subject: [PATCH 45/52] Update tests.py

---
 tests/tests.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/tests.py b/tests/tests.py
index c0c781a..3e8cd74 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -140,7 +140,6 @@ class TestMIPSTokenizer(unittest.TestCase):
     def testMIPSTokenizer(self):
         self.maxDiff = None
 
-
         input_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/input.s"
         output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
         expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/expected_output/output.json"
From 4abfaf18714023f7aac222e54704a2cb9453cf1b Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 19 Jul 2021 10:32:02 -0400
Subject: [PATCH 46/52] Fix paths in tests.py such that it can be run in
 vagrant

---
 .gitignore                                         |  1 +
 .../{example_output => expected_output}/output.json |  0
 .../plaintext/expected_output/output.json          |  6 +-
 .../output_ignore_newlines.json                    |  6 +-
 .../output_ignore_punctuation.json                 |  6 +-
 .../expected_output/output_to_lower.json           |  6 +-
 tests/tests.py                                     | 77 ++++++++-----------
 7 files changed, 47 insertions(+), 55 deletions(-)
 rename tests/data/tokenizer/mips/{example_output => expected_output}/output.json (100%)

diff --git a/.gitignore b/.gitignore
index 7080991..bc4bed5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *~
 tools/assignments/*
+tests/__pycache__
diff --git a/tests/data/tokenizer/mips/example_output/output.json b/tests/data/tokenizer/mips/expected_output/output.json
similarity index 100%
rename from tests/data/tokenizer/mips/example_output/output.json
rename to tests/data/tokenizer/mips/expected_output/output.json
diff --git a/tests/data/tokenizer/plaintext/expected_output/output.json b/tests/data/tokenizer/plaintext/expected_output/output.json
index 0a04cad..277632f 100644
--- a/tests/data/tokenizer/plaintext/expected_output/output.json
+++ b/tests/data/tokenizer/plaintext/expected_output/output.json
@@ -51,7 +51,7 @@
     "char": 20,
     "line": 3,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 22,
@@ -159,7 +159,7 @@
     "char": 26,
     "line": 4,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 27,
@@ -171,7 +171,7 @@
     "char": 28,
     "line": 4,
     "type": "number",
-    "value": "2"
+    "value": 2
   },
   {
     "char": 29,
diff --git a/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json b/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json
index 35f4422..68e4a4a 100644
--- a/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json
+++ b/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json
@@ -39,7 +39,7 @@
     "char": 20,
     "line": 3,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 22,
@@ -141,7 +141,7 @@
     "char": 26,
     "line": 4,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 27,
@@ -153,7 +153,7 @@
     "char": 28,
     "line": 4,
     "type": "number",
-    "value": "2"
+    "value": 2
   },
   {
     "char": 29,
diff --git a/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json b/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
index 341d794..0d218cb 100644
--- a/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
+++ b/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
@@ -51,7 +51,7 @@
     "char": 20,
     "line": 3,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 23,
@@ -123,7 +123,7 @@
     "char": 26,
     "line": 4,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 27,
@@ -135,7 +135,7 @@
     "char": 28,
     "line": 4,
     "type": "number",
-    "value": "2"
+    "value": 2
   },
   {
     "char": 29,
diff --git a/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json b/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json
index 14b0da1..2c7721e 100644
--- a/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json
+++ b/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json
@@ -51,7 +51,7 @@
     "char": 20,
     "line": 3,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 22,
@@ -159,7 +159,7 @@
     "char": 26,
     "line": 4,
     "type": "number",
-    "value": "1"
+    "value": 1
   },
   {
     "char": 27,
@@ -171,7 +171,7 @@
     "char": 28,
     "line": 4,
     "type": "number",
-    "value": "2"
+    "value": 2
   },
   {
     "char": 29,
diff --git a/tests/tests.py b/tests/tests.py
index 3e8cd74..8034ef8 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,18 +1,25 @@
 import unittest
 import os
+import shutil
 
-lichen_repository_dir = "/usr/local/submitty/GIT_CHECKOUT/Lichen"
 lichen_installation_dir = "/usr/local/submitty/Lichen"
-lichen_data_dir = "/var/local/submitty/courses"
+lichen_test_playground = "/usr/local/submitty/Lichen/test_output"
 
 
 class TestPlaintextTokenizer(unittest.TestCase):
+    def setUp(self):
+        if not os.path.isdir(os.path.join(lichen_test_playground, 'plaintext_tokenizer')):
+            os.makedirs(os.path.join(lichen_test_playground, 'plaintext_tokenizer'))
+
+    def tearDown(self):
+        shutil.rmtree(os.path.join(lichen_test_playground, 'plaintext_tokenizer'))
+
     def testPlaintextTokenizer(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output.json"
+        input_file = "./data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_test_playground}/plaintext_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/plaintext/expected_output/output.json"
 
         command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out < {input_file} > {output_file}"
         os.system(command)
@@ -31,9 +38,9 @@ def testPlaintextTokenizer(self):
     def testPlaintextTokenizerIgnorePunctuation(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json"
+        input_file = "./data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_test_playground}/plaintext_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/plaintext/expected_output/output_ignore_punctuation.json"
 
         command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation < {input_file} > {output_file}"
         os.system(command)
@@ -52,11 +59,11 @@ def testPlaintextTokenizerIgnorePunctuation(self):
     def testPlaintextTokenizerToLower(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_to_lower.json"
+        input_file = "./data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_test_playground}/plaintext_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/plaintext/expected_output/output_to_lower.json"
 
-        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation < {input_file} > {output_file}"
+        command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --to_lower < {input_file} > {output_file}"
         os.system(command)
 
         with open(output_file) as file:
@@ -73,9 +80,9 @@ def testPlaintextTokenizerToLower(self):
     def testPlaintextTokenizerIgnoreNewlines(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"
+        input_file = "./data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_test_playground}/plaintext_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/plaintext/expected_output/output_ignore_newlines.json"
 
         command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_newlines < {input_file} > {output_file}"
         os.system(command)
@@ -94,9 +101,9 @@ def testPlaintextTokenizerIgnoreNewlines(self):
     def testPlaintextTokenizerIgnoreEverything(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/input.txt"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/plaintext/expected_output/output_ignore_everything.json"
+        input_file = "./data/tokenizer/plaintext/input.txt"
+        output_file = f"{lichen_test_playground}/plaintext_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/plaintext/expected_output/output_ignore_everything.json"
 
         command = f"{lichen_installation_dir}/bin/plaintext_tokenizer.out --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines < {input_file} > {output_file}"
         os.system(command)
@@ -113,36 +120,20 @@ def testPlaintextTokenizerIgnoreEverything(self):
         os.remove(output_file)
 
 
-class TestCTokenizer(unittest.TestCase):
-    def testCTokenizer(self):
-        self.maxDiff = None
-
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/input.cpp"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/c/expected_output/output.json"
-
-        command = f"python3 {lichen_installation_dir}/bin/c_tokenizer.py {input_file} > {output_file}"
-        os.system(command)
-
-        with open(output_file) as file:
-            actual_output = file.read()
-
-        with open(expected_output_file) as file:
-            expected_output = file.read()
-
-        self.assertEqual(actual_output, expected_output)
-
-        # clean up the files
-        os.remove(output_file)
+class TestMIPSTokenizer(unittest.TestCase):
+    def setUp(self):
+        if not os.path.isdir(os.path.join(lichen_test_playground, 'mips_tokenizer')):
+            os.makedirs(os.path.join(lichen_test_playground, 'mips_tokenizer'))
 
+    def tearDown(self):
+        shutil.rmtree(os.path.join(lichen_test_playground, 'mips_tokenizer'))
 
-class TestMIPSTokenizer(unittest.TestCase):
     def testMIPSTokenizer(self):
         self.maxDiff = None
 
-        input_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/input.s"
-        output_file = f"{lichen_data_dir}/f21/test_tokenizers/lichen/output.json"
-        expected_output_file = f"{lichen_repository_dir}/tests/data/tokenizer/mips/expected_output/output.json"
+        input_file = "./data/tokenizer/mips/input.s"
+        output_file = f"{lichen_test_playground}/mips_tokenizer/output.json"
+        expected_output_file = "./data/tokenizer/mips/expected_output/output.json"
 
         command = f"python3 {lichen_installation_dir}/bin/mips_tokenizer.py {input_file} > {output_file}"
         os.system(command)

From be7dd8b805518a7925c7143f4d84e3b4475abf48 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 19 Jul 2021 10:38:44 -0400
Subject: [PATCH 47/52] Fix github actions

---
 .github/workflows/lichen_run.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/lichen_run.yml b/.github/workflows/lichen_run.yml
index 20efcb0..c6b9da1 100644
--- a/.github/workflows/lichen_run.yml
+++ b/.github/workflows/lichen_run.yml
@@ -18,4 +18,5 @@ jobs:
         sudo bash ./tests/setup.sh
     - name: Run Tests
       run: |
-        sudo python3 ./tests/tests.py
+        cd /usr/local/submitty/GIT_CHECKOUT/Lichen/tests
+        sudo python3 -m unittest discover
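`python3 -m unittest discover` only finds the tests because it is run from the tests directory and tests.py matches the default test*.py pattern; the relative ./data/... paths introduced in the previous patch depend on that working directory too. A sketch of the equivalent programmatic invocation, for reference:

    import unittest

    # discover() defaults to pattern="test*.py", which matches tests.py;
    # start_dir must be the tests directory for the relative ./data paths to resolve
    suite = unittest.defaultTestLoader.discover(start_dir=".", pattern="test*.py")
    unittest.TextTestRunner(verbosity=2).run(suite)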
From 0ef49f80a33c75353a20942fd7fe2261c383f365 Mon Sep 17 00:00:00 2001
From: sbelsk
Date: Mon, 19 Jul 2021 11:01:29 -0400
Subject: [PATCH 48/52] Add hash all test

---
 tests/data/hash_all/config.json    |   4 +
 tests/data/hash_all/submission.txt |   4 +
 tests/data/hash_all/tokens.json    | 158 +++++++++++++++++++++++++++++
 tests/tests.py                     |  58 +++++++++++
 4 files changed, 224 insertions(+)
 create mode 100644 tests/data/hash_all/config.json
 create mode 100644 tests/data/hash_all/submission.txt
 create mode 100644 tests/data/hash_all/tokens.json

diff --git a/tests/data/hash_all/config.json b/tests/data/hash_all/config.json
new file mode 100644
index 0000000..1ecc52c
--- /dev/null
+++ b/tests/data/hash_all/config.json
@@ -0,0 +1,4 @@
+{
+    "language": "plaintext",
+    "sequence_length": 2
+}
diff --git a/tests/data/hash_all/submission.txt b/tests/data/hash_all/submission.txt
new file mode 100644
index 0000000..2100e54
--- /dev/null
+++ b/tests/data/hash_all/submission.txt
@@ -0,0 +1,4 @@
+int x = 8;
+int y = 3;
+int z = x + y;
+int t = 2 * x + y;
diff --git a/tests/data/hash_all/tokens.json b/tests/data/hash_all/tokens.json
new file mode 100644
index 0000000..98b2040
--- /dev/null
+++ b/tests/data/hash_all/tokens.json
@@ -0,0 +1,158 @@
+[
+  {
+    "char": 1,
+    "line": 1,
+    "type": "string",
+    "value": "int"
+  },
+  {
+    "char": 5,
+    "line": 1,
+    "type": "string",
+    "value": "x"
+  },
+  {
+    "char": 7,
+    "line": 1,
+    "type": "punctuation",
+    "value": "="
+  },
+  {
+    "char": 9,
+    "line": 1,
+    "type": "number",
+    "value": 8
+  },
+  {
+    "char": 10,
+    "line": 1,
+    "type": "punctuation",
+    "value": ";"
+  },
+  {
+    "char": 1,
+    "line": 2,
+    "type": "string",
+    "value": "int"
+  },
+  {
+    "char": 5,
+    "line": 2,
+    "type": "string",
+    "value": "y"
+  },
+  {
+    "char": 7,
+    "line": 2,
+    "type": "punctuation",
+    "value": "="
+  },
+  {
+    "char": 9,
+    "line": 2,
+    "type": "number",
+    "value": 3
+  },
+  {
+    "char": 10,
+    "line": 2,
+    "type": "punctuation",
+    "value": ";"
+  },
+  {
+    "char": 1,
+    "line": 3,
+    "type": "string",
+    "value": "int"
+  },
+  {
+    "char": 5,
+    "line": 3,
+    "type": "string",
+    "value": "z"
+  },
+  {
+    "char": 7,
+    "line": 3,
+    "type": "punctuation",
+    "value": "="
+  },
+  {
+    "char": 9,
+    "line": 3,
+    "type": "string",
+    "value": "x"
+  },
+  {
+    "char": 11,
+    "line": 3,
+    "type": "punctuation",
+    "value": "+"
+  },
+  {
+    "char": 13,
+    "line": 3,
+    "type": "string",
+    "value": "y"
+  },
+  {
+    "char": 14,
+    "line": 3,
+    "type": "punctuation",
+    "value": ";"
+  },
+  {
+    "char": 1,
+    "line": 4,
+    "type": "string",
+    "value": "int"
+  },
+  {
+    "char": 5,
+    "line": 4,
+    "type": "string",
+    "value": "t"
+  },
+  {
+    "char": 7,
+    "line": 4,
+    "type": "punctuation",
+    "value": "="
+  },
+  {
+    "char": 9,
+    "line": 4,
+    "type": "number",
+    "value": 2
+  },
+  {
+    "char": 11,
+    "line": 4,
+    "type": "punctuation",
+    "value": "*"
+  },
+  {
+    "char": 13,
+    "line": 4,
+    "type": "string",
+    "value": "x"
+  },
+  {
+    "char": 15,
+    "line": 4,
+    "type": "punctuation",
+    "value": "+"
+  },
+  {
+    "char": 17,
+    "line": 4,
+    "type": "string",
+    "value": "y"
+  },
+  {
+    "char": 18,
+    "line": 4,
+    "type": "punctuation",
+    "value": ";"
+  }
+]
diff --git a/tests/tests.py b/tests/tests.py
index 3e8cd74..6678ed7 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,5 +1,8 @@
 import unittest
 import os
+import shutil
+import subprocess
+import json
 
 lichen_repository_dir = "/usr/local/submitty/GIT_CHECKOUT/Lichen"
 lichen_installation_dir = "/usr/local/submitty/Lichen"
@@ -159,5 +162,60 @@ def testMIPSTokenizer(self):
         os.remove(output_file)
 
 
+class TestHashAll(unittest.TestCase):
+    def setUp(self):
+        os.makedirs("/usr/local/submitty/Lichen/test_output")
+
+    def tearDown(self):
+        shutil.rmtree("/usr/local/submitty/Lichen/test_output")
+
+    def testHashAll(self):
+        # make the fake directory structure hash_all.p expects
+        os.makedirs("/usr/local/submitty/Lichen/test_output/test_hash_all/provided_code")
+        os.makedirs("/usr/local/submitty/Lichen/test_output/test_hash_all/other_gradeables")
+        os.makedirs("/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1")
+        open("/usr/local/submitty/Lichen/test_output/test_hash_all/config.json", 'a').close()
+        open("/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/tokens.json", 'a').close()
+        with open("/usr/local/submitty/Lichen/test_output/test_hash_all/provided_code/tokens.json", 'w') as file:
+            file.write("null")
+
+        # copy the input files from /data to the the new path
+        shutil.copyfile("data/hash_all/a/config.json", "/usr/local/submitty/Lichen/test_output/test_hash_all/config.json")
+        shutil.copyfile("data/hash_all/a/tokens.json", "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/tokens.json")
+
+        # save current working directory
+        cwd = os.getcwd()
+
+        # run hash_all
+        os.chdir("/usr/local/submitty/Lichen/bin")
+        os.system("python3 /usr/local/submitty/Lichen/bin/hash_all.py /usr/local/submitty/Lichen/test_output/test_hash_all")
+        os.chdir(cwd)
+
+        # test output
+        hashes_file = "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/hashes.txt"
+        with open(hashes_file, 'r') as file:
+            lines = file.readlines()
+
+        lines = [x.strip() for x in lines]
+
+        tokens_file = "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/tokens.json"
+        with open(tokens_file, 'r') as file:
+            tokens = json.load(file)
+        self.assertEqual(len(lines), len(tokens) - 2 + 1)
+        # make sure the same sequences hash to the same string, and
+        # that different sequences hash to different strings
+        for i in range(0, len(lines)):
+            for j in range(i + 1, len(lines)):
+                if i == 4 and j == 9\
+                        or i == 4 and j == 16\
+                        or i == 9 and j == 16\
+                        or i == 13 and j == 22\
+                        or i == 14 and j == 23\
+                        or i == 15 and j == 24:
+                    self.assertEqual(lines[i], lines[j])
+                else:
+                    self.assertNotEqual(lines[i], lines[j])
+
+
 if __name__ == '__main__':
     unittest.main()
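The count assertion above encodes the sliding-window relationship: with sequence_length w (2 in config.json), n tokens yield n - w + 1 overlapping sequences, hence len(tokens) - 2 + 1 hashes. hash_all.py's actual hashing scheme is not shown in these patches; the sketch below uses md5 purely as a stand-in to illustrate that relationship, and why equal token windows, like the repeated `= <number> ;` and `x + y ;` runs in submission.txt, must hash identically (the index pairs the test whitelists):

    import hashlib
    import json

    def hash_sequences(tokens_path, window=2):
        with open(tokens_path) as file:
            tokens = json.load(file)
        values = [str(t["value"]) for t in tokens]
        # one hash per overlapping window of `window` consecutive token values
        return [hashlib.md5("\n".join(values[i:i + window]).encode()).hexdigest()
                for i in range(len(values) - window + 1)]

    hashes = hash_sequences("data/hash_all/tokens.json")
    assert len(hashes) == 26 - 2 + 1  # the sample tokens.json holds 26 tokens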
- with open("/usr/local/submitty/Lichen/test_output/test_hash_all/provided_code/tokens.json", 'w') as file: + os.makedirs(f"{lichen_test_playground}/test_hash_all/provided_code") + os.makedirs(f"{lichen_test_playground}/test_hash_all/other_gradeables") + os.makedirs(f"{lichen_test_playground}/test_hash_all/users/student/1") + open(f"{lichen_test_playground}/test_hash_all/config.json", 'a').close() + open(f"{lichen_test_playground}/test_hash_all/users/student/1/tokens.json", 'a').close() + with open(f"{lichen_test_playground}/test_hash_all/provided_code/tokens.json", 'w') as file: file.write("null") # copy the input files from /data to the the new path - shutil.copyfile("data/hash_all/a/config.json", "/usr/local/submitty/Lichen/test_output/test_hash_all/config.json") - shutil.copyfile("data/hash_all/a/tokens.json", "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/tokens.json") + shutil.copyfile("data/hash_all/config.json", f"{lichen_test_playground}/test_hash_all/config.json") + shutil.copyfile("data/hash_all/tokens.json", f"{lichen_test_playground}/test_hash_all/users/student/1/tokens.json") # save current working directory cwd = os.getcwd() # run hash_all - os.chdir("/usr/local/submitty/Lichen/bin") - os.system("python3 /usr/local/submitty/Lichen/bin/hash_all.py /usr/local/submitty/Lichen/test_output/test_hash_all") + os.chdir(f"{lichen_installation_dir}/bin") + # TODO: make this not print to stdout + os.system(f"python3 {lichen_installation_dir}/bin/hash_all.py {lichen_test_playground}/test_hash_all") os.chdir(cwd) # test output - hashes_file = "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/hashes.txt" + hashes_file = f"{lichen_test_playground}/test_hash_all/users/student/1/hashes.txt" with open(hashes_file, 'r') as file: lines = file.readlines() - lines = [x.strip() for x in lines] - - tokens_file = "/usr/local/submitty/Lichen/test_output/test_hash_all/users/student/1/tokens.json" + tokens_file = f"{lichen_test_playground}/test_hash_all/users/student/1/tokens.json" with open(tokens_file, 'r') as file: tokens = json.load(file) + + # make sure the number of sequences and the number of hashes are the same self.assertEqual(len(lines), len(tokens) - 2 + 1) + # make sure the same sequences hash to the same string, and # that different sequences hash to different strings for i in range(0, len(lines)): From 154e2a26e9e0d5151214f7c588da57ba26dab37b Mon Sep 17 00:00:00 2001 From: sbelsk Date: Mon, 19 Jul 2021 13:00:06 -0400 Subject: [PATCH 50/52] Get rid of unwanted stdout --- tests/tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index f4d11c4..1c71032 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,6 @@ import unittest import os import shutil -import subprocess import json lichen_installation_dir = "/usr/local/submitty/Lichen" @@ -179,8 +178,7 @@ def testHashAll(self): # run hash_all os.chdir(f"{lichen_installation_dir}/bin") - # TODO: make this not print to stdout - os.system(f"python3 {lichen_installation_dir}/bin/hash_all.py {lichen_test_playground}/test_hash_all") + os.system(f"python3 {lichen_installation_dir}/bin/hash_all.py {lichen_test_playground}/test_hash_all > /dev/null") os.chdir(cwd) # test output From 6065d736b3ea5d1e7c8c5f349b1d02d9b8a9d771 Mon Sep 17 00:00:00 2001 From: williamjallen Date: Mon, 19 Jul 2021 15:56:20 -0400 Subject: [PATCH 51/52] Remove old code --- tests/tests.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/tests.py 
From 6065d736b3ea5d1e7c8c5f349b1d02d9b8a9d771 Mon Sep 17 00:00:00 2001
From: williamjallen
Date: Mon, 19 Jul 2021 15:56:20 -0400
Subject: [PATCH 51/52] Remove old code

---
 tests/tests.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index 8034ef8..13cff79 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -32,9 +32,6 @@ def testPlaintextTokenizer(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
     def testPlaintextTokenizerIgnorePunctuation(self):
         self.maxDiff = None
 
@@ -53,9 +50,6 @@ def testPlaintextTokenizerIgnorePunctuation(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
     def testPlaintextTokenizerToLower(self):
         self.maxDiff = None
 
@@ -74,9 +68,6 @@ def testPlaintextTokenizerToLower(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
     def testPlaintextTokenizerIgnoreNewlines(self):
         self.maxDiff = None
 
@@ -95,9 +86,6 @@ def testPlaintextTokenizerIgnoreNewlines(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
     def testPlaintextTokenizerIgnoreEverything(self):
         self.maxDiff = None
 
@@ -116,9 +104,6 @@ def testPlaintextTokenizerIgnoreEverything(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
 
 class TestMIPSTokenizer(unittest.TestCase):
     def setUp(self):
@@ -146,9 +131,6 @@ def testMIPSTokenizer(self):
 
         self.assertEqual(actual_output, expected_output)
 
-        # clean up the files
-        os.remove(output_file)
-
 
 if __name__ == '__main__':
     unittest.main()

From a6719b538a794f5d29b5ab7f70f8ecbddbe944ae Mon Sep 17 00:00:00 2001
From: William Allen
Date: Mon, 19 Jul 2021 16:29:14 -0400
Subject: [PATCH 52/52] comment with missing letter was bugging me

---
 tests/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests.py b/tests/tests.py
index 0bd59c1..9257d05 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -142,7 +142,7 @@ def tearDown(self):
         shutil.rmtree(lichen_test_playground)
 
     def testHashAll(self):
-        # make the fake directory structure hash_all.p expects
+        # make the fake directory structure hash_all.py expects
         os.makedirs(f"{lichen_test_playground}/test_hash_all/provided_code")
         os.makedirs(f"{lichen_test_playground}/test_hash_all/other_gradeables")
         os.makedirs(f"{lichen_test_playground}/test_hash_all/users/student/1")