9 changes: 6 additions & 3 deletions bin/concatenate_all.py
@@ -16,7 +16,9 @@
IGNORED_FILES = [
".submit.timestamp"
]
MAX_CONCAT_SIZE = 1e9

with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
LICHEN_CONFIG = json.load(lichen_config_file)


# returns a string containing the contents of the files which match the regex in the specified dir
@@ -45,8 +47,9 @@ def getConcatFilesInDir(input_dir, regex_patterns):


def checkTotalSize(total_concat):
if total_concat > MAX_CONCAT_SIZE:
raise SystemExit(f"ERROR! exceeded {humanize.naturalsize(MAX_CONCAT_SIZE)}"
if total_concat > LICHEN_CONFIG['concat_max_total_bytes']:
raise SystemExit("ERROR! exceeded"
f"{humanize.naturalsize(LICHEN_CONFIG['concat_max_total_bytes'])}"
" of concatenated files allowed")


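For context, a minimal sketch of the reworked guard in bin/concatenate_all.py, assuming lichen_config.json sits next to the script as the diff sets up (the module-level load and function name mirror the hunk above):

```python
# Sketch: config-driven concatenation cap, assuming lichen_config.json lives
# alongside the script (as the diff above arranges).
import json
from pathlib import Path

import humanize  # third-party package already used by concatenate_all.py

with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
    LICHEN_CONFIG = json.load(lichen_config_file)


def checkTotalSize(total_concat):
    # abort the run once the concatenated output exceeds the configured cap
    if total_concat > LICHEN_CONFIG['concat_max_total_bytes']:
        raise SystemExit("ERROR! exceeded "
                         f"{humanize.naturalsize(LICHEN_CONFIG['concat_max_total_bytes'])}"
                         " of concatenated files allowed")
```

With the shipped value of 1000000000 bytes, humanize.naturalsize renders the limit as "1.0 GB", the same cap the old hard-coded MAX_CONCAT_SIZE = 1e9 expressed. Note the trailing space after "exceeded", which Python's adjacent string literals otherwise swallow.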
50 changes: 29 additions & 21 deletions bin/hash_all.py
@@ -10,6 +10,7 @@
import json
import time
import hashlib
from pathlib import Path


def parse_args():
@@ -18,9 +19,9 @@ def parse_args():
return parser.parse_args()


def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
language = lichen_config_data["language"]
sequence_length = int(lichen_config_data["sequence_length"])
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
language = lichen_run_config["language"]
sequence_length = int(lichen_run_config["sequence_length"])

data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
with open(data_json_path) as token_data_file:
@@ -39,69 +40,76 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
token_values[x:x+sequence_length]).encode())
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]

if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
token_hashed_values = token_hashed_values[slice(0, lichen_config["max_sequences_per_file"])] # noqa E501
print(f"File {my_hashes_file} truncated after exceeding max sequence limit")

my_hf.write('\n'.join(token_hashed_values))


def main():
start_time = time.time()
args = parse_args()

with open(os.path.join(args.basepath, "config.json")) as lichen_config:
lichen_config_data = json.load(lichen_config)
with open(Path(args.basepath, "config.json")) as lichen_run_config_file:
lichen_run_config = json.load(lichen_run_config_file)

with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
lichen_config = json.load(lichen_config_file)

print("HASH ALL...", end="")

# ==========================================================================
# walk the subdirectories of this gradeable
users_dir = os.path.join(args.basepath, "users")
users_dir = Path(args.basepath, "users")
if not os.path.isdir(users_dir):
raise SystemExit("ERROR! Unable to find users directory")

for user in sorted(os.listdir(users_dir)):
user_dir = os.path.join(users_dir, user)
user_dir = Path(users_dir, user)
if not os.path.isdir(user_dir):
continue

for version in sorted(os.listdir(user_dir)):
my_dir = os.path.join(user_dir, version)
my_dir = Path(user_dir, version)
if not os.path.isdir(my_dir):
continue

my_tokenized_file = os.path.join(my_dir, "tokens.json")
my_hashes_file = os.path.join(my_dir, "hashes.txt")
hasher(lichen_config_data, my_tokenized_file, my_hashes_file)
my_tokenized_file = Path(my_dir, "tokens.json")
my_hashes_file = Path(my_dir, "hashes.txt")
hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file)

# ==========================================================================
# walk the subdirectories of the other gradeables

other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
other_gradeables_dir = Path(args.basepath, "other_gradeables")
if not os.path.isdir(other_gradeables_dir):
raise SystemExit("ERROR! Unable to find other gradeables directory")

for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
other_gradeable_dir = os.path.join(other_gradeables_dir, other_gradeable)
other_gradeable_dir = Path(other_gradeables_dir, other_gradeable)
if not os.path.isdir(other_gradeable_dir):
continue

for other_user in sorted(os.listdir(other_gradeable_dir)):
other_user_dir = os.path.join(other_gradeable_dir, other_user)
other_user_dir = Path(other_gradeable_dir, other_user)
if not os.path.isdir(other_user_dir):
continue

for other_version in sorted(os.listdir(other_user_dir)):
other_version_dir = os.path.join(other_user_dir, other_version)
other_version_dir = Path(other_user_dir, other_version)
if not os.path.isdir(other_version_dir):
continue

other_tokenized_file = os.path.join(other_version_dir, "tokens.json")
other_hashes_file = os.path.join(other_version_dir, "hashes.txt")
hasher(lichen_config_data, other_tokenized_file, other_hashes_file)
other_tokenized_file = Path(other_version_dir, "tokens.json")
other_hashes_file = Path(other_version_dir, "hashes.txt")
hasher(lichen_config, lichen_run_config, other_tokenized_file, other_hashes_file)

# ==========================================================================
# hash the provided code
provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json")
provided_code_hashed = os.path.join(args.basepath, "provided_code", "hashes.txt")
hasher(lichen_config_data, provided_code_tokenized, provided_code_hashed)
provided_code_tokenized = Path(args.basepath, "provided_code", "tokens.json")
provided_code_hashed = Path(args.basepath, "provided_code", "hashes.txt")
hasher(lichen_config, lichen_run_config, provided_code_tokenized, provided_code_hashed)

# ==========================================================================
end_time = time.time()
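To make the new per-file cap concrete, here is a hedged sketch of the sliding-window hashing with the max_sequences_per_file truncation; the md5 call and string-typed token values are assumptions, since the line with the actual hashlib call is elided from the hunk above:

```python
import hashlib

def hash_token_sequences(token_values, sequence_length, max_sequences_per_file):
    # token_values assumed to be a list of strings; one 8-hex-char hash is
    # emitted per window of `sequence_length` consecutive tokens
    num = len(token_values)
    token_hashed_values = [
        hashlib.md5("".join(token_values[x:x + sequence_length]).encode())
        .hexdigest()[0:8]
        for x in range(0, num - sequence_length + 1)
    ]
    # the new lichen_config cap keeps one pathological submission from
    # producing an unbounded hashes.txt
    if len(token_hashed_values) > max_sequences_per_file:
        token_hashed_values = token_hashed_values[:max_sequences_per_file]
        print("File truncated after exceeding max sequence limit")
    return token_hashed_values
```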
5 changes: 5 additions & 0 deletions bin/lichen_config.json
@@ -0,0 +1,5 @@
{
"concat_max_total_bytes": 1000000000,
"max_sequences_per_file": 10000,
"max_matching_positions": 30
}
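All three limits now live in one shared file: the concatenation cap read by concatenate_all.py, the per-file sequence cap read by hash_all.py, and the per-match position cap read by compare_hashes.cpp. A small sanity-check sketch, with an illustrative positivity check that is not part of the PR:

```python
# Illustrative validation of bin/lichen_config.json; the key names come from
# the file above, the checks themselves are assumptions.
import json
from pathlib import Path

REQUIRED_KEYS = ("concat_max_total_bytes", "max_sequences_per_file",
                 "max_matching_positions")

with open(Path(__file__).resolve().parent / "lichen_config.json") as f:
    lichen_config = json.load(f)

for key in REQUIRED_KEYS:
    value = lichen_config.get(key)
    assert isinstance(value, int) and value > 0, f"bad or missing value for {key}"
```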
66 changes: 40 additions & 26 deletions bin/process_all.sh
@@ -7,57 +7,71 @@

# TODO: Assert permissions, as necessary

basepath=$1 # holds the path to a directory containing a config for this gradeable
BASEPATH=$1 # holds the path to a directory containing a config for this gradeable
# (probably .../lichen/gradeable/<unique number>/ on Submitty)

datapath=$2 # holds the path to a directory containing courses and their data
DATAPATH=$2 # holds the path to a directory containing courses and their data
# (probably /var/local/submitty/courses on Submitty)

KILL_ERROR_MESSAGE="
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* An error occurred while running Lichen. Your run was probably killed for *
* exceeding the configured resource limits. Before rerunning, perhaps try any *
* of the following edits to the configuration: *
* - Increasing the sequence length *
* - Using only the active version *
* - Decreasing the common code threshold *
* - Selecting fewer files to be compared *
* - Comparing against fewer other gradeables *
* - Uploading provided code files *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
";

# kill the script if there is no config file
if [ ! -f "${basepath}/config.json" ]; then
if [ ! -f "${BASEPATH}/config.json" ]; then
echo "Unable to find config.json in provided directory"
exit 1
fi


# delete any previous run results
# TODO: determine if any caching should occur
rm -rf "${basepath}/logs"
rm -rf "${basepath}/other_gradeables"
rm -rf "${basepath}/users"
rm -f "${basepath}/overall_ranking.txt"
rm -f "${basepath}/provided_code/submission.concatenated"
rm -f "${basepath}/provided_code/tokens.json"
rm -f "${basepath}/provided_code/hashes.txt"
rm -rf "${BASEPATH}/logs"
rm -rf "${BASEPATH}/other_gradeables"
rm -rf "${BASEPATH}/users"
rm -f "${BASEPATH}/overall_ranking.txt"
rm -f "${BASEPATH}/provided_code/submission.concatenated"
rm -f "${BASEPATH}/provided_code/tokens.json"
rm -f "${BASEPATH}/provided_code/hashes.txt"

# create these directories if they don't already exist
mkdir -p "${basepath}/logs"
mkdir -p "${basepath}/provided_code"
mkdir -p "${basepath}/provided_code/files"
mkdir -p "${basepath}/other_gradeables"
mkdir -p "${basepath}/users"
mkdir -p "${BASEPATH}/logs"
mkdir -p "${BASEPATH}/provided_code"
mkdir -p "${BASEPATH}/provided_code/files"
mkdir -p "${BASEPATH}/other_gradeables"
mkdir -p "${BASEPATH}/users"

# Run Lichen and exit if an error occurs
{
############################################################################
# Finish setting up Lichen run

# The default is r-x and we need PHP to be able to write if edits are made to the provided code
chmod g=rwxs "${basepath}/provided_code/files" || exit 1
chmod g=rwxs "${BASEPATH}/provided_code/files" || exit 1

cd "$(dirname "${0}")" || exit 1

############################################################################
# Do some preprocessing
echo "Beginning Lichen run: $(date +"%Y-%m-%d %H:%M:%S")"
./concatenate_all.py "$basepath" "$datapath" || exit 1
./concatenate_all.py "$BASEPATH" "$DATAPATH" || exit 1

############################################################################
# Move the file somewhere to be processed (eventually this will be a worker machine)

# Tar+zip the file structure and save it to /tmp
cd $basepath || exit 1
archive_name=$(sha1sum "${basepath}/config.json" | awk '{ print $1 }') || exit 1
cd $BASEPATH || exit 1
archive_name=$(sha1sum "${BASEPATH}/config.json" | awk '{ print $1 }') || exit 1
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "config.json" "other_gradeables" "users" "provided_code" || exit 1
cd "$(dirname "${0}")" || exit 1

@@ -71,21 +85,21 @@ mkdir -p "${basepath}/users"

############################################################################
# Run Lichen
./tokenize_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
./hash_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
./compare_hashes.out "$tmp_location" || { rm -rf $tmp_location; exit 1; }
./tokenize_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
./hash_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
./compare_hashes.out "$tmp_location" || { rm -rf "$tmp_location"; echo "${KILL_ERROR_MESSAGE}"; exit 1; }

############################################################################
# Zip the results back up and send them back to the course's lichen directory
cd $tmp_location || exit 1
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "."
rm -rf $tmp_location || exit 1
rm -rf "$tmp_location" || exit 1

# TODO: Move the archive back from worker machine

# Extract archive and restore Lichen file structure
cd $basepath || exit 1
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$basepath"
cd "$BASEPATH" || exit 1
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$BASEPATH"
rm "/tmp/LICHEN_JOB_${archive_name}.tar.gz" || exit 1

} >> "${basepath}/logs/lichen_job_output.txt" 2>&1
} >> "${BASEPATH}/logs/lichen_job_output.txt" 2>&1
21 changes: 17 additions & 4 deletions compare_hashes/compare_hashes.cpp
@@ -101,16 +101,22 @@ int main(int argc, char* argv[]) {
time_t overall_start, overall_end;
time(&overall_start);

// ===========================================================================
// load Lichen config data
std::ifstream lichen_config_istr("./lichen_config.json");
assert(lichen_config_istr.good());
nlohmann::json lichen_config = nlohmann::json::parse(lichen_config_istr);

// ===========================================================================
// load config info

assert (argc == 2);
assert(argc == 2);
std::string lichen_gradeable_path_str = argv[1];
boost::filesystem::path lichen_gradeable_path = boost::filesystem::system_complete(lichen_gradeable_path_str);
boost::filesystem::path config_file_json_path = lichen_gradeable_path / "config.json";

std::ifstream istr(config_file_json_path.string());
assert (istr.good());
assert(istr.good());
nlohmann::json config_file_json = nlohmann::json::parse(istr);

std::string semester = config_file_json.value("semester", "ERROR");
@@ -320,7 +326,7 @@ int main(int argc, char* argv[]) {
continue;
}

// Save this submissions highest percent match for later when we geenrate overall_rankings.txt
// Save this submission's highest percent match for later when we generate overall_rankings.txt
float percentMatch = (*submission_itr)->getPercentage();

std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
@@ -375,12 +381,19 @@
// keep iterating and editing the same object until a we get to a different submission
if (matching_positions_itr->student != other["username"]
|| matching_positions_itr->version != other["version"]
|| matching_positions_itr->source_gradeable != other["source_gradeable"]) {
|| matching_positions_itr->source_gradeable != other["source_gradeable"]
|| matchingpositions.size() >= lichen_config["max_matching_positions"]) {

// found a different one, we push the old one and start over
other["matchingpositions"] = matchingpositions;
others.push_back(other);

if (matchingpositions.size() >= lichen_config["max_matching_positions"]) {
std::cout << "Matching positions array truncated for user: [" << other["username"] << "] version: " << other["version"] << std::endl;
std::cout << " - Try increasing the sequence length to fix this problem." << std::endl;
break;
}

matchingpositions.clear();
other["username"] = matching_positions_itr->student;
other["version"] = matching_positions_itr->version;
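For readers who do not want to trace the C++ iterator logic, a loose Python rendering of the new truncation behavior, with simplified data shapes (field names follow the hunk above; everything else is an assumption):

```python
def group_matching_positions(sorted_positions, max_matching_positions):
    # sorted_positions: dicts with "student", "version", "source_gradeable",
    # and "pos", already ordered so that equal keys are adjacent
    others, positions, group_key = [], [], None
    for mp in sorted_positions:
        key = (mp["student"], mp["version"], mp["source_gradeable"])
        if group_key is not None and (key != group_key
                                      or len(positions) >= max_matching_positions):
            # flush the finished group before starting (or aborting) the next
            others.append({"username": group_key[0], "version": group_key[1],
                           "matchingpositions": positions})
            if len(positions) >= max_matching_positions:
                print(f"Matching positions array truncated for user: "
                      f"[{group_key[0]}] version: {group_key[1]}")
                print("  - Try increasing the sequence length to fix this problem.")
                return others  # mirrors the C++ break
            positions = []
        group_key = key
        positions.append(mp["pos"])
    if positions:
        others.append({"username": group_key[0], "version": group_key[1],
                       "matchingpositions": positions})
    return others
```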