From c28bf90d94dc6ad811b877e6564140c5c91873fb Mon Sep 17 00:00:00 2001 From: Barb Cutler Date: Mon, 4 Jun 2018 10:25:27 -0400 Subject: [PATCH 1/2] initial plaintext code --- .gitignore | 1 + install.sh | 25 ++ .../plaintext/expected_output/output.json | 230 ++++++++++++++++++ .../output_ignore_everything.json | 122 ++++++++++ .../output_ignore_newlines.json | 194 +++++++++++++++ .../output_ignore_punctuation.json | 176 ++++++++++++++ .../expected_output/output_to_lower.json | 230 ++++++++++++++++++ tokenizer/plaintext/input.txt | 6 + tokenizer/plaintext/plaintext_tokenizer.cpp | 183 ++++++++++++++ 9 files changed, 1167 insertions(+) create mode 100644 .gitignore create mode 100755 install.sh create mode 100644 tokenizer/plaintext/expected_output/output.json create mode 100644 tokenizer/plaintext/expected_output/output_ignore_everything.json create mode 100644 tokenizer/plaintext/expected_output/output_ignore_newlines.json create mode 100644 tokenizer/plaintext/expected_output/output_ignore_punctuation.json create mode 100644 tokenizer/plaintext/expected_output/output_to_lower.json create mode 100644 tokenizer/plaintext/input.txt create mode 100644 tokenizer/plaintext/plaintext_tokenizer.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e4e5f6c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..c7115df --- /dev/null +++ b/install.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +src_location="." +build_location="." +bin_location="./bin" + +nlohmann_dir=${src_location}/GIT_NLOHMANN_JSON/ + +if [ ! 
-d "${nlohmann_dir}" ]; then + echo 'should install' + git clone --depth 1 https://github.com/nlohmann/json.git ${nlohmann_dir} +fi + + +mkdir -p ${bin_location} +clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o ${bin_location}/plaintext_tokenizer.out + +${bin_location}/plaintext_tokenizer.out < tokenizer/plaintext/input.txt > output.json +${bin_location}/plaintext_tokenizer.out --ignore_newlines < tokenizer/plaintext/input.txt > output_ignore_newlines.json +${bin_location}/plaintext_tokenizer.out --to_lower < tokenizer/plaintext/input.txt > output_to_lower.json +${bin_location}/plaintext_tokenizer.out --ignore_punctuation < tokenizer/plaintext/input.txt > output_ignore_punctuation.json +${bin_location}/plaintext_tokenizer.out --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower < tokenizer/plaintext/input.txt > output_ignore_everything.json + + + diff --git a/tokenizer/plaintext/expected_output/output.json b/tokenizer/plaintext/expected_output/output.json new file mode 100644 index 0000000..0a04cad --- /dev/null +++ b/tokenizer/plaintext/expected_output/output.json @@ -0,0 +1,230 @@ +[ + { + "char": 5, + "line": 1, + "type": "string", + "value": "A" + }, + { + "char": 7, + "line": 1, + "type": "string", + "value": "Sample" + }, + { + "char": 14, + "line": 1, + "type": "string", + "value": "File" + }, + { + "char": 18, + "line": 1, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 2, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 3, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 3, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 3, + "type": "string", + "value": "contains" + }, + { + "char": 20, + "line": 3, + "type": "number", + "value": "1" + }, + { + "char": 22, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 23, + "line": 3, + "type": "string", + "value": "sample" + }, + { + "char": 30, 
+ "line": 3, + "type": "string", + "value": "of" + }, + { + "char": 33, + "line": 3, + "type": "string", + "value": "plaintext" + }, + { + "char": 42, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 43, + "line": 3, + "type": "punctuation", + "value": "." + }, + { + "char": 46, + "line": 3, + "type": "string", + "value": "We" + }, + { + "char": 48, + "line": 3, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 4, + "type": "string", + "value": "can" + }, + { + "char": 5, + "line": 4, + "type": "string", + "value": "tokenize" + }, + { + "char": 14, + "line": 4, + "type": "string", + "value": "THIS" + }, + { + "char": 18, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 21, + "line": 4, + "type": "string", + "value": "a" + }, + { + "char": 22, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 23, + "line": 4, + "type": "string", + "value": "b" + }, + { + "char": 24, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 25, + "line": 4, + "type": "string", + "value": "c" + }, + { + "char": 26, + "line": 4, + "type": "number", + "value": "1" + }, + { + "char": 27, + "line": 4, + "type": "string", + "value": "d" + }, + { + "char": 28, + "line": 4, + "type": "number", + "value": "2" + }, + { + "char": 29, + "line": 4, + "type": "string", + "value": "e" + }, + { + "char": 30, + "line": 4, + "type": "punctuation", + "value": "!" + }, + { + "char": 31, + "line": 4, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 5, + "type": "string", + "value": "Good" + }, + { + "char": 5, + "line": 5, + "type": "punctuation", + "value": "-" + }, + { + "char": 6, + "line": 5, + "type": "string", + "value": "bye" + }, + { + "char": 9, + "line": 5, + "type": "punctuation", + "value": "." 
+ }, + { + "char": 10, + "line": 5, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 6, + "type": "newline", + "value": "\n" + } +] diff --git a/tokenizer/plaintext/expected_output/output_ignore_everything.json b/tokenizer/plaintext/expected_output/output_ignore_everything.json new file mode 100644 index 0000000..86c0037 --- /dev/null +++ b/tokenizer/plaintext/expected_output/output_ignore_everything.json @@ -0,0 +1,122 @@ +[ + { + "char": 5, + "line": 1, + "type": "string", + "value": "a" + }, + { + "char": 7, + "line": 1, + "type": "string", + "value": "sample" + }, + { + "char": 14, + "line": 1, + "type": "string", + "value": "file" + }, + { + "char": 1, + "line": 3, + "type": "string", + "value": "this" + }, + { + "char": 6, + "line": 3, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 3, + "type": "string", + "value": "contains" + }, + { + "char": 23, + "line": 3, + "type": "string", + "value": "sample" + }, + { + "char": 30, + "line": 3, + "type": "string", + "value": "of" + }, + { + "char": 33, + "line": 3, + "type": "string", + "value": "plaintext" + }, + { + "char": 46, + "line": 3, + "type": "string", + "value": "we" + }, + { + "char": 1, + "line": 4, + "type": "string", + "value": "can" + }, + { + "char": 5, + "line": 4, + "type": "string", + "value": "tokenize" + }, + { + "char": 14, + "line": 4, + "type": "string", + "value": "this" + }, + { + "char": 21, + "line": 4, + "type": "string", + "value": "a" + }, + { + "char": 23, + "line": 4, + "type": "string", + "value": "b" + }, + { + "char": 25, + "line": 4, + "type": "string", + "value": "c" + }, + { + "char": 27, + "line": 4, + "type": "string", + "value": "d" + }, + { + "char": 29, + "line": 4, + "type": "string", + "value": "e" + }, + { + "char": 1, + "line": 5, + "type": "string", + "value": "good" + }, + { + "char": 6, + "line": 5, + "type": "string", + "value": "bye" + } +] diff --git a/tokenizer/plaintext/expected_output/output_ignore_newlines.json 
b/tokenizer/plaintext/expected_output/output_ignore_newlines.json new file mode 100644 index 0000000..35f4422 --- /dev/null +++ b/tokenizer/plaintext/expected_output/output_ignore_newlines.json @@ -0,0 +1,194 @@ +[ + { + "char": 5, + "line": 1, + "type": "string", + "value": "A" + }, + { + "char": 7, + "line": 1, + "type": "string", + "value": "Sample" + }, + { + "char": 14, + "line": 1, + "type": "string", + "value": "File" + }, + { + "char": 1, + "line": 3, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 3, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 3, + "type": "string", + "value": "contains" + }, + { + "char": 20, + "line": 3, + "type": "number", + "value": "1" + }, + { + "char": 22, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 23, + "line": 3, + "type": "string", + "value": "sample" + }, + { + "char": 30, + "line": 3, + "type": "string", + "value": "of" + }, + { + "char": 33, + "line": 3, + "type": "string", + "value": "plaintext" + }, + { + "char": 42, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 43, + "line": 3, + "type": "punctuation", + "value": "." + }, + { + "char": 46, + "line": 3, + "type": "string", + "value": "We" + }, + { + "char": 1, + "line": 4, + "type": "string", + "value": "can" + }, + { + "char": 5, + "line": 4, + "type": "string", + "value": "tokenize" + }, + { + "char": 14, + "line": 4, + "type": "string", + "value": "THIS" + }, + { + "char": 18, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 21, + "line": 4, + "type": "string", + "value": "a" + }, + { + "char": 22, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 23, + "line": 4, + "type": "string", + "value": "b" + }, + { + "char": 24, + "line": 4, + "type": "punctuation", + "value": "." 
+ }, + { + "char": 25, + "line": 4, + "type": "string", + "value": "c" + }, + { + "char": 26, + "line": 4, + "type": "number", + "value": "1" + }, + { + "char": 27, + "line": 4, + "type": "string", + "value": "d" + }, + { + "char": 28, + "line": 4, + "type": "number", + "value": "2" + }, + { + "char": 29, + "line": 4, + "type": "string", + "value": "e" + }, + { + "char": 30, + "line": 4, + "type": "punctuation", + "value": "!" + }, + { + "char": 1, + "line": 5, + "type": "string", + "value": "Good" + }, + { + "char": 5, + "line": 5, + "type": "punctuation", + "value": "-" + }, + { + "char": 6, + "line": 5, + "type": "string", + "value": "bye" + }, + { + "char": 9, + "line": 5, + "type": "punctuation", + "value": "." + } +] diff --git a/tokenizer/plaintext/expected_output/output_ignore_punctuation.json b/tokenizer/plaintext/expected_output/output_ignore_punctuation.json new file mode 100644 index 0000000..341d794 --- /dev/null +++ b/tokenizer/plaintext/expected_output/output_ignore_punctuation.json @@ -0,0 +1,176 @@ +[ + { + "char": 5, + "line": 1, + "type": "string", + "value": "A" + }, + { + "char": 7, + "line": 1, + "type": "string", + "value": "Sample" + }, + { + "char": 14, + "line": 1, + "type": "string", + "value": "File" + }, + { + "char": 18, + "line": 1, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 2, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 3, + "type": "string", + "value": "This" + }, + { + "char": 6, + "line": 3, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 3, + "type": "string", + "value": "contains" + }, + { + "char": 20, + "line": 3, + "type": "number", + "value": "1" + }, + { + "char": 23, + "line": 3, + "type": "string", + "value": "sample" + }, + { + "char": 30, + "line": 3, + "type": "string", + "value": "of" + }, + { + "char": 33, + "line": 3, + "type": "string", + "value": "plaintext" + }, + { + "char": 46, + "line": 3, + "type": "string", + "value": "We" + }, + { + 
"char": 48, + "line": 3, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 4, + "type": "string", + "value": "can" + }, + { + "char": 5, + "line": 4, + "type": "string", + "value": "tokenize" + }, + { + "char": 14, + "line": 4, + "type": "string", + "value": "THIS" + }, + { + "char": 21, + "line": 4, + "type": "string", + "value": "a" + }, + { + "char": 23, + "line": 4, + "type": "string", + "value": "b" + }, + { + "char": 25, + "line": 4, + "type": "string", + "value": "c" + }, + { + "char": 26, + "line": 4, + "type": "number", + "value": "1" + }, + { + "char": 27, + "line": 4, + "type": "string", + "value": "d" + }, + { + "char": 28, + "line": 4, + "type": "number", + "value": "2" + }, + { + "char": 29, + "line": 4, + "type": "string", + "value": "e" + }, + { + "char": 31, + "line": 4, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 5, + "type": "string", + "value": "Good" + }, + { + "char": 6, + "line": 5, + "type": "string", + "value": "bye" + }, + { + "char": 10, + "line": 5, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 6, + "type": "newline", + "value": "\n" + } +] diff --git a/tokenizer/plaintext/expected_output/output_to_lower.json b/tokenizer/plaintext/expected_output/output_to_lower.json new file mode 100644 index 0000000..14b0da1 --- /dev/null +++ b/tokenizer/plaintext/expected_output/output_to_lower.json @@ -0,0 +1,230 @@ +[ + { + "char": 5, + "line": 1, + "type": "string", + "value": "a" + }, + { + "char": 7, + "line": 1, + "type": "string", + "value": "sample" + }, + { + "char": 14, + "line": 1, + "type": "string", + "value": "file" + }, + { + "char": 18, + "line": 1, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 2, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 3, + "type": "string", + "value": "this" + }, + { + "char": 6, + "line": 3, + "type": "string", + "value": "file" + }, + { + "char": 11, + "line": 3, + "type": "string", + "value": 
"contains" + }, + { + "char": 20, + "line": 3, + "type": "number", + "value": "1" + }, + { + "char": 22, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 23, + "line": 3, + "type": "string", + "value": "sample" + }, + { + "char": 30, + "line": 3, + "type": "string", + "value": "of" + }, + { + "char": 33, + "line": 3, + "type": "string", + "value": "plaintext" + }, + { + "char": 42, + "line": 3, + "type": "punctuation", + "value": "\"" + }, + { + "char": 43, + "line": 3, + "type": "punctuation", + "value": "." + }, + { + "char": 46, + "line": 3, + "type": "string", + "value": "we" + }, + { + "char": 48, + "line": 3, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 4, + "type": "string", + "value": "can" + }, + { + "char": 5, + "line": 4, + "type": "string", + "value": "tokenize" + }, + { + "char": 14, + "line": 4, + "type": "string", + "value": "this" + }, + { + "char": 18, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 21, + "line": 4, + "type": "string", + "value": "a" + }, + { + "char": 22, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 23, + "line": 4, + "type": "string", + "value": "b" + }, + { + "char": 24, + "line": 4, + "type": "punctuation", + "value": "." + }, + { + "char": 25, + "line": 4, + "type": "string", + "value": "c" + }, + { + "char": 26, + "line": 4, + "type": "number", + "value": "1" + }, + { + "char": 27, + "line": 4, + "type": "string", + "value": "d" + }, + { + "char": 28, + "line": 4, + "type": "number", + "value": "2" + }, + { + "char": 29, + "line": 4, + "type": "string", + "value": "e" + }, + { + "char": 30, + "line": 4, + "type": "punctuation", + "value": "!" 
+ }, + { + "char": 31, + "line": 4, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 5, + "type": "string", + "value": "good" + }, + { + "char": 5, + "line": 5, + "type": "punctuation", + "value": "-" + }, + { + "char": 6, + "line": 5, + "type": "string", + "value": "bye" + }, + { + "char": 9, + "line": 5, + "type": "punctuation", + "value": "." + }, + { + "char": 10, + "line": 5, + "type": "newline", + "value": "\n" + }, + { + "char": 1, + "line": 6, + "type": "newline", + "value": "\n" + } +] diff --git a/tokenizer/plaintext/input.txt b/tokenizer/plaintext/input.txt new file mode 100644 index 0000000..4c059cf --- /dev/null +++ b/tokenizer/plaintext/input.txt @@ -0,0 +1,6 @@ + A Sample File + +This file contains 1 "sample of plaintext". We +can tokenize THIS. a.b.c1d2e! +Good-bye. + diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp new file mode 100644 index 0000000..ba54107 --- /dev/null +++ b/tokenizer/plaintext/plaintext_tokenizer.cpp @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#include +#include +#include +#include "nlohmann/json.hpp" + +void usage(const std::string &program) { + std::cerr << "Usage: " << program << " [--ignore_punctuation] [--to_lower] [--ignore_numbers] [--ignore_newlines] < INPUT_FILE.txt > OUTPUT_FILE.json" << std::endl; + exit(0); +} + + +int main(int argc, char* argv[]) { + + // ------------------------------ + // handle arguments + bool ignore_punctuation = false; + bool to_lower = false; + bool ignore_numbers = false; + bool ignore_newlines = false; + for (int i = 1; i < argc; i++) { + if (std::string(argv[i]) == std::string("--ignore_punctuation")) { + ignore_punctuation = true; + } else if (std::string(argv[i]) == std::string("--to_lower")) { + to_lower = true; + } else if (std::string(argv[i]) == std::string("--ignore_numbers")) { + ignore_numbers = true; + } else if (std::string(argv[i]) == std::string("--ignore_newlines")) { + ignore_newlines = 
true; + } else { + std::cerr << "ERROR: Unknown option '" << argv[i] << "'" << std::endl; + usage(argv[0]); + } + } + + // ------------------------------ + // helper variables + nlohmann::json tokens; + char c; + std::string token; + int row = 1; + int col = 1; + int start_row = -1; + int start_col = -1; + bool last_was_alpha = false; + bool last_was_digit = false; + + // ------------------------------ + // loop to read the input file + while (std::cin >> std::noskipws >> c) { + bool is_punctuation = !isspace(c) && !std::isdigit(c) && !std::isalpha(c); + + // ------------------------------ + // decide when to break the current string + // break on spaces, punctuation (any symbol), or if we switch between letters and numbers + if (isspace(c) || + is_punctuation || + (last_was_alpha && std::isdigit(c)) || + (last_was_digit && std::isalpha(c))) { + if (token != "") { + // save this token! + std::map tmp; + tmp["line"]=start_row; + tmp["char"]=start_col; + if (last_was_digit) { + assert (!last_was_alpha); + tmp["type"]="number"; + tmp["value"]=std::stoi(token); + } else { + assert (last_was_alpha); + tmp["type"]="string"; + tmp["value"]=token; + } + tmp["value"]=token; + tokens.push_back(tmp); + token=""; + last_was_alpha = false; + last_was_digit = false; + } + } + + // ------------------------------ + // decide whether to add this character to the current string + if (isspace(c)) { + // never add spaces + } + // ------------------------------ + else if (is_punctuation) { + assert (token == ""); + assert (last_was_alpha == false); + assert (last_was_digit == false); + // only add punctuation if it's not being ignored + // (punctuation is always a single symbol character per token) + if (!ignore_punctuation) { + std::map tmp; + tmp["line"]=row; + tmp["char"]=col; + tmp["type"]="punctuation"; + tmp["value"]=std::string(1,c); + tokens.push_back(tmp); + } + } + // ------------------------------ + else if (std::isdigit(c)) { + assert (last_was_alpha == false); + //
only add digits/numbers if they are not being ignored + // numbers will be 1 or more digits 0-9. + // We break tokens between letters & numbers. + if (!ignore_numbers) { + if (token=="") { + start_row = row; + start_col = col; + last_was_digit=true; + } else { + assert (last_was_digit == true); + } + token.push_back(c); + } + } + // ------------------------------ + else if (isalpha(c)) { + assert (last_was_digit == false); + // string tokens will be 1 or more letters a-z or A-Z + if (token=="") { + start_row = row; + start_col = col; + last_was_alpha=true; + } else { + assert (last_was_alpha == true); + } + // option to lowercase + if (to_lower) { + c = std::tolower(c); + } + token.push_back(c); + } + + // ------------------------------ + if (c == '\n') { + // advance to the next row/line + assert (token==""); + if (!ignore_newlines) { + // output a token for the newline + std::map tmp; + tmp["line"]=row; + tmp["char"]=col; + tmp["type"]="newline"; + tmp["value"]="\n"; + tokens.push_back(tmp); + } + row++; + col=1; + } else { + // advance to the next column/character + col++; + } + } + + // ------------------------------ + if (token != "") { + // save the last token (if there was no space or newline at the end of the file) + std::map tmp; + tmp["line"]=start_row; + tmp["char"]=start_col; + if (last_was_digit) { + assert (!last_was_alpha); + tmp["type"]="number"; + tmp["value"]=std::stoi(token); + } else { + assert (last_was_alpha); + tmp["type"]="string"; + tmp["value"]=token; + } + tokens.push_back(tmp); + } + + // ------------------------------ + // export/save in json format + std::cout << tokens.dump(4) << std::endl; +} From 3fd9ae01f3273b1a806b3fe208d42574261d4a85 Mon Sep 17 00:00:00 2001 From: Barb Cutler Date: Mon, 4 Jun 2018 14:42:48 -0400 Subject: [PATCH 2/2] duplicate line preventing number values --- tokenizer/plaintext/plaintext_tokenizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp 
b/tokenizer/plaintext/plaintext_tokenizer.cpp index ba54107..8fd793d 100644 --- a/tokenizer/plaintext/plaintext_tokenizer.cpp +++ b/tokenizer/plaintext/plaintext_tokenizer.cpp @@ -74,7 +74,6 @@ int main(int argc, char* argv[]) { tmp["type"]="string"; tmp["value"]=token; } - tmp["value"]=token; tokens.push_back(tmp); token=""; last_was_alpha = false;