From c28bf90d94dc6ad811b877e6564140c5c91873fb Mon Sep 17 00:00:00 2001
From: Barb Cutler <bmcutler@gmail.com>
Date: Mon, 4 Jun 2018 10:25:27 -0400
Subject: [PATCH 1/2] initial plaintext code

---
 .gitignore                                    |   1 +
 install.sh                                    |  25 ++
 .../plaintext/expected_output/output.json     | 230 ++++++++++++++++++
 .../output_ignore_everything.json             | 122 ++++++++++
 .../output_ignore_newlines.json               | 194 +++++++++++++++
 .../output_ignore_punctuation.json            | 176 ++++++++++++++
 .../expected_output/output_to_lower.json      | 230 ++++++++++++++++++
 tokenizer/plaintext/input.txt                 |   6 +
 tokenizer/plaintext/plaintext_tokenizer.cpp   | 183 ++++++++++++++
 9 files changed, 1167 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 install.sh
 create mode 100644 tokenizer/plaintext/expected_output/output.json
 create mode 100644 tokenizer/plaintext/expected_output/output_ignore_everything.json
 create mode 100644 tokenizer/plaintext/expected_output/output_ignore_newlines.json
 create mode 100644 tokenizer/plaintext/expected_output/output_ignore_punctuation.json
 create mode 100644 tokenizer/plaintext/expected_output/output_to_lower.json
 create mode 100644 tokenizer/plaintext/input.txt
 create mode 100644 tokenizer/plaintext/plaintext_tokenizer.cpp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e4e5f6c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
\ No newline at end of file
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..c7115df
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Fetches nlohmann/json if it is missing, builds the plaintext tokenizer, and runs it on the sample input.
+set -e  # stop at the first failing command so a failed clone or compile never runs a missing or stale binary
+
+src_location="."
+build_location="."
+bin_location="./bin"
+
+nlohmann_dir="${src_location}/GIT_NLOHMANN_JSON/"
+
+# clone the JSON library only when its directory is not there yet
+if [ ! -d "${nlohmann_dir}" ]; then
+    echo 'should install'
+    git clone --depth 1 https://github.com/nlohmann/json.git "${nlohmann_dir}"
+fi
+
+mkdir -p "${bin_location}"
+clang++ -I "${nlohmann_dir}/include/" -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o "${bin_location}/plaintext_tokenizer.out"
+
+# one run per option combination; each result is written to the current directory
+"${bin_location}/plaintext_tokenizer.out"                                                                    < tokenizer/plaintext/input.txt > output.json
+"${bin_location}/plaintext_tokenizer.out" --ignore_newlines                                                  < tokenizer/plaintext/input.txt > output_ignore_newlines.json
+"${bin_location}/plaintext_tokenizer.out" --to_lower                                                         < tokenizer/plaintext/input.txt > output_to_lower.json
+"${bin_location}/plaintext_tokenizer.out" --ignore_punctuation                                               < tokenizer/plaintext/input.txt > output_ignore_punctuation.json
+"${bin_location}/plaintext_tokenizer.out" --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower < tokenizer/plaintext/input.txt > output_ignore_everything.json
diff --git a/tokenizer/plaintext/expected_output/output.json b/tokenizer/plaintext/expected_output/output.json
new file mode 100644
index 0000000..0a04cad
--- /dev/null
+++ b/tokenizer/plaintext/expected_output/output.json
@@ -0,0 +1,230 @@
+[
+    {
+        "char": 5,
+        "line": 1,
+        "type": "string",
+        "value": "A"
+    },
+    {
+        "char": 7,
+        "line": 1,
+        "type": "string",
+        "value": "Sample"
+    },
+    {
+        "char": 14,
+        "line": 1,
+        "type": "string",
+        "value": "File"
+    },
+    {
+        "char": 18,
+        "line": 1,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 2,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 3,
+        "type": "string",
+        "value": "This"
+    },
+    {
+        "char": 6,
+        "line": 3,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 11,
+        "line": 3,
+        "type": "string",
+        "value": "contains"
+    },
+    {
+        "char": 20,
+        "line": 3,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 22,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 23,
+        "line": 3,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 30,
+        "line": 3,
+        "type": "string",
+        "value": "of"
+    },
+    {
+        "char": 33,
+        "line": 3,
+        "type": "string",
+        "value": "plaintext"
+    },
+    {
+        "char": 42,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 43,
+        "line": 3,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 46,
+        "line": 3,
+        "type": "string",
+        "value": "We"
+    },
+    {
+        "char": 48,
+        "line": 3,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 4,
+        "type": "string",
+        "value": "can"
+    },
+    {
+        "char": 5,
+        "line": 4,
+        "type": "string",
+        "value": "tokenize"
+    },
+    {
+        "char": 14,
+        "line": 4,
+        "type": "string",
+        "value": "THIS"
+    },
+    {
+        "char": 18,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 21,
+        "line": 4,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 22,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 23,
+        "line": 4,
+        "type": "string",
+        "value": "b"
+    },
+    {
+        "char": 24,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 25,
+        "line": 4,
+        "type": "string",
+        "value": "c"
+    },
+    {
+        "char": 26,
+        "line": 4,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 27,
+        "line": 4,
+        "type": "string",
+        "value": "d"
+    },
+    {
+        "char": 28,
+        "line": 4,
+        "type": "number",
+        "value": "2"
+    },
+    {
+        "char": 29,
+        "line": 4,
+        "type": "string",
+        "value": "e"
+    },
+    {
+        "char": 30,
+        "line": 4,
+        "type": "punctuation",
+        "value": "!"
+    },
+    {
+        "char": 31,
+        "line": 4,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 5,
+        "type": "string",
+        "value": "Good"
+    },
+    {
+        "char": 5,
+        "line": 5,
+        "type": "punctuation",
+        "value": "-"
+    },
+    {
+        "char": 6,
+        "line": 5,
+        "type": "string",
+        "value": "bye"
+    },
+    {
+        "char": 9,
+        "line": 5,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 10,
+        "line": 5,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 6,
+        "type": "newline",
+        "value": "\n"
+    }
+]
diff --git a/tokenizer/plaintext/expected_output/output_ignore_everything.json b/tokenizer/plaintext/expected_output/output_ignore_everything.json
new file mode 100644
index 0000000..86c0037
--- /dev/null
+++ b/tokenizer/plaintext/expected_output/output_ignore_everything.json
@@ -0,0 +1,122 @@
+[
+    {
+        "char": 5,
+        "line": 1,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 7,
+        "line": 1,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 14,
+        "line": 1,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 1,
+        "line": 3,
+        "type": "string",
+        "value": "this"
+    },
+    {
+        "char": 6,
+        "line": 3,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 11,
+        "line": 3,
+        "type": "string",
+        "value": "contains"
+    },
+    {
+        "char": 23,
+        "line": 3,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 30,
+        "line": 3,
+        "type": "string",
+        "value": "of"
+    },
+    {
+        "char": 33,
+        "line": 3,
+        "type": "string",
+        "value": "plaintext"
+    },
+    {
+        "char": 46,
+        "line": 3,
+        "type": "string",
+        "value": "we"
+    },
+    {
+        "char": 1,
+        "line": 4,
+        "type": "string",
+        "value": "can"
+    },
+    {
+        "char": 5,
+        "line": 4,
+        "type": "string",
+        "value": "tokenize"
+    },
+    {
+        "char": 14,
+        "line": 4,
+        "type": "string",
+        "value": "this"
+    },
+    {
+        "char": 21,
+        "line": 4,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 23,
+        "line": 4,
+        "type": "string",
+        "value": "b"
+    },
+    {
+        "char": 25,
+        "line": 4,
+        "type": "string",
+        "value": "c"
+    },
+    {
+        "char": 27,
+        "line": 4,
+        "type": "string",
+        "value": "d"
+    },
+    {
+        "char": 29,
+        "line": 4,
+        "type": "string",
+        "value": "e"
+    },
+    {
+        "char": 1,
+        "line": 5,
+        "type": "string",
+        "value": "good"
+    },
+    {
+        "char": 6,
+        "line": 5,
+        "type": "string",
+        "value": "bye"
+    }
+]
diff --git a/tokenizer/plaintext/expected_output/output_ignore_newlines.json b/tokenizer/plaintext/expected_output/output_ignore_newlines.json
new file mode 100644
index 0000000..35f4422
--- /dev/null
+++ b/tokenizer/plaintext/expected_output/output_ignore_newlines.json
@@ -0,0 +1,194 @@
+[
+    {
+        "char": 5,
+        "line": 1,
+        "type": "string",
+        "value": "A"
+    },
+    {
+        "char": 7,
+        "line": 1,
+        "type": "string",
+        "value": "Sample"
+    },
+    {
+        "char": 14,
+        "line": 1,
+        "type": "string",
+        "value": "File"
+    },
+    {
+        "char": 1,
+        "line": 3,
+        "type": "string",
+        "value": "This"
+    },
+    {
+        "char": 6,
+        "line": 3,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 11,
+        "line": 3,
+        "type": "string",
+        "value": "contains"
+    },
+    {
+        "char": 20,
+        "line": 3,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 22,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 23,
+        "line": 3,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 30,
+        "line": 3,
+        "type": "string",
+        "value": "of"
+    },
+    {
+        "char": 33,
+        "line": 3,
+        "type": "string",
+        "value": "plaintext"
+    },
+    {
+        "char": 42,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 43,
+        "line": 3,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 46,
+        "line": 3,
+        "type": "string",
+        "value": "We"
+    },
+    {
+        "char": 1,
+        "line": 4,
+        "type": "string",
+        "value": "can"
+    },
+    {
+        "char": 5,
+        "line": 4,
+        "type": "string",
+        "value": "tokenize"
+    },
+    {
+        "char": 14,
+        "line": 4,
+        "type": "string",
+        "value": "THIS"
+    },
+    {
+        "char": 18,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 21,
+        "line": 4,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 22,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 23,
+        "line": 4,
+        "type": "string",
+        "value": "b"
+    },
+    {
+        "char": 24,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 25,
+        "line": 4,
+        "type": "string",
+        "value": "c"
+    },
+    {
+        "char": 26,
+        "line": 4,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 27,
+        "line": 4,
+        "type": "string",
+        "value": "d"
+    },
+    {
+        "char": 28,
+        "line": 4,
+        "type": "number",
+        "value": "2"
+    },
+    {
+        "char": 29,
+        "line": 4,
+        "type": "string",
+        "value": "e"
+    },
+    {
+        "char": 30,
+        "line": 4,
+        "type": "punctuation",
+        "value": "!"
+    },
+    {
+        "char": 1,
+        "line": 5,
+        "type": "string",
+        "value": "Good"
+    },
+    {
+        "char": 5,
+        "line": 5,
+        "type": "punctuation",
+        "value": "-"
+    },
+    {
+        "char": 6,
+        "line": 5,
+        "type": "string",
+        "value": "bye"
+    },
+    {
+        "char": 9,
+        "line": 5,
+        "type": "punctuation",
+        "value": "."
+    }
+]
diff --git a/tokenizer/plaintext/expected_output/output_ignore_punctuation.json b/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
new file mode 100644
index 0000000..341d794
--- /dev/null
+++ b/tokenizer/plaintext/expected_output/output_ignore_punctuation.json
@@ -0,0 +1,176 @@
+[
+    {
+        "char": 5,
+        "line": 1,
+        "type": "string",
+        "value": "A"
+    },
+    {
+        "char": 7,
+        "line": 1,
+        "type": "string",
+        "value": "Sample"
+    },
+    {
+        "char": 14,
+        "line": 1,
+        "type": "string",
+        "value": "File"
+    },
+    {
+        "char": 18,
+        "line": 1,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 2,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 3,
+        "type": "string",
+        "value": "This"
+    },
+    {
+        "char": 6,
+        "line": 3,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 11,
+        "line": 3,
+        "type": "string",
+        "value": "contains"
+    },
+    {
+        "char": 20,
+        "line": 3,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 23,
+        "line": 3,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 30,
+        "line": 3,
+        "type": "string",
+        "value": "of"
+    },
+    {
+        "char": 33,
+        "line": 3,
+        "type": "string",
+        "value": "plaintext"
+    },
+    {
+        "char": 46,
+        "line": 3,
+        "type": "string",
+        "value": "We"
+    },
+    {
+        "char": 48,
+        "line": 3,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 4,
+        "type": "string",
+        "value": "can"
+    },
+    {
+        "char": 5,
+        "line": 4,
+        "type": "string",
+        "value": "tokenize"
+    },
+    {
+        "char": 14,
+        "line": 4,
+        "type": "string",
+        "value": "THIS"
+    },
+    {
+        "char": 21,
+        "line": 4,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 23,
+        "line": 4,
+        "type": "string",
+        "value": "b"
+    },
+    {
+        "char": 25,
+        "line": 4,
+        "type": "string",
+        "value": "c"
+    },
+    {
+        "char": 26,
+        "line": 4,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 27,
+        "line": 4,
+        "type": "string",
+        "value": "d"
+    },
+    {
+        "char": 28,
+        "line": 4,
+        "type": "number",
+        "value": "2"
+    },
+    {
+        "char": 29,
+        "line": 4,
+        "type": "string",
+        "value": "e"
+    },
+    {
+        "char": 31,
+        "line": 4,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 5,
+        "type": "string",
+        "value": "Good"
+    },
+    {
+        "char": 6,
+        "line": 5,
+        "type": "string",
+        "value": "bye"
+    },
+    {
+        "char": 10,
+        "line": 5,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 6,
+        "type": "newline",
+        "value": "\n"
+    }
+]
diff --git a/tokenizer/plaintext/expected_output/output_to_lower.json b/tokenizer/plaintext/expected_output/output_to_lower.json
new file mode 100644
index 0000000..14b0da1
--- /dev/null
+++ b/tokenizer/plaintext/expected_output/output_to_lower.json
@@ -0,0 +1,230 @@
+[
+    {
+        "char": 5,
+        "line": 1,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 7,
+        "line": 1,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 14,
+        "line": 1,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 18,
+        "line": 1,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 2,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 3,
+        "type": "string",
+        "value": "this"
+    },
+    {
+        "char": 6,
+        "line": 3,
+        "type": "string",
+        "value": "file"
+    },
+    {
+        "char": 11,
+        "line": 3,
+        "type": "string",
+        "value": "contains"
+    },
+    {
+        "char": 20,
+        "line": 3,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 22,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 23,
+        "line": 3,
+        "type": "string",
+        "value": "sample"
+    },
+    {
+        "char": 30,
+        "line": 3,
+        "type": "string",
+        "value": "of"
+    },
+    {
+        "char": 33,
+        "line": 3,
+        "type": "string",
+        "value": "plaintext"
+    },
+    {
+        "char": 42,
+        "line": 3,
+        "type": "punctuation",
+        "value": "\""
+    },
+    {
+        "char": 43,
+        "line": 3,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 46,
+        "line": 3,
+        "type": "string",
+        "value": "we"
+    },
+    {
+        "char": 48,
+        "line": 3,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 4,
+        "type": "string",
+        "value": "can"
+    },
+    {
+        "char": 5,
+        "line": 4,
+        "type": "string",
+        "value": "tokenize"
+    },
+    {
+        "char": 14,
+        "line": 4,
+        "type": "string",
+        "value": "this"
+    },
+    {
+        "char": 18,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 21,
+        "line": 4,
+        "type": "string",
+        "value": "a"
+    },
+    {
+        "char": 22,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 23,
+        "line": 4,
+        "type": "string",
+        "value": "b"
+    },
+    {
+        "char": 24,
+        "line": 4,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 25,
+        "line": 4,
+        "type": "string",
+        "value": "c"
+    },
+    {
+        "char": 26,
+        "line": 4,
+        "type": "number",
+        "value": "1"
+    },
+    {
+        "char": 27,
+        "line": 4,
+        "type": "string",
+        "value": "d"
+    },
+    {
+        "char": 28,
+        "line": 4,
+        "type": "number",
+        "value": "2"
+    },
+    {
+        "char": 29,
+        "line": 4,
+        "type": "string",
+        "value": "e"
+    },
+    {
+        "char": 30,
+        "line": 4,
+        "type": "punctuation",
+        "value": "!"
+    },
+    {
+        "char": 31,
+        "line": 4,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 5,
+        "type": "string",
+        "value": "good"
+    },
+    {
+        "char": 5,
+        "line": 5,
+        "type": "punctuation",
+        "value": "-"
+    },
+    {
+        "char": 6,
+        "line": 5,
+        "type": "string",
+        "value": "bye"
+    },
+    {
+        "char": 9,
+        "line": 5,
+        "type": "punctuation",
+        "value": "."
+    },
+    {
+        "char": 10,
+        "line": 5,
+        "type": "newline",
+        "value": "\n"
+    },
+    {
+        "char": 1,
+        "line": 6,
+        "type": "newline",
+        "value": "\n"
+    }
+]
diff --git a/tokenizer/plaintext/input.txt b/tokenizer/plaintext/input.txt
new file mode 100644
index 0000000..4c059cf
--- /dev/null
+++ b/tokenizer/plaintext/input.txt
@@ -0,0 +1,6 @@
+    A Sample File
+
+This file contains 1 "sample of plaintext".  We
+can tokenize THIS.  a.b.c1d2e!
+Good-bye.
+
diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp
new file mode 100644
index 0000000..ba54107
--- /dev/null
+++ b/tokenizer/plaintext/plaintext_tokenizer.cpp
@@ -0,0 +1,183 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cassert>
+#include <iomanip>
+#include <ctype.h>
+#include <cstdio>
+#include "nlohmann/json.hpp"
+
+void usage(const std::string &program) {  // prints the accepted flags to stderr, then terminates the process
+  std::cerr << "Usage: " << program << " [--ignore_punctuation] [--to_lower] [--ignore_numbers] [--ignore_newlines] < INPUT_FILE.txt > OUTPUT_FILE.json" << std::endl;
+  exit(1);  // called after an unrecognised option, so report failure instead of success
+}
+
+
+int main(int argc, char* argv[]) {
+  // Reads text from stdin, splits it into string / number / punctuation / newline tokens, and prints them to stdout as a JSON array.
+  // ------------------------------
+  // handle arguments
+  bool ignore_punctuation = false;
+  bool to_lower = false;
+  bool ignore_numbers = false;
+  bool ignore_newlines = false;
+  for (int i = 1; i < argc; i++) {
+    if (std::string(argv[i]) == std::string("--ignore_punctuation")) {
+      ignore_punctuation = true;
+    } else if (std::string(argv[i]) == std::string("--to_lower")) {
+      to_lower = true;
+    } else if (std::string(argv[i]) == std::string("--ignore_numbers")) {
+      ignore_numbers = true;
+    } else if (std::string(argv[i]) == std::string("--ignore_newlines")) {
+      ignore_newlines = true;
+    } else {
+      std::cerr << "ERROR: Unknown option '" << argv[i] << "'" << std::endl;
+      usage(argv[0]);
+    }
+  }
+
+  // ------------------------------
+  // helper variables
+  nlohmann::json tokens = nlohmann::json::array();  // emitted tokens in input order; starts as [] so empty input does not print null
+  char c;                       // the character currently being examined
+  std::string token;            // characters of the string/number token being built ("" when none)
+  int row = 1;                  // 1-based line of c
+  int col = 1;                  // 1-based column of c
+  int start_row = -1;           // line on which the token being built began
+  int start_col = -1;           // column at which the token being built began
+  bool last_was_alpha = false;  // the token being built is made of letters
+  bool last_was_digit = false;  // the token being built is made of digits
+
+  // ------------------------------
+  // loop to read the input one character at a time
+  while (std::cin >> std::noskipws >> c) {
+    const unsigned char uc = static_cast<unsigned char>(c);  // the <ctype.h> classifiers are undefined for negative char values
+    const bool is_space = isspace(uc);
+    const bool is_digit = isdigit(uc);
+    const bool is_alpha = isalpha(uc);
+    const bool is_punctuation = !is_space && !is_digit && !is_alpha;
+    // ------------------------------
+    // break the current token on spaces, punctuation (any symbol), or a switch between letters and digits
+    if (is_space || is_punctuation ||
+        (last_was_alpha && is_digit) || (last_was_digit && is_alpha)) {
+      if (token != "") {
+        // save this token!
+        std::map<std::string,nlohmann::json> tmp;
+        tmp["line"]=start_row;
+        tmp["char"]=start_col;
+        if (last_was_digit) {
+          assert (!last_was_alpha);
+          tmp["type"]="number";
+          try { tmp["value"]=std::stoi(token); } catch (...) { tmp["value"]=token; }  // std::stoi throws when the digits overflow int; keep them as text
+        } else {
+          assert (last_was_alpha);
+          tmp["type"]="string";
+          tmp["value"]=token;
+        }
+        tmp["value"]=token;
+        tokens.push_back(tmp);
+        token="";
+        last_was_alpha = false;
+        last_was_digit = false;
+      }
+    }
+
+    // ------------------------------
+    // decide whether to add this character to the current string
+    if (is_space) {
+      // never add spaces
+    }
+    // ------------------------------
+    else if (is_punctuation) {
+      assert (token == "");
+      assert (last_was_alpha == false);
+      assert (last_was_digit == false);
+      // only add punctuation if it's not being ignored
+      // (punctuation is always a single symbol character per token)
+      if (!ignore_punctuation) {
+        std::map<std::string,nlohmann::json> tmp;
+        tmp["line"]=row;
+        tmp["char"]=col;
+        tmp["type"]="punctuation";
+        tmp["value"]=std::string(1,c);
+        tokens.push_back(tmp);
+      }
+    }
+    // ------------------------------
+    else if (is_digit) {
+      assert (last_was_alpha == false);
+      // only add digits/numbers if they are not being ignored
+      // numbers will be 1 or more digits 0-9.
+      // We break tokens between letters & numbers.
+      if (!ignore_numbers) {
+        if (token=="") {
+          start_row = row;
+          start_col = col;
+          last_was_digit=true;
+        } else {
+          assert (last_was_digit == true);
+        }
+        token.push_back(c);
+      }
+    }
+    // ------------------------------
+    else if (is_alpha) {
+      assert (last_was_digit == false);
+      // string tokens will be 1 or more letters a-z or A-Z
+      if (token=="") {
+        start_row = row;
+        start_col = col;
+        last_was_alpha=true;
+      } else {
+        assert (last_was_alpha == true);
+      }
+      // option to lowercase
+      if (to_lower) {
+        c = static_cast<char>(std::tolower(uc));
+      }
+      token.push_back(c);
+    }
+
+    // ------------------------------
+    if (c == '\n') {
+      // advance to the next row/line
+      assert (token=="");
+      if (!ignore_newlines) {
+        // output a token for the newline
+        std::map<std::string,nlohmann::json> tmp;
+        tmp["line"]=row;
+        tmp["char"]=col;
+        tmp["type"]="newline";
+        tmp["value"]="\n";
+        tokens.push_back(tmp);
+      }
+      row++;
+      col=1;
+    } else {
+      // advance to the next column/character
+      col++;
+    }
+  }
+
+  // ------------------------------
+  if (token != "") {
+    // save the last token (if there was no space or newline at the end of the file)
+    std::map<std::string,nlohmann::json> tmp;
+    tmp["line"]=start_row;
+    tmp["char"]=start_col;
+    if (last_was_digit) {
+      assert (!last_was_alpha);
+      tmp["type"]="number";
+      try { tmp["value"]=std::stoi(token); } catch (...) { tmp["value"]=token; }  // std::stoi throws when the digits overflow int; keep them as text
+    } else {
+      assert (last_was_alpha);
+      tmp["type"]="string";
+      tmp["value"]=token;
+    }
+    tokens.push_back(tmp);
+  }
+
+  // ------------------------------
+  // export/save in json format (NOTE(review): dump() appears to throw on a string that is not valid UTF-8, e.g. a lone non-ASCII byte emitted as punctuation; confirm)
+  std::cout << tokens.dump(4) << std::endl;
+}

From 3fd9ae01f3273b1a806b3fe208d42574261d4a85 Mon Sep 17 00:00:00 2001
From: Barb Cutler <bmcutler@gmail.com>
Date: Mon, 4 Jun 2018 14:42:48 -0400
Subject: [PATCH 2/2] duplicate line preventing number values

---
 tokenizer/plaintext/plaintext_tokenizer.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp
index ba54107..8fd793d 100644
--- a/tokenizer/plaintext/plaintext_tokenizer.cpp
+++ b/tokenizer/plaintext/plaintext_tokenizer.cpp
@@ -74,7 +74,6 @@ int main(int argc, char* argv[]) {
           tmp["type"]="string";
           tmp["value"]=token;
         }
-        tmp["value"]=token;
         tokens.push_back(tmp);
         token="";
         last_was_alpha = false;