diff --git a/tools/tests/__init__.py b/tools/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/tests/test_data.py b/tools/tests/test_data.py
new file mode 100644
index 0000000..ef2a289
--- /dev/null
+++ b/tools/tests/test_data.py
@@ -0,0 +1,63 @@
+"""Unit tests for ``tools.utils.data.preprocess``."""
+
+import string
+import unittest
+
+from tools.utils.data import preprocess
+
+
+class TestPreprocess(unittest.TestCase):
+    """Checks the strings ``preprocess`` returns for different kinds of input."""
+
+    def test_casing(self):
+        # Lowercase input is expected back unchanged; uppercase and mixed-case
+        # input is expected back lowercased.
+        lowercase = string.ascii_lowercase
+        for raw in (lowercase, string.ascii_uppercase):
+            self.assertEqual(preprocess(raw), lowercase)
+        mixed = string.ascii_letters
+        self.assertEqual(preprocess(mixed), mixed.lower())
+
+    def test_spaces(self):
+        # Non-breaking spaces (0xa0) are expected back as ASCII spaces (0x20).
+        with_nbsp = "on\xa0the\xa0playground"
+        self.assertEqual(preprocess(with_nbsp), "on\x20the\x20playground")
+        # A run of several spaces is expected back as a single space.
+        multi_spaced = "one lip claps   tongue slipping"
+        self.assertEqual(preprocess(multi_spaced), "one lip claps tongue slipping")
+
+    def test_dashes(self):
+        # Hyphens and the en dash are expected to come back as spaces.
+        # NOTE: t-shirt *should* get parsed as tshirt; the first case expects "t shirts".
+        cases = (
+            ("t-shirts out in force", "t shirts out in force"),
+            ("fresh-ground french-pressed coffee", "fresh ground french pressed coffee"),
+            ("bright clouds bleed a war–red", "bright clouds bleed a war red"),
+        )
+        for raw, expected in cases:
+            self.assertEqual(preprocess(raw), expected)
+
+    def test_nonascii(self):
+        # The Cyrillic words are expected to be dropped, leaving the ASCII text.
+        mixed_script = "это сущий разврат this is a veritable debauch"
+        ascii_only = "this is a veritable debauch"
+        self.assertEqual(preprocess(mixed_script), ascii_only)
+
+    def test_digits(self):
+        # Digits are expected back unchanged, alone or following letters.
+        for raw in (string.digits, "abcd1234"):
+            self.assertEqual(preprocess(raw), raw)
+
+    def test_slashes(self):
+        # A forward slash between words is expected to be kept.
+        slashed = "one/two"
+        self.assertEqual(preprocess(slashed), slashed)
+
+    def test_quotes(self):
+        # The raw literal holds three characters: ', \ and ". Only ' is expected back.
+        quote_chars = r"'\""
+        self.assertEqual(preprocess(quote_chars), "'")
+
+    def test_punctuation(self):
+        # For all ASCII punctuation the expected result is "'", a space, then "/".
+        self.assertEqual(preprocess(string.punctuation), "' /")
diff --git a/tools/tests/test_nlp.py b/tools/tests/test_nlp.py
new file mode 100644
index 0000000..be67451
--- /dev/null
+++ b/tools/tests/test_nlp.py
@@ -0,0 +1,57 @@
+"""Unit tests for ``tools.utils.nlp``."""
+
+import unittest
+
+from nltk.corpus import stopwords as nltk_stop_words
+# ``sklearn.feature_extraction.stop_words`` was deprecated in scikit-learn 0.22 and
+# removed in 0.24; ``sklearn.feature_extraction.text`` exports the same constant.
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
+from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words
+
+from tools.utils.nlp import STOPWORDS, lemmatize, remove_stopwords
+
+
+class NlpTest(unittest.TestCase):
+    """Tests for ``STOPWORDS``, ``remove_stopwords`` and ``lemmatize``."""
+
+    def test_stopwords(self):
+        # Any word that some of the three libraries list and others omit is
+        # expected to be present in the project's STOPWORDS.
+        library_sets = [
+            set(nltk_stop_words.words("english")),
+            set(sklearn_stop_words),
+            set(spacy_stop_words),
+        ]
+        # "In at least one set but not in all" equals the union of every
+        # pairwise set difference.
+        differences = set.union(*library_sets) - set.intersection(*library_sets)
+        # Sorted so a failure always reports the same first missing word.
+        for word in sorted(differences):
+            self.assertIn(word, STOPWORDS)
+
+    def test_stopword_removal(self):
+        # Confirm each word is in STOPWORDS first; presumably remove_stopwords
+        # relies on that list (it is defined outside this file).
+        stopwords = ["i", "am", "a", "but", "you", "are", "the"]
+        for stopword in stopwords:
+            self.assertIn(stopword, STOPWORDS)
+
+        phrase = "i am a heathen but you are the worst"
+        self.assertEqual(remove_stopwords(phrase), "heathen worst")
+
+    def test_lemmatization(self):
+        # Each pair is (input line, expected output of lemmatize).
+        cases = [
+            ("she's my friend", "-PRON- be -PRON- friend"),
+            ("i'd rather not meet", "-PRON- would rather not meet"),
+            ("reading the stock futures", "read the stock future"),
+            ("the leaves still clinging", "the leaf still cling"),
+            ("beehives beneath kiwifruit vines", "beehive beneath kiwifruit vine"),
+            # TODO: Why does "i'd" lemmatize to "-PRON- would" but "i" lemmatize to "i"?
+            (
+                "warm winter rain/the beach and i/collecting sea glass",
+                "warm winter rain / the beach and i / collect sea glass",
+            ),
+        ]
+        for line, expected in cases:
+            self.assertEqual(lemmatize(line), expected)