diff --git a/tools/tests/__init__.py b/tools/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/tests/test_data.py b/tools/tests/test_data.py
new file mode 100644
index 0000000..ef2a289
--- /dev/null
+++ b/tools/tests/test_data.py
@@ -0,0 +1,67 @@
+import string
+import unittest
+
+from tools.utils.data import preprocess
+
+
+class TestPreprocess(unittest.TestCase):
+    """Pin down the text normalisation that ``preprocess`` is expected to perform."""
+
+    def _check(self, cases):
+        """Assert ``preprocess(raw) == expected`` for every ``(raw, expected)`` pair."""
+        for raw, expected in cases:
+            # subTest reports each failing input instead of stopping at the first.
+            with self.subTest(raw=raw):
+                self.assertEqual(preprocess(raw), expected)
+
+    def test_casing(self):
+        """ASCII letters are expected back in lowercase."""
+        self._check(
+            [
+                (string.ascii_lowercase, string.ascii_lowercase),
+                (string.ascii_uppercase, string.ascii_lowercase),
+                (string.ascii_letters, string.ascii_letters.lower()),
+            ]
+        )
+
+    def test_spaces(self):
+        """Non-breaking spaces are expected to become plain ASCII spaces."""
+        self._check(
+            [
+                # 0x20 is ascii space.
+                ("on\xa0the\xa0playground", "on\x20the\x20playground"),
+                ("one lip claps tongue slipping", "one lip claps tongue slipping"),
+            ]
+        )
+
+    def test_dashes(self):
+        """Hyphens and dashes are expected to be replaced by a space."""
+        # NOTE: t-shirt *should* get parsed as tshirt.
+        self._check(
+            [
+                ("t-shirts out in force", "t shirts out in force"),
+                ("fresh-ground french-pressed coffee", "fresh ground french pressed coffee"),
+                ("bright clouds bleed a war–red", "bright clouds bleed a war red"),
+            ]
+        )
+
+    def test_nonascii(self):
+        """Cyrillic words are expected to be dropped, leaving the ASCII text."""
+        russian = "это сущий разврат this is a veritable debauch"
+        self.assertEqual(preprocess(russian), "this is a veritable debauch")
+
+    def test_digits(self):
+        """Digits are expected to pass through unchanged."""
+        self._check([(string.digits, string.digits), ("abcd1234", "abcd1234")])
+
+    def test_slashes(self):
+        """A slash between words is expected to be kept."""
+        self.assertEqual(preprocess("one/two"), "one/two")
+
+    def test_quotes(self):
+        """Of apostrophe, backslash and double quote, only the apostrophe is expected back."""
+        self.assertEqual(preprocess(r"'\""), "'")
+
+    def test_punctuation(self):
+        """Only an apostrophe, a space and a slash are expected back from ``string.punctuation``."""
+        self.assertEqual(preprocess(string.punctuation), "' /")
diff --git a/tools/tests/test_nlp.py b/tools/tests/test_nlp.py
new file mode 100644
index 0000000..be67451
--- /dev/null
+++ b/tools/tests/test_nlp.py
@@ -0,0 +1,52 @@
+import unittest
+
+from nltk.corpus import stopwords as nltk_stop_words
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
+from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words
+
+from tools.utils.nlp import STOPWORDS, lemmatize, remove_stopwords
+
+
+class NlpTest(unittest.TestCase):
+    """Tests for ``STOPWORDS``, ``remove_stopwords`` and ``lemmatize``."""
+
+    def test_stopwords(self):
+        """Every word the nltk, sklearn and spacy lists disagree on is in STOPWORDS."""
+        sources = [
+            set(nltk_stop_words.words("english")),
+            set(sklearn_stop_words),
+            set(spacy_stop_words),
+        ]
+        # A word lands in some pairwise difference exactly when it is in at
+        # least one list but not in all three.
+        disputed = set.union(*sources) - set.intersection(*sources)
+        for word in sorted(disputed):
+            with self.subTest(word=word):
+                self.assertIn(word, STOPWORDS)
+
+    def test_stopword_removal(self):
+        """``remove_stopwords`` is expected to keep only the non-stopwords."""
+        phrase = "i am a heathen but you are the worst"
+        # Guard the premise first: each word dropped below must be a stopword.
+        for stopword in ("i", "am", "a", "but", "you", "are", "the"):
+            with self.subTest(stopword=stopword):
+                self.assertIn(stopword, STOPWORDS)
+        self.assertEqual(remove_stopwords(phrase), "heathen worst")
+
+    def test_lemmatization(self):
+        """``lemmatize`` output is pinned for a handful of sample lines."""
+        cases = [
+            ("she's my friend", "-PRON- be -PRON- friend"),
+            ("i'd rather not meet", "-PRON- would rather not meet"),
+            ("reading the stock futures", "read the stock future"),
+            ("the leaves still clinging", "the leaf still cling"),
+            ("beehives beneath kiwifruit vines", "beehive beneath kiwifruit vine"),
+            # TODO: Why does "i'd" lemmatize to "-PRON- would" but "i" lemmatize to "i"?
+            (
+                "warm winter rain/the beach and i/collecting sea glass",
+                "warm winter rain / the beach and i / collect sea glass",
+            ),
+        ]
+        for line, expected in cases:
+            with self.subTest(line=line):
+                self.assertEqual(lemmatize(line), expected)