Skip to content

Commit

Permalink
unit test tools library
Browse files Browse the repository at this point in the history
  • Loading branch information
Notgnoshi committed Mar 18, 2019
1 parent 21915a9 commit 497eda1
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 0 deletions.
Empty file added tools/tests/__init__.py
Empty file.
45 changes: 45 additions & 0 deletions tools/tests/test_data.py
@@ -0,0 +1,45 @@
import string
import unittest

from tools.utils.data import preprocess


class TestPreprocess(unittest.TestCase):
    """Expectations for the text normalisation performed by ``preprocess``."""

    def test_casing(self):
        """ASCII letters are expected to come back lowercased."""
        lowercase = string.ascii_lowercase
        self.assertEqual(preprocess(lowercase), lowercase)
        self.assertEqual(preprocess(string.ascii_uppercase), lowercase)
        mixed = string.ascii_letters
        self.assertEqual(preprocess(mixed), mixed.lower())

    def test_spaces(self):
        """Non-breaking spaces are expected to turn into plain ASCII spaces."""
        # 0x20 is ascii space.
        expected = "on\x20the\x20playground"
        self.assertEqual(preprocess("on\xa0the\xa0playground"), expected)
        # Text that only contains ordinary spaces should pass through untouched.
        plain = "one lip claps tongue slipping"
        self.assertEqual(preprocess(plain), plain)

    def test_dashes(self):
        """Hyphens and en dashes are expected to be replaced by a space."""
        cases = (
            # NOTE: t-shirt *should* get parsed as tshirt.
            ("t-shirts out in force", "t shirts out in force"),
            ("fresh-ground french-pressed coffee", "fresh ground french pressed coffee"),
            ("bright clouds bleed a war–red", "bright clouds bleed a war red"),
        )
        for raw, expected in cases:
            self.assertEqual(preprocess(raw), expected)

    def test_nonascii(self):
        """Leading non-ASCII (Cyrillic) words are expected to be dropped."""
        mixed_script = "это сущий разврат this is a veritable debauch"
        self.assertEqual(preprocess(mixed_script), "this is a veritable debauch")

    def test_digits(self):
        """Digits are expected to survive unchanged."""
        for text in (string.digits, "abcd1234"):
            self.assertEqual(preprocess(text), text)

    def test_slashes(self):
        """A forward slash between words is expected to be kept."""
        text = "one/two"
        self.assertEqual(preprocess(text), text)

    def test_quotes(self):
        """Of a single quote, a backslash and a double quote, only the single quote remains."""
        quote_chars = r"'\""
        self.assertEqual(preprocess(quote_chars), "'")

    def test_punctuation(self):
        """The full ASCII punctuation set is expected to reduce to ``"' /"``."""
        reduced = preprocess(string.punctuation)
        self.assertEqual(reduced, "' /")
49 changes: 49 additions & 0 deletions tools/tests/test_nlp.py
@@ -0,0 +1,49 @@
import unittest

from nltk.corpus import stopwords as nltk_stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stop_words
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words

from tools.utils.nlp import STOPWORDS, lemmatize, remove_stopwords


class NlpTest(unittest.TestCase):
    """Expectations for ``STOPWORDS``, ``remove_stopwords`` and ``lemmatize``."""

    def test_stopwords(self):
        """Every word the NLTK, scikit-learn and spaCy lists disagree on must be in STOPWORDS."""
        sources = (
            set(nltk_stop_words.words("english")),
            set(sklearn_stop_words),
            set(spacy_stop_words),
        )
        # Union of all pairwise differences: a word present in one list but
        # missing from another. Pairing a set with itself contributes nothing.
        disputed = {word for left in sources for right in sources for word in left - right}
        for word in disputed:
            self.assertIn(word, STOPWORDS)

    def test_stopword_removal(self):
        """Known stopwords are members of STOPWORDS and get stripped from a phrase."""
        for word in ("i", "am", "a", "but", "you", "are", "the"):
            self.assertIn(word, STOPWORDS)

        filtered = remove_stopwords("i am a heathen but you are the worst")
        self.assertEqual(filtered, "heathen worst")

    def test_lemmatization(self):
        """Each sample line is expected to lemmatize to the paired string."""
        cases = (
            ("she's my friend", "-PRON- be -PRON- friend"),
            ("i'd rather not meet", "-PRON- would rather not meet"),
            ("reading the stock futures", "read the stock future"),
            ("the leaves still clinging", "the leaf still cling"),
            ("beehives beneath kiwifruit vines", "beehive beneath kiwifruit vine"),
            # TODO: Why does "i'd" lemmatize to "-PRON- would" but "i" lemmatize to "i"?
            (
                "warm winter rain/the beach and i/collecting sea glass",
                "warm winter rain / the beach and i / collect sea glass",
            ),
        )
        for line, expected in cases:
            self.assertEqual(lemmatize(line), expected)

0 comments on commit 497eda1

Please sign in to comment.