Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
94 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import string | ||
import unittest | ||
|
||
from tools.utils.data import preprocess | ||
|
||
|
||
class TestPreprocess(unittest.TestCase):
    """Expectations for the string clean-up performed by ``preprocess``."""

    def test_casing(self):
        # Lowercase input is returned as-is; uppercase and mixed-case input
        # must come back fully lowercased.
        lowercase = string.ascii_lowercase
        mixed_case = string.ascii_letters
        self.assertEqual(preprocess(lowercase), lowercase)
        self.assertEqual(preprocess(string.ascii_uppercase), lowercase)
        self.assertEqual(preprocess(mixed_case), mixed_case.lower())

    def test_spaces(self):
        # \xa0 is a non-breaking space; \x20 is the plain ASCII space.
        with_nbsp = "on\xa0the\xa0playground"
        with_ascii_space = "on\x20the\x20playground"
        self.assertEqual(preprocess(with_nbsp), with_ascii_space)

        # A phrase that only contains ordinary spaces is expected back unchanged.
        phrase = "one lip claps tongue slipping"
        self.assertEqual(preprocess(phrase), phrase)

    def test_dashes(self):
        # Each (input, expected) pair has its dash replaced by a space.
        # NOTE: t-shirt *should* get parsed as tshirt.
        expectations = (
            ("t-shirts out in force", "t shirts out in force"),
            (
                "fresh-ground french-pressed coffee",
                "fresh ground french pressed coffee",
            ),
            # This one uses an en dash rather than an ASCII hyphen.
            ("bright clouds bleed a war–red", "bright clouds bleed a war red"),
        )
        for raw, cleaned in expectations:
            self.assertEqual(preprocess(raw), cleaned)

    def test_nonascii(self):
        # The leading Cyrillic words are expected to disappear entirely,
        # leaving only the English part of the line.
        bilingual = "это сущий разврат this is a veritable debauch"
        english_only = "this is a veritable debauch"
        self.assertEqual(preprocess(bilingual), english_only)

    def test_digits(self):
        # Digits pass through untouched, alone or next to letters.
        for text in (string.digits, "abcd1234"):
            self.assertEqual(preprocess(text), text)

    def test_slashes(self):
        # A forward slash is expected to be kept.
        slashed = "one/two"
        self.assertEqual(preprocess(slashed), slashed)

    def test_quotes(self):
        # Because the literal is a raw string it holds three characters: an
        # apostrophe, a backslash and a double quote. Only the apostrophe is
        # expected in the result.
        quote_chars = r"'\""
        self.assertEqual(preprocess(quote_chars), "'")

    def test_punctuation(self):
        # Expected result for the full ASCII punctuation set: an apostrophe,
        # a space and a forward slash.
        self.assertEqual(preprocess(string.punctuation), "' /")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import unittest | ||
|
||
from nltk.corpus import stopwords as nltk_stop_words | ||
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stop_words | ||
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words | ||
|
||
from tools.utils.nlp import STOPWORDS, lemmatize, remove_stopwords | ||
|
||
|
||
class NlpTest(unittest.TestCase):
    """Expectations for ``STOPWORDS``, ``remove_stopwords`` and ``lemmatize``."""

    def test_stopwords(self):
        # Collect every word that appears in one library's English stop-word
        # list but is missing from another's, then require each such word to
        # be present in STOPWORDS.
        vocabularies = (
            set(nltk_stop_words.words("english")),
            set(sklearn_stop_words),
            set(spacy_stop_words),
        )

        disputed = set()
        for candidate in vocabularies:
            for other in vocabularies:
                if candidate is not other:
                    disputed |= candidate - other

        for word in disputed:
            self.assertIn(word, STOPWORDS)

    def test_stopword_removal(self):
        # First make sure each filler word used below really is a stop word...
        for filler in ("i", "am", "a", "but", "you", "are", "the"):
            self.assertIn(filler, STOPWORDS)

        # ...then check that only the remaining words survive removal.
        sentence = "i am a heathen but you are the worst"
        self.assertEqual(remove_stopwords(sentence), "heathen worst")

    def test_lemmatization(self):
        # (input line, expected lemmatized line), checked in this order.
        cases = (
            ("she's my friend", "-PRON- be -PRON- friend"),
            ("i'd rather not meet", "-PRON- would rather not meet"),
            ("reading the stock futures", "read the stock future"),
            ("the leaves still clinging", "the leaf still cling"),
            ("beehives beneath kiwifruit vines", "beehive beneath kiwifruit vine"),
            # TODO: Why does "i'd" lemmatize to "-PRON- would" but "i" lemmatize to "i"?
            (
                "warm winter rain/the beach and i/collecting sea glass",
                "warm winter rain / the beach and i / collect sea glass",
            ),
        )
        for line, lemmas in cases:
            self.assertEqual(lemmatize(line), lemmas)