From 356bdf81ecd72302ef8b06d559895a337d9d688f Mon Sep 17 00:00:00 2001 From: Robert Meyer Date: Sat, 10 Mar 2018 07:01:50 +0100 Subject: [PATCH 1/3] Added new readability scores and tests --- CHANGELOG.md | 4 ++++ tests/filters/stylemeasures_test.py | 15 +++++++++++++++ trufflepig/filters/stylemeasures.py | 15 +++++++++++++++ trufflepig/model.py | 2 ++ trufflepig/preprocessing.py | 13 +++++++++++-- 5 files changed, 47 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 739bbf8..8873039 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +*0.6.0a* - 2018-03-08 + +* Two new readability scores + *0.5.0a* - 2018-03-07 * Fixed tfids error diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py index 0d3b199..8005415 100644 --- a/tests/filters/stylemeasures_test.py +++ b/tests/filters/stylemeasures_test.py @@ -66,6 +66,11 @@ def test_count_pronouns(): assert result == 2 +def test_count_characters(): + result = tpsm.count_characters(['hi', 'mark']) + assert result == 6 + + def test_count_letters(): result = tpsm.count_letters('hallo54 my namE!!') assert result == 11 @@ -93,6 +98,16 @@ def test_flesch_kincaid_index(): assert result == 77.90500000000002 +def test_automated_readability_index(): + result = tpsm.automated_readability_index(1000, 100, 10) + assert result == 30.67 + + +def test_coleman_liau_index(): + result = tpsm.coleman_liau_index(1000, 100, 10) + assert result == 40.03999999999999 + + def test_adverb_estimate(): result = tpsm.adverb_estimate(['i', 'am', 'heavily', 'in', 'use']) assert result == 1 diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py index 0688b06..33b92b2 100644 --- a/trufflepig/filters/stylemeasures.py +++ b/trufflepig/filters/stylemeasures.py @@ -126,6 +126,10 @@ def compute_average_puncitation(text_list): return np.mean(punctuation) +def count_characters(tokens): + return sum(len(x) for x in tokens) + + def count_connectors(tokens): return sum(1 for x in tokens if x in CONNECTORS) @@ -149,9 +153,20 @@ def flesch_kincaid_index(num_syllables, num_words, num_sentences): def smog_index(num_complex_words, num_sentences): + """https://en.wikipedia.org/wiki/SMOG""" return 1.0430 * np.sqrt(num_complex_words * 30 / num_sentences) + 3.1291 +def automated_readability_index(num_chars, num_words, num_sentences): + """https://en.wikipedia.org/wiki/Automated_readability_index""" + return 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43 + + +def coleman_liau_index(num_chars, num_words, num_sentences): + """https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index""" + return 0.0588 * 100 * num_chars / num_words - 0.296 * 100 * num_sentences / num_words - 15.8 + + def adverb_estimate(tokens): return sum([1 for x in tokens if x.endswith('ly')]) diff --git a/trufflepig/model.py b/trufflepig/model.py index 00e4424..0747c8a 100644 --- a/trufflepig/model.py +++ b/trufflepig/model.py @@ -48,6 +48,8 @@ 'gunning_fog_index', 'flesch_kincaid_index', 'smog_index', + 'automated_readability_index', + 'coleman_liau_index', 'average_syllables', 'syllable_variance', 'syllable_skew', diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py index eaa5302..14a4bdb 100644 --- a/trufflepig/preprocessing.py +++ b/trufflepig/preprocessing.py @@ -337,8 +337,9 @@ def preprocess(post_df, ncores=4, chunksize=500, post_df['unique_words'] = post_df.tokens.apply(lambda x: len(set(x))) post_df['unique_ratio'] = post_df.unique_words / post_df.num_words - logger.info('Computing characters per word') - post_df['chars_per_word'] = post_df.body_length / post_df.num_words + logger.info('Computing characters and characters per word') + post_df['num_chars'] = post_df.tokens.apply(lambda x: tfsm.count_characters(x)) + post_df['chars_per_word'] = post_df.num_chars / post_df.num_words logger.info('Counting connectors') post_df['num_connectors'] = post_df.tokens.apply(lambda x: tfsm.count_connectors(x)) @@ -365,6 +366,8 @@ def preprocess(post_df, ncores=4, chunksize=500, post_df['syllable_variance'] = post_df.token_syllables.apply(lambda x: np.var(x)) post_df['syllable_skew'] = post_df.token_syllables.apply(lambda x: spst.skew(x)) post_df['syllable_kurtosis'] = post_df.token_syllables.apply(lambda x: spst.kurtosis(x)) + + logger.info('Computing readability indices') post_df['gunning_fog_index'] = tfsm.gunning_fog_index(num_words=post_df.num_words, num_complex_words=post_df.num_complex_words, num_sentences=post_df.num_sentences) @@ -373,6 +376,12 @@ def preprocess(post_df, ncores=4, chunksize=500, num_sentences=post_df.num_sentences) post_df['smog_index']= tfsm.smog_index(num_complex_words=post_df.num_complex_words, num_sentences=post_df.num_sentences) + post_df['automated_readability_index'] = tfsm.automated_readability_index(num_chars=post_df.num_chars, + num_words=post_df.num_words, + num_sentences=post_df.num_sentences) + post_df['coleman_liau_index'] = tfsm.coleman_liau_index(num_chars=post_df.num_chars, + num_words=post_df.num_words, + num_sentences=post_df.num_sentences) post_df.dropna(inplace=True) logger.info('Final data set has {} shape'.format(post_df.shape)) From bc28a89a44f38fcf6e1537edc7932df33995467b Mon Sep 17 00:00:00 2001 From: Robert Meyer Date: Sat, 10 Mar 2018 07:21:15 +0100 Subject: [PATCH 2/3] New linter errors --- CHANGELOG.md | 3 ++- requirements.txt | 1 + tests/filters/stylemeasures_test.py | 6 ++++++ trufflepig/filters/stylemeasures.py | 6 ++++++ trufflepig/model.py | 3 ++- trufflepig/preprocessing.py | 7 +++++++ 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8873039..76ee0e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -*0.6.0a* - 2018-03-08 +*0.6.0a* - 2018-03-10 * Two new readability scores +* New lint errors *0.5.0a* - 2018-03-07 diff --git a/requirements.txt b/requirements.txt index 067d95d..b0f24d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,6 @@ scikit-learn==0.19.1 langdetect==1.0.7 pyenchant==2.0.0 pyphen==0.9.4 +proselint==0.8.0 language-check==1.1 steem==0.18.103 \ No newline at end of file diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py index 8005415..94ef2ea 100644 --- a/tests/filters/stylemeasures_test.py +++ b/tests/filters/stylemeasures_test.py @@ -113,6 +113,12 @@ def test_adverb_estimate(): assert result == 1 +def test_lint_errors(): + text = 'She works hard for the money. So hard! John is very unique.' + result = tpsm.lint_errors(text) + assert result == 2 + + def test_grammar_errors(): counter = tpsm.GrammarErrorCounter() sentences = 'She earn moneyt. I did nothing wrogn. He go in Colorado.' diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py index 33b92b2..2d7ad52 100644 --- a/trufflepig/filters/stylemeasures.py +++ b/trufflepig/filters/stylemeasures.py @@ -6,6 +6,7 @@ import langdetect import language_check import pyphen +import proselint from enchant.checker import SpellChecker CAPS = "([A-Z])" @@ -171,6 +172,11 @@ def adverb_estimate(tokens): return sum([1 for x in tokens if x.endswith('ly')]) +def lint_errors(text): + errors = proselint.tools.lint(text) + return len(errors) + + class SyllableConverter(object): def __init__(self, language='en'): self.dic = pyphen.Pyphen(lang=language) diff --git a/trufflepig/model.py b/trufflepig/model.py index 0747c8a..1d3198f 100644 --- a/trufflepig/model.py +++ b/trufflepig/model.py @@ -54,7 +54,8 @@ 'syllable_variance', 'syllable_skew', 'syllable_kurtosis', - 'adverbs_per_sentence'] + 'adverbs_per_sentence', + 'lint_errors_per_sentence'] # output variables for regressor TARGETS = ['reward', 'votes'] diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py index 14a4bdb..99e5b56 100644 --- a/trufflepig/preprocessing.py +++ b/trufflepig/preprocessing.py @@ -383,6 +383,13 @@ def preprocess(post_df, ncores=4, chunksize=500, num_words=post_df.num_words, num_sentences=post_df.num_sentences) + logger.info('Counting proselint errors') + post_df['lint_errors'] = apply_parallel(tfsm.lint_errors, + post_df.filtered_body, + ncores=ncores, + chunksize=chunksize) + post_df['lint_errors_per_sentence'] = post_df.lint_errors / post_df.num_sentences + post_df.dropna(inplace=True) logger.info('Final data set has {} shape'.format(post_df.shape)) From 365004f7b3d0eef96131f99b3c7c938a66257dec Mon Sep 17 00:00:00 2001 From: Robert Meyer Date: Sat, 10 Mar 2018 07:51:37 +0100 Subject: [PATCH 3/3] Removed linting again due to errors --- CHANGELOG.md | 1 - requirements.txt | 1 - tests/filters/stylemeasures_test.py | 6 ------ trufflepig/filters/stylemeasures.py | 6 ------ trufflepig/model.py | 3 +-- trufflepig/preprocessing.py | 7 ------- 6 files changed, 1 insertion(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76ee0e1..06e1431 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,6 @@ *0.6.0a* - 2018-03-10 * Two new readability scores -* New lint errors *0.5.0a* - 2018-03-07 diff --git a/requirements.txt b/requirements.txt index b0f24d1..067d95d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,5 @@ scikit-learn==0.19.1 langdetect==1.0.7 pyenchant==2.0.0 pyphen==0.9.4 -proselint==0.8.0 language-check==1.1 steem==0.18.103 \ No newline at end of file diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py index 94ef2ea..8005415 100644 --- a/tests/filters/stylemeasures_test.py +++ b/tests/filters/stylemeasures_test.py @@ -113,12 +113,6 @@ def test_adverb_estimate(): assert result == 1 -def test_lint_errors(): - text = 'She works hard for the money. So hard! John is very unique.' - result = tpsm.lint_errors(text) - assert result == 2 - - def test_grammar_errors(): counter = tpsm.GrammarErrorCounter() sentences = 'She earn moneyt. I did nothing wrogn. He go in Colorado.' diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py index 2d7ad52..33b92b2 100644 --- a/trufflepig/filters/stylemeasures.py +++ b/trufflepig/filters/stylemeasures.py @@ -6,7 +6,6 @@ import langdetect import language_check import pyphen -import proselint from enchant.checker import SpellChecker CAPS = "([A-Z])" @@ -172,11 +171,6 @@ def adverb_estimate(tokens): return sum([1 for x in tokens if x.endswith('ly')]) -def lint_errors(text): - errors = proselint.tools.lint(text) - return len(errors) - - class SyllableConverter(object): def __init__(self, language='en'): self.dic = pyphen.Pyphen(lang=language) diff --git a/trufflepig/model.py b/trufflepig/model.py index 1d3198f..0747c8a 100644 --- a/trufflepig/model.py +++ b/trufflepig/model.py @@ -54,8 +54,7 @@ 'syllable_variance', 'syllable_skew', 'syllable_kurtosis', - 'adverbs_per_sentence', - 'lint_errors_per_sentence'] + 'adverbs_per_sentence'] # output variables for regressor TARGETS = ['reward', 'votes'] diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py index 99e5b56..14a4bdb 100644 --- a/trufflepig/preprocessing.py +++ b/trufflepig/preprocessing.py @@ -383,13 +383,6 @@ def preprocess(post_df, ncores=4, chunksize=500, num_words=post_df.num_words, num_sentences=post_df.num_sentences) - logger.info('Counting proselint errors') - post_df['lint_errors'] = apply_parallel(tfsm.lint_errors, - post_df.filtered_body, - ncores=ncores, - chunksize=chunksize) - post_df['lint_errors_per_sentence'] = post_df.lint_errors / post_df.num_sentences - post_df.dropna(inplace=True) logger.info('Final data set has {} shape'.format(post_df.shape))