From 356bdf81ecd72302ef8b06d559895a337d9d688f Mon Sep 17 00:00:00 2001
From: Robert Meyer <robert.meyer@flixbus.com>
Date: Sat, 10 Mar 2018 07:01:50 +0100
Subject: [PATCH 1/3] Added new readability scores and tests

---
 CHANGELOG.md                        |  4 ++++
 tests/filters/stylemeasures_test.py | 15 +++++++++++++++
 trufflepig/filters/stylemeasures.py | 15 +++++++++++++++
 trufflepig/model.py                 |  2 ++
 trufflepig/preprocessing.py         | 13 +++++++++++--
 5 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 739bbf8..8873039 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+*0.6.0a* - 2018-03-08
+
+* Two new readability scores
+
 *0.5.0a* - 2018-03-07
 
 * Fixed tfids error
diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py
index 0d3b199..8005415 100644
--- a/tests/filters/stylemeasures_test.py
+++ b/tests/filters/stylemeasures_test.py
@@ -66,6 +66,11 @@ def test_count_pronouns():
     assert result == 2
 
 
+def test_count_characters():
+    result = tpsm.count_characters(['hi', 'mark'])
+    assert result == 6
+
+
 def test_count_letters():
     result = tpsm.count_letters('hallo54 my namE!!')
     assert result == 11
@@ -93,6 +98,16 @@ def test_flesch_kincaid_index():
     assert result == 77.90500000000002
 
 
+def test_automated_readability_index():
+    result = tpsm.automated_readability_index(1000, 100, 10)
+    assert result == 30.67
+
+
+def test_coleman_liau_index():
+    result = tpsm.coleman_liau_index(1000, 100, 10)
+    assert result == 40.03999999999999
+
+
 def test_adverb_estimate():
     result = tpsm.adverb_estimate(['i', 'am', 'heavily', 'in', 'use'])
     assert result == 1
diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py
index 0688b06..33b92b2 100644
--- a/trufflepig/filters/stylemeasures.py
+++ b/trufflepig/filters/stylemeasures.py
@@ -126,6 +126,10 @@ def compute_average_puncitation(text_list):
     return np.mean(punctuation)
 
 
+def count_characters(tokens):
+    return sum(len(x) for x in tokens)
+
+
 def count_connectors(tokens):
     return sum(1 for x in tokens if x in CONNECTORS)
 
@@ -149,9 +153,20 @@ def flesch_kincaid_index(num_syllables, num_words, num_sentences):
 
 
 def smog_index(num_complex_words, num_sentences):
+    """https://en.wikipedia.org/wiki/SMOG"""
     return 1.0430 * np.sqrt(num_complex_words * 30 / num_sentences) + 3.1291
 
 
+def automated_readability_index(num_chars, num_words, num_sentences):
+    """https://en.wikipedia.org/wiki/Automated_readability_index"""
+    return 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43
+
+
+def coleman_liau_index(num_chars, num_words, num_sentences):
+    """https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index"""
+    return 0.0588 * 100 * num_chars / num_words - 0.296 * 100 * num_sentences / num_words - 15.8
+
+
 def adverb_estimate(tokens):
     return sum([1 for x in tokens if x.endswith('ly')])
 
diff --git a/trufflepig/model.py b/trufflepig/model.py
index 00e4424..0747c8a 100644
--- a/trufflepig/model.py
+++ b/trufflepig/model.py
@@ -48,6 +48,8 @@
             'gunning_fog_index',
             'flesch_kincaid_index',
             'smog_index',
+            'automated_readability_index',
+            'coleman_liau_index',
             'average_syllables',
             'syllable_variance',
             'syllable_skew',
diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py
index eaa5302..14a4bdb 100644
--- a/trufflepig/preprocessing.py
+++ b/trufflepig/preprocessing.py
@@ -337,8 +337,9 @@ def preprocess(post_df, ncores=4, chunksize=500,
     post_df['unique_words'] = post_df.tokens.apply(lambda x: len(set(x)))
     post_df['unique_ratio'] = post_df.unique_words / post_df.num_words
 
-    logger.info('Computing characters per word')
-    post_df['chars_per_word'] = post_df.body_length / post_df.num_words
+    logger.info('Computing characters and characters per word')
+    post_df['num_chars'] = post_df.tokens.apply(lambda x: tfsm.count_characters(x))
+    post_df['chars_per_word'] = post_df.num_chars / post_df.num_words
 
     logger.info('Counting connectors')
     post_df['num_connectors'] = post_df.tokens.apply(lambda x: tfsm.count_connectors(x))
@@ -365,6 +366,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
     post_df['syllable_variance'] = post_df.token_syllables.apply(lambda x: np.var(x))
     post_df['syllable_skew'] = post_df.token_syllables.apply(lambda x: spst.skew(x))
     post_df['syllable_kurtosis'] = post_df.token_syllables.apply(lambda x: spst.kurtosis(x))
+
+    logger.info('Computing readability indices')
     post_df['gunning_fog_index'] = tfsm.gunning_fog_index(num_words=post_df.num_words,
                                                         num_complex_words=post_df.num_complex_words,
                                                         num_sentences=post_df.num_sentences)
@@ -373,6 +376,12 @@ def preprocess(post_df, ncores=4, chunksize=500,
                                                               num_sentences=post_df.num_sentences)
     post_df['smog_index']= tfsm.smog_index(num_complex_words=post_df.num_complex_words,
                                          num_sentences=post_df.num_sentences)
+    post_df['automated_readability_index'] = tfsm.automated_readability_index(num_chars=post_df.num_chars,
+                                                                        num_words=post_df.num_words,
+                                                                        num_sentences=post_df.num_sentences)
+    post_df['coleman_liau_index'] = tfsm.coleman_liau_index(num_chars=post_df.num_chars,
+                                                            num_words=post_df.num_words,
+                                                            num_sentences=post_df.num_sentences)
 
     post_df.dropna(inplace=True)
     logger.info('Final data set has {} shape'.format(post_df.shape))

From bc28a89a44f38fcf6e1537edc7932df33995467b Mon Sep 17 00:00:00 2001
From: Robert Meyer <robert.meyer@flixbus.com>
Date: Sat, 10 Mar 2018 07:21:15 +0100
Subject: [PATCH 2/3] Added proselint-based lint error count feature

---
 CHANGELOG.md                        | 3 ++-
 requirements.txt                    | 1 +
 tests/filters/stylemeasures_test.py | 6 ++++++
 trufflepig/filters/stylemeasures.py | 6 ++++++
 trufflepig/model.py                 | 3 ++-
 trufflepig/preprocessing.py         | 7 +++++++
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8873039..76ee0e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-*0.6.0a* - 2018-03-08
+*0.6.0a* - 2018-03-10
 
 * Two new readability scores
+* New lint errors
 
 *0.5.0a* - 2018-03-07
 
diff --git a/requirements.txt b/requirements.txt
index 067d95d..b0f24d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,5 +11,6 @@ scikit-learn==0.19.1
 langdetect==1.0.7
 pyenchant==2.0.0
 pyphen==0.9.4
+proselint==0.8.0
 language-check==1.1
 steem==0.18.103
\ No newline at end of file
diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py
index 8005415..94ef2ea 100644
--- a/tests/filters/stylemeasures_test.py
+++ b/tests/filters/stylemeasures_test.py
@@ -113,6 +113,12 @@ def test_adverb_estimate():
     assert result == 1
 
 
+def test_lint_errors():
+    text = 'She works hard for the money. So hard! John is very unique.'
+    result = tpsm.lint_errors(text)
+    assert result == 2
+
+
 def test_grammar_errors():
     counter = tpsm.GrammarErrorCounter()
     sentences = 'She earn moneyt. I did nothing wrogn. He go in Colorado.'
diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py
index 33b92b2..2d7ad52 100644
--- a/trufflepig/filters/stylemeasures.py
+++ b/trufflepig/filters/stylemeasures.py
@@ -6,6 +6,7 @@
 import langdetect
 import language_check
 import pyphen
+import proselint
 from enchant.checker import SpellChecker
 
 CAPS = "([A-Z])"
@@ -171,6 +172,11 @@ def adverb_estimate(tokens):
     return sum([1 for x in tokens if x.endswith('ly')])
 
 
+def lint_errors(text):
+    errors = proselint.tools.lint(text)
+    return len(errors)
+
+
 class SyllableConverter(object):
     def __init__(self, language='en'):
         self.dic = pyphen.Pyphen(lang=language)
diff --git a/trufflepig/model.py b/trufflepig/model.py
index 0747c8a..1d3198f 100644
--- a/trufflepig/model.py
+++ b/trufflepig/model.py
@@ -54,7 +54,8 @@
             'syllable_variance',
             'syllable_skew',
             'syllable_kurtosis',
-            'adverbs_per_sentence']
+            'adverbs_per_sentence',
+            'lint_errors_per_sentence']
 
 # output variables for regressor
 TARGETS = ['reward', 'votes']
diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py
index 14a4bdb..99e5b56 100644
--- a/trufflepig/preprocessing.py
+++ b/trufflepig/preprocessing.py
@@ -383,6 +383,13 @@ def preprocess(post_df, ncores=4, chunksize=500,
                                                             num_words=post_df.num_words,
                                                             num_sentences=post_df.num_sentences)
 
+    logger.info('Counting proselint errors')
+    post_df['lint_errors'] = apply_parallel(tfsm.lint_errors,
+                                              post_df.filtered_body,
+                                              ncores=ncores,
+                                              chunksize=chunksize)
+    post_df['lint_errors_per_sentence'] = post_df.lint_errors / post_df.num_sentences
+
     post_df.dropna(inplace=True)
     logger.info('Final data set has {} shape'.format(post_df.shape))
 

From 365004f7b3d0eef96131f99b3c7c938a66257dec Mon Sep 17 00:00:00 2001
From: Robert Meyer <robert.meyer@flixbus.com>
Date: Sat, 10 Mar 2018 07:51:37 +0100
Subject: [PATCH 3/3] Removed linting again due to errors

---
 CHANGELOG.md                        | 1 -
 requirements.txt                    | 1 -
 tests/filters/stylemeasures_test.py | 6 ------
 trufflepig/filters/stylemeasures.py | 6 ------
 trufflepig/model.py                 | 3 +--
 trufflepig/preprocessing.py         | 7 -------
 6 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76ee0e1..06e1431 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,6 @@
 *0.6.0a* - 2018-03-10
 
 * Two new readability scores
-* New lint errors
 
 *0.5.0a* - 2018-03-07
 
diff --git a/requirements.txt b/requirements.txt
index b0f24d1..067d95d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,5 @@ scikit-learn==0.19.1
 langdetect==1.0.7
 pyenchant==2.0.0
 pyphen==0.9.4
-proselint==0.8.0
 language-check==1.1
 steem==0.18.103
\ No newline at end of file
diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py
index 94ef2ea..8005415 100644
--- a/tests/filters/stylemeasures_test.py
+++ b/tests/filters/stylemeasures_test.py
@@ -113,12 +113,6 @@ def test_adverb_estimate():
     assert result == 1
 
 
-def test_lint_errors():
-    text = 'She works hard for the money. So hard! John is very unique.'
-    result = tpsm.lint_errors(text)
-    assert result == 2
-
-
 def test_grammar_errors():
     counter = tpsm.GrammarErrorCounter()
     sentences = 'She earn moneyt. I did nothing wrogn. He go in Colorado.'
diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py
index 2d7ad52..33b92b2 100644
--- a/trufflepig/filters/stylemeasures.py
+++ b/trufflepig/filters/stylemeasures.py
@@ -6,7 +6,6 @@
 import langdetect
 import language_check
 import pyphen
-import proselint
 from enchant.checker import SpellChecker
 
 CAPS = "([A-Z])"
@@ -172,11 +171,6 @@ def adverb_estimate(tokens):
     return sum([1 for x in tokens if x.endswith('ly')])
 
 
-def lint_errors(text):
-    errors = proselint.tools.lint(text)
-    return len(errors)
-
-
 class SyllableConverter(object):
     def __init__(self, language='en'):
         self.dic = pyphen.Pyphen(lang=language)
diff --git a/trufflepig/model.py b/trufflepig/model.py
index 1d3198f..0747c8a 100644
--- a/trufflepig/model.py
+++ b/trufflepig/model.py
@@ -54,8 +54,7 @@
             'syllable_variance',
             'syllable_skew',
             'syllable_kurtosis',
-            'adverbs_per_sentence',
-            'lint_errors_per_sentence']
+            'adverbs_per_sentence']
 
 # output variables for regressor
 TARGETS = ['reward', 'votes']
diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py
index 99e5b56..14a4bdb 100644
--- a/trufflepig/preprocessing.py
+++ b/trufflepig/preprocessing.py
@@ -383,13 +383,6 @@ def preprocess(post_df, ncores=4, chunksize=500,
                                                             num_words=post_df.num_words,
                                                             num_sentences=post_df.num_sentences)
 
-    logger.info('Counting proselint errors')
-    post_df['lint_errors'] = apply_parallel(tfsm.lint_errors,
-                                              post_df.filtered_body,
-                                              ncores=ncores,
-                                              chunksize=chunksize)
-    post_df['lint_errors_per_sentence'] = post_df.lint_errors / post_df.num_sentences
-
     post_df.dropna(inplace=True)
     logger.info('Final data set has {} shape'.format(post_df.shape))