Merge pull request #9 from SmokinCaterpillar/feature/add_quality_measures

Feature/add quality measures
SmokinCaterpillar · Mar 11, 2018 · 95fe7fe · 95fe7fe
2 parents e50c5b2 + 365004f
commit 95fe7fe
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+*0.6.0a* - 2018-03-10
+
+* Two new readability scores
+
 *0.5.0a* - 2018-03-07
 
 * Fixed tfids error

diff --git a/tests/filters/stylemeasures_test.py b/tests/filters/stylemeasures_test.py
@@ -66,6 +66,11 @@ def test_count_pronouns():
     assert result == 2
 
 
+def test_count_characters():
+    result = tpsm.count_characters(['hi', 'mark'])
+    assert result == 6
+
+
 def test_count_letters():
     result = tpsm.count_letters('hallo54 my namE!!')
     assert result == 11
@@ -93,6 +98,16 @@ def test_flesch_kincaid_index():
     assert result == 77.90500000000002
 
 
+def test_automated_readability_index():
+    result = tpsm.automated_readability_index(1000, 100, 10)
+    assert result == 30.67
+
+
+def test_coleman_liau_index():
+    result = tpsm.coleman_liau_index(1000, 100, 10)
+    assert result == 40.03999999999999
+
+
 def test_adverb_estimate():
     result = tpsm.adverb_estimate(['i', 'am', 'heavily', 'in', 'use'])
     assert result == 1

diff --git a/trufflepig/filters/stylemeasures.py b/trufflepig/filters/stylemeasures.py
@@ -126,6 +126,10 @@ def compute_average_puncitation(text_list):
     return np.mean(punctuation)
 
 
+def count_characters(tokens):
+    return sum(len(x) for x in tokens)
+
+
 def count_connectors(tokens):
     return sum(1 for x in tokens if x in CONNECTORS)
 
@@ -149,9 +153,20 @@ def flesch_kincaid_index(num_syllables, num_words, num_sentences):
 
 
 def smog_index(num_complex_words, num_sentences):
+    """https://en.wikipedia.org/wiki/SMOG"""
     return 1.0430 * np.sqrt(num_complex_words * 30 / num_sentences) + 3.1291
 
 
+def automated_readability_index(num_chars, num_words, num_sentences):
+    """https://en.wikipedia.org/wiki/Automated_readability_index"""
+    return 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43
+
+
+def coleman_liau_index(num_chars, num_words, num_sentences):
+    """https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index"""
+    return 0.0588 * 100 * num_chars / num_words - 0.296 * 100 * num_sentences / num_words - 15.8
+
+
 def adverb_estimate(tokens):
     return sum([1 for x in tokens if x.endswith('ly')])
 

diff --git a/trufflepig/model.py b/trufflepig/model.py
@@ -48,6 +48,8 @@
             'gunning_fog_index',
             'flesch_kincaid_index',
             'smog_index',
+            'automated_readability_index',
+            'coleman_liau_index',
             'average_syllables',
             'syllable_variance',
             'syllable_skew',

diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py
@@ -337,8 +337,9 @@ def preprocess(post_df, ncores=4, chunksize=500,
     post_df['unique_words'] = post_df.tokens.apply(lambda x: len(set(x)))
     post_df['unique_ratio'] = post_df.unique_words / post_df.num_words
 
-    logger.info('Computing characters per word')
-    post_df['chars_per_word'] = post_df.body_length / post_df.num_words
+    logger.info('Computing characters and characters per word')
+    post_df['num_chars'] = post_df.tokens.apply(lambda x: tfsm.count_characters(x))
+    post_df['chars_per_word'] = post_df.num_chars / post_df.num_words
 
     logger.info('Counting connectors')
     post_df['num_connectors'] = post_df.tokens.apply(lambda x: tfsm.count_connectors(x))
@@ -365,6 +366,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
     post_df['syllable_variance'] = post_df.token_syllables.apply(lambda x: np.var(x))
     post_df['syllable_skew'] = post_df.token_syllables.apply(lambda x: spst.skew(x))
     post_df['syllable_kurtosis'] = post_df.token_syllables.apply(lambda x: spst.kurtosis(x))
+
+    logger.info('Computing readability indices')
     post_df['gunning_fog_index'] = tfsm.gunning_fog_index(num_words=post_df.num_words,
                                                         num_complex_words=post_df.num_complex_words,
                                                         num_sentences=post_df.num_sentences)
@@ -373,6 +376,12 @@ def preprocess(post_df, ncores=4, chunksize=500,
                                                               num_sentences=post_df.num_sentences)
     post_df['smog_index']= tfsm.smog_index(num_complex_words=post_df.num_complex_words,
                                          num_sentences=post_df.num_sentences)
+    post_df['automated_readability_index'] = tfsm.automated_readability_index(num_chars=post_df.num_chars,
+                                                                        num_words=post_df.num_words,
+                                                                        num_sentences=post_df.num_sentences)
+    post_df['coleman_liau_index'] = tfsm.coleman_liau_index(num_chars=post_df.num_chars,
+                                                            num_words=post_df.num_words,
+                                                            num_sentences=post_df.num_sentences)
 
     post_df.dropna(inplace=True)
     logger.info('Final data set has {} shape'.format(post_df.shape))