Skip to content

Commit

Permalink
Merge pull request #9 from SmokinCaterpillar/feature/add_quality_meas…
Browse files Browse the repository at this point in the history
…ures

Feature/add quality measures
  • Loading branch information
SmokinCaterpillar committed Mar 11, 2018
2 parents e50c5b2 + 365004f commit 95fe7fe
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
*0.6.0a* - 2018-03-10

* Two new readability scores

*0.5.0a* - 2018-03-07

* Fixed tfidf error
Expand Down
15 changes: 15 additions & 0 deletions tests/filters/stylemeasures_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ def test_count_pronouns():
assert result == 2


def test_count_characters():
    """Characters are summed across all tokens: 'hi' (2) + 'mark' (4)."""
    tokens = ['hi', 'mark']
    assert tpsm.count_characters(tokens) == 6


def test_count_letters():
    """The sample text mixes letters, digits, spaces and punctuation; 11 is expected."""
    text = 'hallo54 my namE!!'
    assert tpsm.count_letters(text) == 11
Expand Down Expand Up @@ -93,6 +98,16 @@ def test_flesch_kincaid_index():
assert result == 77.90500000000002


def test_automated_readability_index():
    """Check the index for 1000 characters, 100 words and 10 sentences."""
    result = tpsm.automated_readability_index(1000, 100, 10)
    # Compare with a tolerance instead of exact float equality so the test
    # does not depend on the rounding of intermediate floating point steps.
    assert abs(result - 30.67) < 1e-9


def test_coleman_liau_index():
    """Check the index for 1000 characters, 100 words and 10 sentences."""
    result = tpsm.coleman_liau_index(1000, 100, 10)
    # The mathematically exact value is 40.04; compare with a tolerance
    # rather than pinning the floating point rounding artefact.
    assert abs(result - 40.04) < 1e-9


def test_adverb_estimate():
    """Only 'heavily' ends in 'ly', so exactly one adverb is estimated."""
    tokens = ['i', 'am', 'heavily', 'in', 'use']
    assert tpsm.adverb_estimate(tokens) == 1
Expand Down
15 changes: 15 additions & 0 deletions trufflepig/filters/stylemeasures.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ def compute_average_puncitation(text_list):
return np.mean(punctuation)


def count_characters(tokens):
    """Return the summed length of all items in `tokens`."""
    return sum(map(len, tokens))


def count_connectors(tokens):
    """Return how many items of `tokens` are contained in `CONNECTORS`."""
    matches = [token for token in tokens if token in CONNECTORS]
    return len(matches)

Expand All @@ -149,9 +153,20 @@ def flesch_kincaid_index(num_syllables, num_words, num_sentences):


def smog_index(num_complex_words, num_sentences):
    """Compute the SMOG grade from complex word and sentence counts.

    See https://en.wikipedia.org/wiki/SMOG
    """
    # Complex word count scaled to a 30 sentence sample.
    scaled_complex_words = num_complex_words * 30 / num_sentences
    return 1.0430 * np.sqrt(scaled_complex_words) + 3.1291


def automated_readability_index(num_chars, num_words, num_sentences):
    """Compute the automated readability index from raw counts.

    See https://en.wikipedia.org/wiki/Automated_readability_index
    """
    chars_per_word = num_chars / num_words
    words_per_sentence = num_words / num_sentences
    return 4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43


def coleman_liau_index(num_chars, num_words, num_sentences):
    """Compute the Coleman-Liau index from raw counts.

    See https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
    """
    # Both terms are expressed per 100 words.
    chars_term = 0.0588 * 100 * num_chars / num_words
    sentences_term = 0.296 * 100 * num_sentences / num_words
    return chars_term - sentences_term - 15.8


def adverb_estimate(tokens):
    """Estimate the number of adverbs as the count of tokens ending in 'ly'."""
    count = 0
    for token in tokens:
        if token.endswith('ly'):
            count += 1
    return count

Expand Down
2 changes: 2 additions & 0 deletions trufflepig/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
'gunning_fog_index',
'flesch_kincaid_index',
'smog_index',
'automated_readability_index',
'coleman_liau_index',
'average_syllables',
'syllable_variance',
'syllable_skew',
Expand Down
13 changes: 11 additions & 2 deletions trufflepig/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,9 @@ def preprocess(post_df, ncores=4, chunksize=500,
post_df['unique_words'] = post_df.tokens.apply(lambda x: len(set(x)))
post_df['unique_ratio'] = post_df.unique_words / post_df.num_words

logger.info('Computing characters per word')
post_df['chars_per_word'] = post_df.body_length / post_df.num_words
logger.info('Computing characters and characters per word')
post_df['num_chars'] = post_df.tokens.apply(lambda x: tfsm.count_characters(x))
post_df['chars_per_word'] = post_df.num_chars / post_df.num_words

logger.info('Counting connectors')
post_df['num_connectors'] = post_df.tokens.apply(lambda x: tfsm.count_connectors(x))
Expand All @@ -365,6 +366,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
post_df['syllable_variance'] = post_df.token_syllables.apply(lambda x: np.var(x))
post_df['syllable_skew'] = post_df.token_syllables.apply(lambda x: spst.skew(x))
post_df['syllable_kurtosis'] = post_df.token_syllables.apply(lambda x: spst.kurtosis(x))

logger.info('Computing readability indices')
post_df['gunning_fog_index'] = tfsm.gunning_fog_index(num_words=post_df.num_words,
num_complex_words=post_df.num_complex_words,
num_sentences=post_df.num_sentences)
Expand All @@ -373,6 +376,12 @@ def preprocess(post_df, ncores=4, chunksize=500,
num_sentences=post_df.num_sentences)
post_df['smog_index']= tfsm.smog_index(num_complex_words=post_df.num_complex_words,
num_sentences=post_df.num_sentences)
post_df['automated_readability_index'] = tfsm.automated_readability_index(num_chars=post_df.num_chars,
num_words=post_df.num_words,
num_sentences=post_df.num_sentences)
post_df['coleman_liau_index'] = tfsm.coleman_liau_index(num_chars=post_df.num_chars,
num_words=post_df.num_words,
num_sentences=post_df.num_sentences)

post_df.dropna(inplace=True)
logger.info('Final data set has {} shape'.format(post_df.shape))
Expand Down

0 comments on commit 95fe7fe

Please sign in to comment.