Skip to content

Commit

Permalink
stay out of the whale wars!
Browse files Browse the repository at this point in the history
  • Loading branch information
Robert Meyer committed Mar 2, 2018
1 parent ce3f376 commit 51aa574
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion trufflepig/bchain/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def topN_post(topN_authors, topN_permalinks, topN_titles,
truffle_link=TRUFFLE_LINK, truffle_image=TRUFFLE_IMAGE,
quote_max_length=QUOTE_MAX_LENGTH):
"""Creates the truffle pig daily toplist post"""
title = """The daily Top 10 Truffle Picks: Quality Steemit Posts that deserve more Attention! ({date})"""
title = """Today's Truffle Picks: Quality Steemit Posts that deserve more Rewards and Attention! ({date})"""

post=""" ## Daily Truffle Picks
Expand Down
16 changes: 15 additions & 1 deletion trufflepig/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
'bible', 'faith', 'spiritual', 'christianity')


# Stay out of the whale wars!
# Default for the `filter_authors` argument of `preprocess`: posts whose
# author appears in this tuple are dropped from the frame.
FILTER_AUTHORS = ('haejin',)


def filter_duplicates(frame):
""" Filters out duplicate entries based on author and permalink
Expand Down Expand Up @@ -90,7 +94,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
max_errors_per_word=0.1,
min_max_average_punctuation=(1.05, 5),
min_max_average_sentence_length=(10, 300),
filter_tags=FILTER_TAGS):
filter_tags=FILTER_TAGS,
filter_authors=FILTER_AUTHORS):
""" Preprocessing of raw steemit posts, filters and adds features
All filtering happens in place!
Expand Down Expand Up @@ -137,6 +142,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
filter_tags: tuple of string
Tags to be filtered like 'sex', 'nsfw' or controversial stuff like
'vaccines'.
filter_authors: tuple of string
Authors whose posts are dropped from the frame.
Returns
-------
Expand All @@ -146,6 +153,13 @@ def preprocess(post_df, ncores=4, chunksize=500,
logger.info('Filtering duplicates of {} posts'.format(len(post_df)))
post_df = filter_duplicates(post_df)

logger.info('Filtering authors {}'.format(filter_authors))
filter_authors = set(filter_authors)
author_filter = post_df.author.apply(lambda x: x in filter_authors)
to_drop = post_df.loc[author_filter]
post_df.drop(to_drop.index, inplace=True)
logger.info('Kept {} posts'.format(len(post_df)))

logger.info('Filtering tags {}'.format(filter_tags))
filter_tags = set(filter_tags)
tag_filter = post_df.tags.apply(lambda x: tftf.is_in_filter_tags(x, filter_tags))
Expand Down

0 comments on commit 51aa574

Please sign in to comment.