stay out of the whale wars!

SmokinCaterpillar · Mar 2, 2018 · 51aa574 · 51aa574
1 parent ce3f376
commit 51aa574
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 2 deletions.
diff --git a/trufflepig/bchain/posts.py b/trufflepig/bchain/posts.py
@@ -88,7 +88,7 @@ def topN_post(topN_authors, topN_permalinks, topN_titles,
               truffle_link=TRUFFLE_LINK, truffle_image=TRUFFLE_IMAGE,
               quote_max_length=QUOTE_MAX_LENGTH):
     """Creates the truffle pig daily toplist post"""
-    title = """The daily Top 10 Truffle Picks: Quality Steemit Posts that deserve more Attention! ({date})"""
+    title = """Today's Truffle Picks: Quality Steemit Posts that deserve more Rewards and Attention! ({date})"""
 
     post=""" ## Daily Truffle Picks
     

diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py
@@ -24,6 +24,10 @@
                'bible', 'faith', 'spiritual', 'christianity')
 
 
+# Stay out of the whale wars!
+FILTER_AUTHORS = ('haejin',)
+
+
 def filter_duplicates(frame):
     """ Filters out duplicate entries based on author and permalink
 
@@ -90,7 +94,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
                max_errors_per_word=0.1,
                min_max_average_punctuation=(1.05, 5),
                min_max_average_sentence_length=(10, 300),
-               filter_tags=FILTER_TAGS):
+               filter_tags=FILTER_TAGS,
+               filter_authors=FILTER_AUTHORS):
     """ Preprocessing of raw steemit posts, filters and adds features
 
     All filtering happening inplace!
@@ -137,6 +142,8 @@ def preprocess(post_df, ncores=4, chunksize=500,
     filter_tags: tuple of string
         Tags to be filtered like 'sex', 'nsfw' or controversial stuff like
         'vaccines'.
+    filter_authors: tuple of string
+        Names of authors whose posts are dropped from the frame.
 
     Returns
     -------
@@ -146,6 +153,13 @@ def preprocess(post_df, ncores=4, chunksize=500,
     logger.info('Filtering duplicates of {} posts'.format(len(post_df)))
     post_df = filter_duplicates(post_df)
 
+    logger.info('Filtering authors {}'.format(filter_authors))
+    filter_authors = set(filter_authors)
+    author_filter = post_df.author.apply(lambda x: x in filter_authors)
+    to_drop = post_df.loc[author_filter]
+    post_df.drop(to_drop.index, inplace=True)
+    logger.info('Kept {} posts'.format(len(post_df)))
+
     logger.info('Filtering tags {}'.format(filter_tags))
     filter_tags = set(filter_tags)
     tag_filter = post_df.tags.apply(lambda x: tftf.is_in_filter_tags(x, filter_tags))