Skip to content

Commit

Permalink
Merge f930b1a into 264bd20
Browse files Browse the repository at this point in the history
  • Loading branch information
SmokinCaterpillar committed Mar 24, 2018
2 parents 264bd20 + f930b1a commit 8a8b702
Show file tree
Hide file tree
Showing 12 changed files with 173 additions and 92 deletions.
14 changes: 0 additions & 14 deletions integration_tests/bchain/getdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,3 @@ def test_scrape_recent_date(steem):
stop_after=50,
ncores=1)
assert len(frame)


def test_cheetah_exclusion(steem):
p = Post('@neuehorizonte/das-betrugmodell-unseres-'
'finanzsystem-und-der-ausweg-prof-franz-hrmann--azk-20180225t104415261z',
steem)
assert tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)


def test_not_cheetah_exclusion(steem):
p = Post('@@smcaterpillar/trufflepig-introducing-the-artificial-'
'intelligence-for-content-curation-and-minnow-support',
steem)
assert not tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)
20 changes: 20 additions & 0 deletions integration_tests/persist_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os
import pandas as pd

from trufflepig.testutils.pytest_fixtures import temp_dir
from trufflepig.testutils.random_data import create_n_random_posts
import trufflepig.preprocessing as tppp
import trufflepig.persist as tppe


def test_store_load_frame_test(temp_dir):
    """Round-trip a preprocessed post frame through the sqlite persistence layer.

    Stores a random, preprocessed frame under table 'test' and checks that
    loading it back yields an identical DataFrame.
    """
    filename = os.path.join(temp_dir, 'test.sqlite')

    original = pd.DataFrame(create_n_random_posts(42))
    original = tppp.preprocess(original)

    tppe.to_sqlite(original, filename, 'test')
    restored = tppe.from_sqlite(filename, 'test')

    pd.testing.assert_frame_equal(original, restored)
4 changes: 4 additions & 0 deletions tests/filters/textfilters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ def test_is_in_tags_typerror():
assert result


def test_voted_by():
    """A single vote cast by a member of the voter set must be detected."""
    votes = [{'voter': 'cheetah'}]
    assert tptf.voted_by(votes, {'cheetah'})


def test_filter_headdings():
text= """# heading nheadings
heyho
Expand Down
13 changes: 8 additions & 5 deletions tests/preprocessing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

def test_preprocessing():
post_frame = pd.DataFrame(POSTS)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.8,
max_errors_per_word=0.5)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.5,
max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered)

Expand All @@ -17,7 +18,8 @@ def test_preprocessing_parallel():
post_frame = pd.DataFrame([POSTS[0] for _ in range(100)])
post_frame['permalink'] = ['kkk'+str(irun % 50) for irun in range(100)]
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=20,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 40

Expand All @@ -26,9 +28,10 @@ def test_preprocessing_random_parallel():
posts = create_n_random_posts(50)
post_frame = pd.DataFrame(posts)
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=10,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 30
assert len(filtered) > 20


def test_bid_bot_correction():
Expand Down
73 changes: 15 additions & 58 deletions trufflepig/bchain/getdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@
from json import JSONDecodeError

from trufflepig.utils import progressbar, none_retry, error_retry
import trufflepig.persist as tppe


logger = logging.getLogger(__name__)


MIN_CHARACTERS = 500

EXCLUSION_VOTERS_SET = {'cheetah'}
FILENAME_TEMPLATE = 'steemit_posts__{time}.sqlite'

FILENAME_TEMPLATE = 'steemit_posts__{time}.gz'
TABLENAME = 'steemit_posts'


################################### Block Utils #################################
Expand Down Expand Up @@ -176,34 +177,13 @@ def extract_authors_and_permalinks(operations):
return authors_and_permalinks


def exclude_if_voted_by(active_votes, voted_by):
    """Checks if a post has been voted by someone like cheetah.

    Parameters
    ----------
    active_votes: list of dict
        List of all votes cast; each dict carries a 'voter' key.
    voted_by: set
        Set of voters that lead to exclusion, e.g. cheetah

    Returns
    -------
    bool if post should be excluded
    """
    for vote in active_votes:
        if vote['voter'] in voted_by:
            return True
    return False


def get_post_data(authors_and_permalinks, steem, exclusion_voters):
def get_post_data(authors_and_permalinks, steem):
""" Queries posts from `steem`
Parameters
----------
authors_and_permalinks: set of tuples of authors and permalink strings
steem: Steem object
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
Returns
-------
Expand Down Expand Up @@ -237,15 +217,14 @@ def get_post_data(authors_and_permalinks, steem, exclusion_voters):
steem.reconnect()
continue

if exclude_if_voted_by(p.active_votes, exclusion_voters):
logger.debug('Excluding {} by {} because voted by '
'{}'.format(permalink, author, exclusion_voters))
continue
# Add positive votes and subtract negative
votes = sum(1 if x['percent'] > 0 else -1 for x in p.active_votes)
votes = max(votes, 0)

post = {
'title': p.title,
'reward': p.reward.amount,
'votes': len([x for x in p.active_votes if x['percent'] > 0]),
'votes':votes,
'active_votes': p.active_votes,
'created': p.created,
'tags': p.tags,
Expand All @@ -259,16 +238,13 @@ def get_post_data(authors_and_permalinks, steem, exclusion_voters):


def get_all_posts_from_block(block_num, steem,
exclusion_voters,
exclude_authors_and_permalinks=None):
""" Gets all posts from one block
Parameters
----------
block_num: int
steem: MPSteem
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
exclude_authors_and_permalinks: set of tuples of strings
Exclude these authors and permalinks to get less duplicates
Expand All @@ -284,8 +260,7 @@ def get_all_posts_from_block(block_num, steem,
if exclude_authors_and_permalinks:
authors_and_permalinks -= exclude_authors_and_permalinks
if authors_and_permalinks:
return get_post_data(authors_and_permalinks, steem,
exclusion_voters), authors_and_permalinks
return get_post_data(authors_and_permalinks, steem), authors_and_permalinks
else:
logger.debug('Could not find any posts for block {}'.format(block_num))
else:
Expand All @@ -297,7 +272,6 @@ def get_all_posts_from_block(block_num, steem,


def get_all_posts_between(start_datetime, end_datetime, steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
stop_after=None):
""" Queries all posts found in blocks between start and end
Expand All @@ -306,8 +280,6 @@ def get_all_posts_between(start_datetime, end_datetime, steem,
start_datetime: datetime
end_datetime: datetime
steem: Steem
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
stop_after: int or None
For debugging and shorter tests, stop after only a few iterations
Expand All @@ -330,7 +302,6 @@ def get_all_posts_between(start_datetime, end_datetime, steem,
for idx, block_num in enumerate(range(start_num, end_num+1)):
posts_in_block, authors_and_permalinks = get_all_posts_from_block(block_num,
steem,
exclusion_voters,
exclude_authors_and_permalinks)
exclude_authors_and_permalinks |= authors_and_permalinks
posts.extend(posts_in_block)
Expand All @@ -350,15 +321,14 @@ def config_mp_logging(level=logging.INFO):
logging.basicConfig(level=level)


def _get_all_posts_for_blocks_parallel(block_nums, steem, exclusion_voters,
def _get_all_posts_for_blocks_parallel(block_nums, steem,
stop_after=None):
"""Helper wrapper for multiprocessing"""
posts = []
exclude_authors_and_permalinks = set()
for block_num in block_nums:
posts_in_block, authors_and_permalinks = get_all_posts_from_block(block_num,
steem,
exclusion_voters,
exclude_authors_and_permalinks)
exclude_authors_and_permalinks |= authors_and_permalinks
posts.extend(posts_in_block)
Expand All @@ -368,7 +338,6 @@ def _get_all_posts_for_blocks_parallel(block_nums, steem, exclusion_voters,


def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
stop_after=None, ncores=8,
chunksize=20, timeout=1200):
"""As above but in parallel with `ncores` jobs of `chunksize`.
Expand All @@ -395,7 +364,6 @@ def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
for idx, chunk in enumerate(chunks):
result = pool.apply_async(_get_all_posts_for_blocks_parallel,
args=(chunk, steem,
exclusion_voters,
stop_after))
async_results.append(result)
if stop_after is not None and idx >= stop_after:
Expand All @@ -420,7 +388,6 @@ def get_all_posts_between_parallel(start_datetime, end_datetime, steem,


def load_or_scrape_full_day(date, steem, directory,
exclusion_voters=EXCLUSION_VOTERS_SET,
overwrite=False,
store=True, stop_after=None, ncores=1):
""" Loads posts of a full day or queries them from steem blockchain
Expand All @@ -432,8 +399,6 @@ def load_or_scrape_full_day(date, steem, directory,
steem: Steem object
directory: str
Directory to load posts from
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
overwrite: bool
If stored posts should be replaced
store: bool
Expand All @@ -456,30 +421,30 @@ def load_or_scrape_full_day(date, steem, directory,
filename = os.path.join(directory,filename)
if os.path.isfile(filename) and not overwrite:
logger.info('Found file {} will load it'.format(filename))
post_frame = pd.read_pickle(filename, compression='gzip')
post_frame = tppe.from_sqlite(filename=filename,
tablename=TABLENAME)
else:
logger.info('File {} not found, will start scraping'.format(filename))

if ncores == 1:
posts = get_all_posts_between(start_datetime, end_datetime, steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after)
else:
posts = get_all_posts_between_parallel(start_datetime, end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after,
ncores=ncores)

post_frame = pd.DataFrame(data=posts, columns=sorted(posts[0].keys()))
if store:
logger.info('Storing file {} to disk'.format(filename))
post_frame.to_pickle(filename, compression='gzip')
tppe.to_sqlite(post_frame,
filename=filename,
tablename=TABLENAME)
return post_frame


def load_or_scrape_training_data(steem, directory,
exclusion_voters=EXCLUSION_VOTERS_SET,
days=20, offset_days=8,
ncores=8,
current_datetime=None,
Expand All @@ -491,8 +456,6 @@ def load_or_scrape_training_data(steem, directory,
----------
steem: Steem object
directory: str
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
days: int
Number of consecutive days to load or scrape
offset_days: int
Expand Down Expand Up @@ -522,7 +485,6 @@ def load_or_scrape_training_data(steem, directory,
next_date = (start_datetime + pd.Timedelta(days=day)).date()
frame = load_or_scrape_full_day(next_date, steem,
directory,
exclusion_voters=exclusion_voters,
overwrite=False,
store=store,
stop_after=stop_after,
Expand All @@ -536,7 +498,6 @@ def load_or_scrape_training_data(steem, directory,


def scrape_hour_data(steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
hours=24,
offset_hours=24,
current_datetime=None,
Expand All @@ -546,8 +507,6 @@ def scrape_hour_data(steem,
Parameters
----------
steem: Steem or kwargs
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
hours: int
Number of consecutive hours to scrape
offset_hours: int
Expand All @@ -574,13 +533,11 @@ def scrape_hour_data(steem,
posts = get_all_posts_between(start_datetime,
end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after)
else:
posts = get_all_posts_between_parallel(start_datetime,
end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after,
ncores=ncores)

Expand Down
10 changes: 6 additions & 4 deletions trufflepig/bchain/postoncall.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

import trufflepig.bchain.posts as tfbp
import trufflepig.bchain.getdata as tfgd
import trufflepig.filters.textfilters as tftf
from trufflepig.utils import error_retry
import trufflepig.preprocessing as tppp


logger = logging.getLogger(__name__)
Expand All @@ -18,7 +20,7 @@

def post_on_call(post_frame, account, steem, topN_link,
overview_permalink,
exclusion_set=tfgd.EXCLUSION_VOTERS_SET,
filter_voters=tppp.FILTER_VOTERS,
sleep_time=20.1):
""" Replies to users calling @trufflepig
Expand All @@ -28,7 +30,7 @@ def post_on_call(post_frame, account, steem, topN_link,
account: str
steem: Steem object
topN_link: str
exclusion_set: set of str
filter_voters: set of str
sleep_time: float
Bot can only post every 20 seconds,
should only be lowered for debugging
Expand All @@ -45,8 +47,8 @@ def post_on_call(post_frame, account, steem, topN_link,
comment.commit.no_broadcast = steem.commit.no_broadcast
# Wait a bit Steemit nodes hate comments in quick succession
time.sleep(sleep_time)
if not tfgd.exclude_if_voted_by(row.active_votes, {account}):
if row.passed and not tfgd.exclude_if_voted_by(row.active_votes, exclusion_set):
if not tftf.voted_by(row.active_votes, {account}):
if row.passed and not tftf.voted_by(row.active_votes, filter_voters):

logger.info('Voting and commenting on https://steemit.com/@{author}/{permalink}'
''.format(author=row.author, permalink=row.permalink))
Expand Down
3 changes: 3 additions & 0 deletions trufflepig/filters/textfilters.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,6 @@ def is_in_filter_tags(tags, filter_tags):
logger.exception('Could not identify tags {}, '
'will return True anyway'.format(tags))
return True

def voted_by(active_votes, voter_set):
    """Return True if any vote in `active_votes` was cast by a voter in `voter_set`.

    Parameters
    ----------
    active_votes: list of dict
        Votes cast on a post; each dict carries at least a 'voter' key.
    voter_set: set of str
        Voter names to test membership against (e.g. {'cheetah'}).

    Returns
    -------
    bool
        True as soon as one vote's 'voter' is in `voter_set`, else False.
    """
    return any(vote['voter'] in voter_set for vote in active_votes)

0 comments on commit 8a8b702

Please sign in to comment.