Merge pull request #12 from SmokinCaterpillar/debt/gz2sqlite

Moving from gz to sqlite and also filtering cheetah on preprocessing …
SmokinCaterpillar · Mar 26, 2018 · 7eb23ad
2 parents 264bd20 + 945e129
commit 7eb23ad
Show file tree

Hide file tree

Showing 16 changed files with 212 additions and 99 deletions.
diff --git a/integration_tests/bchain/getdata_test.py b/integration_tests/bchain/getdata_test.py
@@ -71,17 +71,3 @@ def test_scrape_recent_date(steem):
                                   stop_after=50,
                                   ncores=1)
     assert len(frame)
-
-
-def test_cheetah_exclusion(steem):
-    p = Post('@neuehorizonte/das-betrugmodell-unseres-'
-             'finanzsystem-und-der-ausweg-prof-franz-hrmann--azk-20180225t104415261z',
-             steem)
-    assert tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)
-
-
-def test_not_cheetah_exclusion(steem):
-    p = Post('@@smcaterpillar/trufflepig-introducing-the-artificial-'
-             'intelligence-for-content-curation-and-minnow-support',
-             steem)
-    assert not tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)
diff --git a/integration_tests/persist_test.py b/integration_tests/persist_test.py
@@ -0,0 +1,20 @@
+import os
+import pandas as pd
+
+from trufflepig.testutils.pytest_fixtures import temp_dir
+from trufflepig.testutils.random_data import create_n_random_posts
+import trufflepig.preprocessing as tppp
+import trufflepig.persist as tppe
+
+
+def test_store_load_frame_test(temp_dir):
+    filename = os.path.join(temp_dir, 'test.sqlite')
+
+    x = pd.DataFrame(create_n_random_posts(42))
+    x = tppp.preprocess(x)
+
+    tppe.to_sqlite(x, filename, 'test')
+
+    y = tppe.from_sqlite(filename, 'test')
+
+    pd.testing.assert_frame_equal(x,y)
diff --git a/integration_tests/preprocessing_test.py b/integration_tests/preprocessing_test.py
@@ -7,6 +7,7 @@
 import trufflepig.bchain.getdata as tpgd
 from trufflepig.testutils.random_data import create_n_random_posts
 from trufflepig.testutils.pytest_fixtures import temp_dir, steem
+import trufflepig.bchain.getaccountdata as tpac
 
 
 def test_load_or_preproc(temp_dir):
@@ -49,3 +50,19 @@ def test_load_or_preproc_with_real_data(steem, temp_dir):
 
     assert len(os.listdir(temp_dir)) == 1
     assert_frame_equal(frame, frame2)
+
+
+def test_bid_bot_correction_real_data(steem):
+    min_datetime = pd.datetime.utcnow() - pd.Timedelta(days=14)
+    max_datetime = min_datetime + pd.Timedelta(days=13)
+    upvotes = tpac.get_upvote_payments('brittuf', steem, min_datetime,
+                                      max_datetime)
+
+    author, permalink = list(upvotes.keys())[0]
+    data = tpgd.get_post_data([(author, permalink)], steem)
+    df = pd.DataFrame(data)
+
+    tppp.compute_bidbot_correction(df, upvotes)
+
+    assert upvotes
+    assert (df.sbd_bought_reward.mean() > 0) or (df.steem_bought_reward.mean() > 0)
diff --git a/tests/filters/textfilters_test.py b/tests/filters/textfilters_test.py
@@ -55,6 +55,10 @@ def test_is_in_tags_typerror():
     assert result
 
 
+def test_voted_by():
+    assert tptf.voted_by([{'voter': 'cheetah'}], {'cheetah'})
+
+
 def test_filter_headdings():
     text= """# heading nheadings
 heyho

diff --git a/tests/preprocessing_test.py b/tests/preprocessing_test.py
@@ -7,8 +7,9 @@
 
 def test_preprocessing():
     post_frame = pd.DataFrame(POSTS)
-    filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.8,
-                               max_errors_per_word=0.5)
+    filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.5,
+                               max_errors_per_word=0.5,
+                               min_max_num_words=(10, 99999))
 
     assert len(filtered)
 
@@ -17,7 +18,8 @@ def test_preprocessing_parallel():
     post_frame = pd.DataFrame([POSTS[0] for _ in range(100)])
     post_frame['permalink'] = ['kkk'+str(irun % 50) for irun in range(100)]
     filtered = tppp.preprocess(post_frame, ncores=5, chunksize=20,
-                               min_en_prob=0.8, max_errors_per_word=0.5)
+                               min_en_prob=0.5, max_errors_per_word=0.5,
+                               min_max_num_words=(10, 99999))
 
     assert len(filtered) > 40
 
@@ -26,9 +28,10 @@ def test_preprocessing_random_parallel():
     posts = create_n_random_posts(50)
     post_frame = pd.DataFrame(posts)
     filtered = tppp.preprocess(post_frame, ncores=5, chunksize=10,
-                               min_en_prob=0.8, max_errors_per_word=0.5)
+                               min_en_prob=0.5, max_errors_per_word=0.5,
+                               min_max_num_words=(10, 99999))
 
-    assert len(filtered) > 30
+    assert len(filtered) > 20
 
 
 def test_bid_bot_correction():
@@ -39,7 +42,8 @@ def test_bid_bot_correction():
     bought[('hello', 'kitty')] = ['19 STEEM']
     sample_frame = post_frame[['author', 'permalink']].sample(10)
     for _, (author, permalink) in sample_frame.iterrows():
-        bought[(author, permalink)] = {'aaa':'3 STEEM', 'bbb': '4 SBD'}
+        bought[(author, permalink)] = {'aaa':{'amount': '3 STEEM'},
+                                       'bbb': {'amount': '4 SBD'}}
 
     post_frame = tppp.compute_bidbot_correction(post_frame,
                                                 bought)

diff --git a/trufflepig/bchain/getaccountdata.py b/trufflepig/bchain/getaccountdata.py
@@ -213,6 +213,8 @@ def get_upvote_payments(account, steem, min_datetime, max_datetime,
     for transfer in transfers:
         try:
             memo = transfer['memo']
+            timestamp = pd.to_datetime(transfer['timestamp'])
+
             if memo.startswith(MEMO_START):
                 author, permalink = memo.split('/')[-2:]
                 if author.startswith('@'):
@@ -221,9 +223,15 @@ def get_upvote_payments(account, steem, min_datetime, max_datetime,
                         upvote_payments[(author, permalink)] = {}
                     trx_id = transfer['trx_id']
                     amount = transfer['amount']
-                    upvote_payments[(author, permalink)][trx_id] = amount
 
-            timestamp = pd.to_datetime(transfer['timestamp'])
+                    transaction_dict = dict(
+                            timestamp=timestamp,
+                            amount=amount,
+                            payer=transfer['from'],
+                            payee=transfer['to']
+                        )
+                    upvote_payments[(author, permalink)][trx_id] = transaction_dict
+
             if timestamp < min_datetime:
                 break