Skip to content

Commit

Permalink
Merge pull request #12 from SmokinCaterpillar/debt/gz2sqlite
Browse files Browse the repository at this point in the history
Moving from gz to sqlite and also filtering cheetah on preprocessing …
  • Loading branch information
SmokinCaterpillar committed Mar 26, 2018
2 parents 264bd20 + 945e129 commit 7eb23ad
Show file tree
Hide file tree
Showing 16 changed files with 212 additions and 99 deletions.
14 changes: 0 additions & 14 deletions integration_tests/bchain/getdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,3 @@ def test_scrape_recent_date(steem):
stop_after=50,
ncores=1)
assert len(frame)


def test_cheetah_exclusion(steem):
    """Expect ``exclude_if_voted_by`` to be truthy for this post's votes.

    The post is fetched through the ``steem`` fixture and its active votes
    are checked against ``tpbg.EXCLUSION_VOTERS_SET``.
    """
    identifier = ('@neuehorizonte/das-betrugmodell-unseres-'
                  'finanzsystem-und-der-ausweg-prof-franz-hrmann--azk-20180225t104415261z')
    post = Post(identifier, steem)

    # presumably this post was voted on by an excluded account such as
    # cheetah -- this depends on live chain data
    is_excluded = tpbg.exclude_if_voted_by(post.active_votes,
                                           tpbg.EXCLUSION_VOTERS_SET)
    assert is_excluded


def test_not_cheetah_exclusion(steem):
    """Expect ``exclude_if_voted_by`` to be falsy for this post's votes.

    The post is fetched through the ``steem`` fixture and its active votes
    are checked against ``tpbg.EXCLUSION_VOTERS_SET``.
    """
    identifier = ('@@smcaterpillar/trufflepig-introducing-the-artificial-'
                  'intelligence-for-content-curation-and-minnow-support')
    post = Post(identifier, steem)

    is_excluded = tpbg.exclude_if_voted_by(post.active_votes,
                                           tpbg.EXCLUSION_VOTERS_SET)
    assert not is_excluded
20 changes: 20 additions & 0 deletions integration_tests/persist_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os
import pandas as pd

from trufflepig.testutils.pytest_fixtures import temp_dir
from trufflepig.testutils.random_data import create_n_random_posts
import trufflepig.preprocessing as tppp
import trufflepig.persist as tppe


def test_store_load_frame_test(temp_dir):
    """Round-trip a preprocessed frame through ``to_sqlite``/``from_sqlite``.

    Builds 42 random posts, preprocesses them, stores the result under the
    name ``'test'`` in a file inside ``temp_dir`` and checks that loading it
    back yields an equal frame.
    """
    sqlite_path = os.path.join(temp_dir, 'test.sqlite')

    random_posts = create_n_random_posts(42)
    stored_frame = tppp.preprocess(pd.DataFrame(random_posts))

    tppe.to_sqlite(stored_frame, sqlite_path, 'test')
    loaded_frame = tppe.from_sqlite(sqlite_path, 'test')

    pd.testing.assert_frame_equal(stored_frame, loaded_frame)
17 changes: 17 additions & 0 deletions integration_tests/preprocessing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import trufflepig.bchain.getdata as tpgd
from trufflepig.testutils.random_data import create_n_random_posts
from trufflepig.testutils.pytest_fixtures import temp_dir, steem
import trufflepig.bchain.getaccountdata as tpac


def test_load_or_preproc(temp_dir):
Expand Down Expand Up @@ -49,3 +50,19 @@ def test_load_or_preproc_with_real_data(steem, temp_dir):

assert len(os.listdir(temp_dir)) == 1
assert_frame_equal(frame, frame2)


def test_bid_bot_correction_real_data(steem):
    """Run the bid-bot correction on one real post paid for via 'brittuf'.

    Fetches the upvote payments sent to the account ``'brittuf'`` in a
    13 day window starting 14 days ago, loads the post data for the first
    paid post and applies ``tppp.compute_bidbot_correction`` to it.

    The test requires that at least one payment was found and that the
    mean of ``sbd_bought_reward`` or ``steem_bought_reward`` is positive.
    """
    # ``pd.datetime`` was only an alias of ``datetime.datetime`` and is no
    # longer available in current pandas; import the class directly.
    from datetime import datetime

    min_datetime = datetime.utcnow() - pd.Timedelta(days=14)
    max_datetime = min_datetime + pd.Timedelta(days=13)
    upvotes = tpac.get_upvote_payments('brittuf', steem, min_datetime,
                                       max_datetime)

    # Check for payments *before* indexing into them, otherwise an empty
    # result surfaces as an opaque IndexError instead of a failed assert.
    assert upvotes

    # Keys are (author, permalink) tuples; take the first one.
    author, permalink = next(iter(upvotes))
    data = tpgd.get_post_data([(author, permalink)], steem)
    df = pd.DataFrame(data)

    tppp.compute_bidbot_correction(df, upvotes)

    assert (df.sbd_bought_reward.mean() > 0) or (df.steem_bought_reward.mean() > 0)
4 changes: 4 additions & 0 deletions tests/filters/textfilters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ def test_is_in_tags_typerror():
assert result


def test_voted_by():
    """``voted_by`` is truthy when a vote's voter is in the given set."""
    votes = [{'voter': 'cheetah'}]
    voters = {'cheetah'}
    assert tptf.voted_by(votes, voters)


def test_filter_headdings():
text= """# heading nheadings
heyho
Expand Down
16 changes: 10 additions & 6 deletions tests/preprocessing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

def test_preprocessing():
post_frame = pd.DataFrame(POSTS)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.8,
max_errors_per_word=0.5)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.5,
max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered)

Expand All @@ -17,7 +18,8 @@ def test_preprocessing_parallel():
post_frame = pd.DataFrame([POSTS[0] for _ in range(100)])
post_frame['permalink'] = ['kkk'+str(irun % 50) for irun in range(100)]
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=20,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 40

Expand All @@ -26,9 +28,10 @@ def test_preprocessing_random_parallel():
posts = create_n_random_posts(50)
post_frame = pd.DataFrame(posts)
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=10,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 30
assert len(filtered) > 20


def test_bid_bot_correction():
Expand All @@ -39,7 +42,8 @@ def test_bid_bot_correction():
bought[('hello', 'kitty')] = ['19 STEEM']
sample_frame = post_frame[['author', 'permalink']].sample(10)
for _, (author, permalink) in sample_frame.iterrows():
bought[(author, permalink)] = {'aaa':'3 STEEM', 'bbb': '4 SBD'}
bought[(author, permalink)] = {'aaa':{'amount': '3 STEEM'},
'bbb': {'amount': '4 SBD'}}

post_frame = tppp.compute_bidbot_correction(post_frame,
bought)
Expand Down
12 changes: 10 additions & 2 deletions trufflepig/bchain/getaccountdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ def get_upvote_payments(account, steem, min_datetime, max_datetime,
for transfer in transfers:
try:
memo = transfer['memo']
timestamp = pd.to_datetime(transfer['timestamp'])

if memo.startswith(MEMO_START):
author, permalink = memo.split('/')[-2:]
if author.startswith('@'):
Expand All @@ -221,9 +223,15 @@ def get_upvote_payments(account, steem, min_datetime, max_datetime,
upvote_payments[(author, permalink)] = {}
trx_id = transfer['trx_id']
amount = transfer['amount']
upvote_payments[(author, permalink)][trx_id] = amount

timestamp = pd.to_datetime(transfer['timestamp'])
transaction_dict = dict(
timestamp=timestamp,
amount=amount,
payer=transfer['from'],
payee=transfer['to']
)
upvote_payments[(author, permalink)][trx_id] = transaction_dict

if timestamp < min_datetime:
break

Expand Down

0 comments on commit 7eb23ad

Please sign in to comment.