diff --git a/CHANGELOG.md b/CHANGELOG.md index 06e1431..38223f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ *0.6.0a* - 2018-03-10 * Two new readability scores +* New correction for many bid bots and vote selling services +* Bid bot stats are listed in the weekly post *0.5.0a* - 2018-03-07 diff --git a/integration_tests/bchain/getaccountdata_test.py b/integration_tests/bchain/getaccountdata_test.py index 28ab578..2805dfd 100644 --- a/integration_tests/bchain/getaccountdata_test.py +++ b/integration_tests/bchain/getaccountdata_test.py @@ -27,3 +27,30 @@ def test_payouts(steem): assert 'smcaterpillar' in result assert 'trufflepig' not in result + + +def test_bidbot(steem): + min_datetime = pd.datetime.utcnow() - pd.Timedelta(days=14) + max_datetime = min_datetime + pd.Timedelta(days=13) + result = tpac.get_upvote_payments('brittuf', steem, min_datetime, + max_datetime) + assert result + + +def test_bidbot_max_time(steem): + min_datetime = pd.datetime.utcnow() - pd.Timedelta(days=14) + max_datetime = min_datetime + pd.Timedelta(days=13) + result = tpac.get_upvote_payments('brittuf', steem, min_datetime, + max_datetime, max_time=0.1) + assert len(result) <= 1 + + +def test_get_upvote_payments_for_accounts(steem_kwargs): + min_datetime = pd.datetime.utcnow() - pd.Timedelta(days=14) + max_datetime = min_datetime + pd.Timedelta(days=5) + accounts = ['trufflepig', 'smcaterpillar', 'brittuf'] + result = tpac.get_upvote_payments_for_accounts(accounts, + steem_kwargs, + min_datetime=min_datetime, + max_datetime=max_datetime) + assert result diff --git a/integration_tests/bchain/paydelegates_test.py b/integration_tests/bchain/paydelegates_test.py index a9e2d83..66e5241 100644 --- a/integration_tests/bchain/paydelegates_test.py +++ b/integration_tests/bchain/paydelegates_test.py @@ -1,4 +1,5 @@ import pytest +import pandas as pd from trufflepig.testutils.pytest_fixtures import steem_kwargs import trufflepig.bchain.paydelegates as tppd @@ -14,4 +15,4 @@ def test_pay_delegates(steem_kwargs): tppd.pay_delegates(account=config.ACCOUNT, steem_args=steem_kwargs, - current_datetime='2029-01-01') + current_datetime=pd.datetime.utcnow())  # formerly the fixed date '2029-01-01' diff --git a/integration_tests/bchain/postweeklyupdate_test.py b/integration_tests/bchain/postweeklyupdate_test.py index 312cea8..a8b4723 100644 --- a/integration_tests/bchain/postweeklyupdate_test.py +++ b/integration_tests/bchain/postweeklyupdate_test.py @@ -25,6 +25,11 @@ def test_statistics(): post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50) pipeline = tpmo.train_pipeline(post_frame, topic_kwargs=topic_kwargs, regressor_kwargs=regressor_kwargs) + + post_frame['steem_bought_reward'] = 0 + post_frame['sbd_bought_reward'] = 0 + post_frame['bought_votes'] = 0 + stats = tppw.compute_weekly_statistics(post_frame, pipeline) steem_per_mvests = 490 @@ -62,6 +67,10 @@ def test_weekly_post(steem_kwargs): pipeline = tpmo.train_pipeline(post_frame, topic_kwargs=topic_kwargs, regressor_kwargs=regressor_kwargs) + post_frame['steem_bought_reward'] = 0 + post_frame['sbd_bought_reward'] = 0 + post_frame['bought_votes'] = 0 + permalink = tppw.post_weakly_update(pipeline, post_frame, account=config.ACCOUNT, steem_args=steem_kwargs, diff --git a/integration_tests/preprocessing_test.py b/integration_tests/preprocessing_test.py index 0b63313..74a41c5 100644 --- a/integration_tests/preprocessing_test.py +++ b/integration_tests/preprocessing_test.py @@ -4,8 +4,9 @@ from pandas.testing import assert_frame_equal import trufflepig.preprocessing
as tppp +import trufflepig.bchain.getdata as tpgd from trufflepig.testutils.random_data import create_n_random_posts -from trufflepig.testutils.pytest_fixtures import temp_dir +from trufflepig.testutils.pytest_fixtures import temp_dir, steem_kwargs def test_load_or_preproc(temp_dir): @@ -22,4 +23,29 @@ def test_load_or_preproc(temp_dir): ncores=5, chunksize=20) assert len(os.listdir(temp_dir)) == 1 - assert_frame_equal(frame, frame2) \ No newline at end of file + assert_frame_equal(frame, frame2) + + +def test_load_or_preproc_with_real_data(steem_kwargs, temp_dir): + filename = os.path.join(temp_dir, 'pptest.gz') + + start_datetime = pd.datetime.utcnow() - pd.Timedelta(days=14) + end_datetime = start_datetime + pd.Timedelta(hours=2) + posts = tpgd.get_all_posts_between_parallel(start_datetime, + end_datetime, + steem_kwargs, + stop_after=15) + post_frame = pd.DataFrame(posts) + bots = ['okankarol', 'bidseption', 'highvote', 'oguzhangazi', 'ottoman'] + frame = tppp.load_or_preprocess(post_frame, filename, + steem_args_for_upvote=steem_kwargs, + ncores=5, chunksize=20, bots=bots) + + assert len(os.listdir(temp_dir)) == 1 + + frame2 = tppp.load_or_preprocess(post_frame, filename, + steem_args_for_upvote=steem_kwargs, + ncores=5, chunksize=20, bots=bots) + + assert len(os.listdir(temp_dir)) == 1 + assert_frame_equal(frame, frame2) diff --git a/scripts/do_cross_val.py b/scripts/do_cross_val.py index 5546ba1..69cec4d 100644 --- a/scripts/do_cross_val.py +++ b/scripts/do_cross_val.py @@ -26,7 +26,7 @@ def main(): post_frame = tpgd.load_or_scrape_training_data(steem, directory, current_datetime=current_datetime, - days=12, + days=10, offset_days=0) gc.collect() @@ -39,6 +39,7 @@ def main(): ngrams=(1, 2)) post_frame = tppp.load_or_preprocess(post_frame, crossval_filename, + steem_args_for_upvote=steem, ncores=8, chunksize=500, min_en_prob=0.9) @@ -57,7 +58,9 @@ def main(): # n_jobs=4, targets=['reward']) pipe, test_frame = tpmo.train_test_pipeline(post_frame, topic_kwargs=topic_kwargs, - regressor_kwargs=regressor_kwargs, targets=['reward', 'votes'], + regressor_kwargs=regressor_kwargs, + targets=['adjusted_reward', + 'adjusted_votes'], random_state=42) tpmo.log_pipeline_info(pipe) diff --git a/tests/bchain/posts_test.py b/tests/bchain/posts_test.py index 0ea34f6..dd7f3fe 100644 --- a/tests/bchain/posts_test.py +++ b/tests/bchain/posts_test.py @@ -71,6 +71,10 @@ def test_weekly_update(): total_votes = 99897788 total_reward = 79898973 + bid_bots_sbd = 4242 + bid_bots_steem = 12 + bid_bots_percent = 99.9998 + median_reward = 0.012 mean_reward = 6.2987347329 dollar_percent = 69.80921393 @@ -126,6 +130,9 @@ def test_weekly_update(): total_posts=total_posts, total_votes=total_votes, total_reward=total_reward, + bid_bots_sbd=bid_bots_sbd, + bid_bots_steem=bid_bots_steem, + bid_bots_percent=bid_bots_percent, median_reward=median_reward, mean_reward=mean_reward, dollar_percent=dollar_percent, diff --git a/tests/preprocessing_test.py b/tests/preprocessing_test.py index c704005..7afd1fd 100644 --- a/tests/preprocessing_test.py +++ b/tests/preprocessing_test.py @@ -29,3 +29,22 @@ def test_preprocessing_random_parallel(): min_en_prob=0.8, max_errors_per_word=0.5) assert len(filtered) > 30 + + +def test_bid_bot_correction(): + posts = create_n_random_posts(30) + post_frame = pd.DataFrame(posts) + + bought = {} + bought[('hello', 'kitty')] = {'ccc': '19 STEEM'} + sample_frame = post_frame[['author', 'permalink']].sample(10) + for _, (author, permalink) in sample_frame.iterrows(): + bought[(author, permalink)] =
{'aaa': '3 STEEM', 'bbb': '4 SBD'} + + post_frame = tppp.compute_bidbot_correction(post_frame, + bought) + + assert post_frame.adjusted_reward.mean() < post_frame.reward.mean() + assert all(post_frame.adjusted_reward >= 0) + assert post_frame.adjusted_votes.mean() < post_frame.votes.mean() + assert all(post_frame.adjusted_votes >= 0) diff --git a/trufflepig/bchain/checkops.py b/trufflepig/bchain/checkops.py index f405595..6fbe8c8 100644 --- a/trufflepig/bchain/checkops.py +++ b/trufflepig/bchain/checkops.py @@ -205,4 +205,4 @@ def get_parent_posts(comment_authors_and_permalinks, steem): logger.exception(('Could not work with comment {} by ' '{}').format(comment_permalink, comment_author)) - return posts \ No newline at end of file + return posts diff --git a/trufflepig/bchain/getaccountdata.py b/trufflepig/bchain/getaccountdata.py index 6676890..e1e70c8 100644 --- a/trufflepig/bchain/getaccountdata.py +++ b/trufflepig/bchain/getaccountdata.py @@ -1,14 +1,53 @@ import logging +import multiprocessing as mp +import time import pandas as pd import numpy as np - from steem.account import Account +import trufflepig.bchain.getdata as tpbg +from trufflepig.utils import progressbar + logger = logging.getLogger(__name__) +MEMO_START = 'https://steemit.com/' + +BITBOTS = list({'smartmarket', 'smartsteem', 'upme', 'randowhale', + 'minnowbooster', 'boomerang', 'booster', 'hak4life', + 'lays', 'speedvoter', 'ebargains', 'danzy', 'bumper', + 'upvotewhale', 'treeplanter', 'minnowpond', 'morwhale', + 'drotto', 'postdoctor', 'moonbot', 'tipu', 'blockgators', + 'echowhale', 'steemvote', 'byresteem', 'originalworks', 'withsmn', + 'siditech', 'alphaprime', 'hugewhale', 'steemvoter', 'hottopic', + 'resteemable', 'earthnation-bot', 'photocontests', 'friends-bot', + 'followforupvotes', 'frontrunner', 'resteembot', 'steemlike', + 'thundercurator', 'earnmoresteem', 'microbot', 'coolbot', + 'thehumanbot', 'steemthat', 'gangvote', 'refresh', 'cabbage-dealer', + 'growingpower', 'postresteem', 'mecurator', 'talhadogan', + 'okankarol', 'bidseption', 'highvote', 'oguzhangazi', 'ottoman', + 'resteemr', 'superbot', 'bestvote', 'zerotoherobot', 'red-rose', + 'jeryalex', 'oceansbot', 'fresteem', 'otobot', 'bidbot', + 'honestbot', 'upgoater', 'whalebuilder', 'postpromoter', 'pwrup', + 'spydo', 'upmewhale', 'promobot', 'puppybot', 'moneymatchgaming', + 'sneaky-ninja', 'zapzap', 'sleeplesswhale', 'estream.studios', + 'seakraken', 'canalcrypto', 'upmyvote', 'hotbot', + 'redlambo', 'slimwhale', 'singing.beauty', 'inciter', 'lovejuice', + 'steembidbot', 'bid4joy', 'mitsuko', 'pushup', 'luckyvotes', + 'discordia', 'shares', 'upboater', + 'megabot', 'dailyupvotes', 'bluebot', 'upyou', + 'edensgarden', 'smartwhale', 'voterunner', 'nado.bot', + 'jerrybanfield', 'foxyd', 'onlyprofitbot', 'minnowhelper', + 'msp-bidbot', 'therising', 'bearwards', 'thebot', 'buildawhale', + 'chronocrypto', 'brupvoter', 'payforplay', + 'adriatik', 'cryptoempire', 'isotonic', 'minnowfairy', + 'appreciator', 'childfund', 'mercurybot', 'allaz', 'sunrawhale', + 'mrswhale', 'kittybot', 'lightningbolt', + 'sportic'}) + + def find_nearest_index(target_datetime, account, steem, @@ -17,8 +56,6 @@ def find_nearest_index(target_datetime, index_tolerance=5): """ Finds nearest account action index to `target_datetime` - Currently NOT used in production!
- Parameters ---------- target_datetime: datetime @@ -45,17 +82,17 @@ def find_nearest_index(target_datetime, current_index = latest_index best_largest_index = latest_index - action = next(acc.get_account_history(best_largest_index, limit=10)) + action = next(acc.get_account_history(best_largest_index, limit=1)) best_largest_datetime = pd.to_datetime(action['timestamp']) if target_datetime > best_largest_datetime: - logger.warning('Target beyond largest block num') + logger.debug('Target beyond largest block num') return latest_index, best_largest_datetime best_smallest_index = 1 increase = index_tolerance + 1 for _ in range(max_tries): try: - action = next(acc.get_account_history(current_index, limit=10)) + action = next(acc.get_account_history(current_index, limit=1)) current_datetime = pd.to_datetime(action['timestamp']) if increase <= index_tolerance: return current_index, current_datetime @@ -70,11 +107,15 @@ def find_nearest_index(target_datetime, current_index = best_smallest_index + increase if current_index < 0 or current_index > latest_index: - raise RuntimeError('Seriously?') + raise RuntimeError('Seriously? Error for ' + 'account {}: current_index {} ' + 'latest_index {}'.format(account, + current_index, + latest_index)) except Exception: logger.exception('Problems for index {}'.format(current_index)) - current_index -= 1 - best_smallest_index -= 1 + current_index += 1 + best_largest_index += 1 def get_delegates_and_shares(account, steem): @@ -150,3 +191,152 @@ def get_delegate_payouts(account, steem, current_datetime, return payouts + +def get_upvote_payments(account, steem, min_datetime, max_datetime, + batch_size=1000, max_time=1800): + + start = time.time() + upvote_payments = {} + + start_index, _ = find_nearest_index(max_datetime, + account, steem) + try: + transfers = history_reverse(account, steem, filter_by='transfer', + start_index=start_index, + batch_size=batch_size) + except Exception as e: + logger.exception('Could not get account data from {}'.format(account)) + transfers = [] + + for transfer in transfers: + try: + memo = transfer['memo'] + if memo.startswith(MEMO_START): + author, permalink = memo.split('/')[-2:] + if author.startswith('@'): + author = author[1:] + if (author, permalink) not in upvote_payments: + upvote_payments[(author, permalink)] = {} + trx_id = transfer['trx_id'] + amount = transfer['amount'] + upvote_payments[(author, permalink)][trx_id] = amount + + timestamp = pd.to_datetime(transfer['timestamp']) + if timestamp < min_datetime: + break + + now = time.time() + if now - start > max_time: + logger.error('Reached max time of {} seconds ' + ' will stop! 
Account {} from {} until {} ' 'last timestamp {}'.format(max_time, account, min_datetime, max_datetime, timestamp)) break except Exception as e: logger.exception('Could not parse {}'.format(transfer)) return upvote_payments + + +def history_reverse(account, steem, start_index, filter_by=None, + batch_size=1000, raw_output=False): + """ Stream account history in reverse chronological order.""" + acc = Account(account, steem) + i = start_index + if batch_size > start_index: + batch_size = start_index + while i > 0: + if i - batch_size < 0: + batch_size = i + yield from acc.get_account_history( + index=i, + limit=batch_size, + order=-1, + filter_by=filter_by, + raw_output=raw_output, + ) + i -= (batch_size + 1) + + +def extend_upvotes_and_payments(upvote_payments, new_payments): + for author_permalink, new_upvotes in new_payments.items(): + if author_permalink not in upvote_payments: + upvote_payments[author_permalink] = {} + upvote_payments[author_permalink].update(new_upvotes) + return upvote_payments + + +def _get_upvote_payments_parallel(accounts, steem_args, min_datetime, + max_datetime): + steem = tpbg.check_and_convert_steem(steem_args) + results = {} + for account in accounts: + result = get_upvote_payments(account, steem, min_datetime, max_datetime) + results = extend_upvotes_and_payments(results, result) + + return results + + +def get_upvote_payments_for_accounts(accounts, steem_args, min_datetime, max_datetime, + chunksize=10, ncores=20, timeout=3600): + logger.info('Querying upvote purchases between {} and ' + '{} for {} accounts'.format(min_datetime, + max_datetime, + len(accounts))) + + # do queries by day! + start_datetimes = pd.date_range(min_datetime, max_datetime).tolist() + end_datetimes = start_datetimes[1:] + [max_datetime] + + if ncores > 1: + chunks = [accounts[irun: irun + chunksize] + for irun in range(0, len(accounts), chunksize)] + + ctx = mp.get_context('spawn') + pool = ctx.Pool(ncores, initializer=tpbg.config_mp_logging) + + async_results = [] + for start_datetime, end_datetime in zip(start_datetimes, end_datetimes): + for chunk in chunks: + result = pool.apply_async(_get_upvote_payments_parallel, + args=(chunk, steem_args, + start_datetime, end_datetime)) + async_results.append(result) + + pool.close() + + upvote_payments = {} + for kdx, async_result in enumerate(async_results): + try: + payments = async_result.get(timeout=timeout) + upvote_payments = extend_upvotes_and_payments(upvote_payments, + payments) + if progressbar(kdx, len(async_results), percentage_step=5, logger=logger): + logger.info('Finished chunk {} ' 'out of {}, found {} posts with ' 'bought upvotes so far...'.format(kdx + 1, len(async_results), len(upvote_payments))) + except mp.TimeoutError: + logger.exception('Worker timed out after {} seconds!'.format(timeout)) + + pool.join() + else: + return _get_upvote_payments_parallel(accounts, steem_args, min_datetime, + max_datetime) + + logger.info('Found {} articles with bought upvotes'.format(len(upvote_payments))) + return upvote_payments + + +def get_upvote_payments_to_bots(steem_args, min_datetime, max_datetime, + bots=BITBOTS, ncores=30): + logger.info('Getting payments to the following bots: {}'.format(bots)) + return get_upvote_payments_for_accounts(accounts=bots, + steem_args=steem_args, + min_datetime=min_datetime, + max_datetime=max_datetime, + ncores=ncores, + chunksize=1) diff --git a/trufflepig/bchain/paydelegates.py b/trufflepig/bchain/paydelegates.py index 5876b34..8180e00 100644 --- a/trufflepig/bchain/paydelegates.py +++
b/trufflepig/bchain/paydelegates.py @@ -42,7 +42,7 @@ def pay_delegates(account, steem_args, current_datetime=current_datetime, min_days=min_days, investor_share=investor_share) - logger.info('Count the following payouts:\n{}'.format(payouts)) + logger.info('Found the following payouts:\n{}'.format(payouts)) claim_all_reward_balance(steem, account) for delegator, payout in payouts.items(): try: diff --git a/trufflepig/bchain/posts.py b/trufflepig/bchain/posts.py index 189d976..8ccb35c 100644 --- a/trufflepig/bchain/posts.py +++ b/trufflepig/bchain/posts.py @@ -250,6 +250,9 @@ def weekly_update(current_datetime, total_posts, total_votes, total_reward, + bid_bots_sbd, + bid_bots_steem, + bid_bots_percent, median_reward, mean_reward, dollar_percent, @@ -313,7 +316,7 @@ def weekly_update(current_datetime, After a bit of experimentation I chose an LSA projection with 128 dimensions. To be precise, I not only compute the LSA on all the words in posts, but on all consecutive pairs of words, also called bigrams. In combination with the aforementioned style and readability features, each post is, therefore, encoded as a vector with about 150 entries. -For training, I read all posts that were submitted to the blockchain between 7 and 17 days ago. These posts are first filtered and subsequently encoded. Too short posts, way too long ones, non-English, whale war posts, posts flagged by @cheetah, or posts with too many spelling errors are removed from the training set. This week I got a training set of {total_posts} contributions. The resulting matrix of {total_posts} by 150 entries is used as the input to a multi-output [Random Forest regressor from scikit learn](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html). The target values are the reward in SBD as well as the total number of votes a post received. +For training, I read all posts that were submitted to the blockchain between 7 and 17 days ago. These posts are first filtered and subsequently encoded. Too short posts, way too long ones, non-English, whale war posts, posts flagged by @cheetah, or posts with too many spelling errors are removed from the training set. This week I got a training set of {total_posts} contributions. The resulting matrix of {total_posts} by 150 entries is used as the input to a multi-output [Random Forest regressor from scikit learn](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html). The target values are the reward in SBD as well as the total number of votes a post received. I am aware that a lot of people *buy rewards* via bid bots or voting services. Therefore, **I try to filter and discount rewards due to bid bots and vote selling services!** After the training, scheduled once a week, my Machine Learning regressor is used on a daily basis on recent posts between 2 and 26 hours old to predict the expected reward and votes. Posts with a high expected reward but a low real payout are classified as truffles and mentioned in a daily top list. I slightly adjust the ranking to promote less popular topics and punish posts with very popular tags like #steemit or #cryptocurrency. Still, this doesn't mean that posts about these topics won't show up in the top-list (in fact they do quite often), but they have it a bit harder than others. @@ -325,7 +328,9 @@ def weekly_update(current_datetime, So this week I scraped posts with an initial publication date between **{start_date}** and **{end_date}**.
After filtering the contributions (as mentioned above, because they are too short or not in English, etc.) my training data this week comprises **{total_posts} posts** that received **{total_votes} votes** leading to a total payout of **{total_reward} SBD**. Wow, this is a lot! -How are these payouts distributed among the posts? Well, on average a post received **{mean_reward:.3f} SBD**. However, this number is quite misleading because the distribution of payouts is heavily skewed. In fact, the median payout is **only {median_reward:.3f} SBD**! Moreover, **{dollar_percent}%** of posts are paid less than 1 SBD! Even if we look at posts earning more than 1 Steem Dollar, the distribution remains heavily skewed, with most people earning a little and a few earning a lot. Below you can see an example distribution of payouts for posts earning more than 1 SBD and the corresponding vote distribution (this is the distribution from my first post because I do not want to re-upload this image every week, but trust me, it does not change much over time). +By the way, in my training data people spent **{bid_bots_sbd} SBD** and **{bid_bots_steem} STEEM** to promote their posts via **bid bots or vote selling services**. In fact, **{bid_bots_percent:.1f}% of the posts** were upvoted by these bot services. + +Let's leave the bots behind and focus more on the posts' payouts. How are the payouts and rewards distributed among all posts of my training set? Well, on average a post received **{mean_reward:.3f} SBD**. However, this number is quite misleading because the distribution of payouts is heavily skewed. In fact, the median payout is **only {median_reward:.3f} SBD**! Moreover, **{dollar_percent}% of posts are paid less than 1 SBD!** Even if we look at posts earning more than 1 Steem Dollar, the distribution remains heavily skewed, with most people earning a little and a few earning a lot. Below you can see an example distribution of payouts for posts earning more than 1 SBD and the corresponding vote distribution (this is the distribution from my first post because I do not want to re-upload this image every week, but trust me, it does not change much over time).
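But before you look at the plot, a quick aside for the statistics junkies: the skew is easy to reproduce yourself. Here is a minimal sketch with a made-up, lognormal-ish reward sample instead of this week's actual training data:

```python
import numpy as np
import pandas as pd

# Made-up rewards in SBD; heavily right-skewed like the real payouts
rewards = pd.Series(np.random.lognormal(mean=-2.0, sigma=2.0, size=10000))

mean_reward = rewards.mean()      # inflated by a few big earners
median_reward = rewards.median()  # what a typical post actually gets
dollar_percent = (rewards < 1).mean() * 100  # share of posts below 1 SBD

print('mean {:.3f} SBD, median {:.3f} SBD, '
      '{:.0f}% below 1 SBD'.format(mean_reward, median_reward, dollar_percent))
```

The mean lands far above the median, exactly as in the real data. And now to the promised example distribution: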
![earnings](https://raw.githubusercontent.com/SmokinCaterpillar/TrufflePig/feature/weekly_status/img/distribution.png) @@ -426,6 +431,9 @@ def weekly_update(current_datetime, total_posts=total_posts, total_votes=total_votes, total_reward=int(total_reward), + bid_bots_sbd=int(bid_bots_sbd), + bid_bots_steem=int(bid_bots_steem), + bid_bots_percent=bid_bots_percent, median_reward=median_reward, mean_reward=mean_reward, dollar_percent=int(dollar_percent), diff --git a/trufflepig/bchain/postweeklyupdate.py b/trufflepig/bchain/postweeklyupdate.py index 5a8e3d7..3426680 100644 --- a/trufflepig/bchain/postweeklyupdate.py +++ b/trufflepig/bchain/postweeklyupdate.py @@ -58,6 +58,12 @@ def compute_weekly_statistics(post_frame, pipeline, N=10, topics_step=4): >= min_count].sort_values('per_post', ascending=False) top_tags_earnings = top_tags_earnings.iloc[:N, :] + logger.info('Computing bid bot stats...') + num_articles = (post_frame.bought_votes > 0).sum() + bid_bots_percent = num_articles / len(post_frame) * 100 + bid_bots_steem = post_frame.steem_bought_reward.sum() + bid_bots_sbd = post_frame.sbd_bought_reward.sum() + # get top tokens logger.info('Computing top words...') token_count_dict = {} @@ -126,6 +132,9 @@ def compute_weekly_statistics(post_frame, pipeline, N=10, topics_step=4): total_posts=total_posts, total_votes=total_votes, total_reward=total_reward, + bid_bots_sbd=bid_bots_sbd, + bid_bots_steem=bid_bots_steem, + bid_bots_percent=bid_bots_percent, median_reward=median_reward, mean_reward=mean_reward, dollar_percent=dollar_percent, @@ -154,7 +163,7 @@ def compute_weekly_statistics(post_frame, pipeline, N=10, topics_step=4): def return_overview_permalink_if_exists(account, steem_args, current_datetime): steem = tppd.check_and_convert_steem(steem_args) - permalink = PERMALINK_TEMPLATE.format(date=current_datetime.strftime('%Y-%V')) + permalink = PERMALINK_TEMPLATE.format(date=current_datetime.strftime('%Y-%U')) try: Post('@{}/{}'.format(account, permalink), steem) return permalink diff --git a/trufflepig/config.py b/trufflepig/config.py index 6550244..0ec4135 100644 --- a/trufflepig/config.py +++ b/trufflepig/config.py @@ -3,8 +3,8 @@ # The steemit nodes to load data from NODE_URL = os.environ.get('STEEM_NODE_URL', 'https://api.steemit.com') NODE_URL2 = os.environ.get('STEEM_NODE_URL2', 'https://steemd.privex.io') -NODE_URL3 = os.environ.get('STEEM_NODE_URL3', None) -NODES = [x for x in (NODE_URL, NODE_URL2) if x] +NODE_URL3 = os.environ.get('STEEM_NODE_URL3', 'https://api.steem.house') +NODES = [x for x in (NODE_URL, NODE_URL2, NODE_URL3) if x] # The steemit bot account and password ACCOUNT = os.environ.get('STEEM_ACCOUNT', 'trufflepig') diff --git a/trufflepig/main.py b/trufflepig/main.py index 2a507a6..df5eb9f 100644 --- a/trufflepig/main.py +++ b/trufflepig/main.py @@ -14,6 +14,7 @@ import trufflepig.utils as tfut import trufflepig.pigonduty as tfod import trufflepig.bchain.paydelegates as tpde +import trufflepig.bchain.getaccountdata as tpad from trufflepig import config from trufflepig.utils import configure_logging import trufflepig.bchain.postweeklyupdate as tppw @@ -82,7 +83,7 @@ def load_and_preprocess_2_frames(log_directory, current_datetime, steem_kwargs, steem_kwargs=steem_kwargs, data_directory=data_directory, days=days2, - offset_days=8 + days).result() + offset_days=offset_days + days).result() post_frame = pd.concat([post_frame, post_frame2], axis=0) # We need to reset the index because due to concatenation @@ -90,6 +91,16 @@ def load_and_preprocess_2_frames(log_directory, 
current_datetime, steem_kwargs, post_frame.reset_index(inplace=True, drop=True) logger.info('Combining 2 frames into 1') post_frame = tppp.filter_duplicates(post_frame) + + logger.info('Searching for bid bots and bought votes') + min_datetime = post_frame.created.min() + max_datetime = post_frame.created.max() + pd.Timedelta(days=8) + upvote_payments = tpad.get_upvote_payments_to_bots(steem_args=steem_kwargs, + min_datetime=min_datetime, + max_datetime=max_datetime) + logger.info('Adjusting votes and reward') + post_frame = tppp.compute_bidbot_correction(post_frame=post_frame, + upvote_payments=upvote_payments) return post_frame @@ -150,7 +161,8 @@ def main(): pipeline = tpmo.load_or_train_pipeline(post_frame, model_directoy, current_datetime, regressor_kwargs=regressor_kwargs, - topic_kwargs=topic_kwargs) + topic_kwargs=topic_kwargs, + targets=['adjusted_reward', 'adjusted_votes']) tpmo.log_pipeline_info(pipeline=pipeline) @@ -216,6 +228,13 @@ def main(): tfut.clean_up_directory(data_directory, keep_last=25) tfut.clean_up_directory(log_directory, keep_last=14) + logger.info('Preloading posts from 8 days ago for later training') + tpgd.load_or_scrape_training_data(steem_kwargs, data_directory, + current_datetime=current_datetime, + days=1, + offset_days=8, + ncores=20) + logger.info('DONE at {}'.format(current_datetime)) diff --git a/trufflepig/model.py b/trufflepig/model.py index 0747c8a..adfad27 100644 --- a/trufflepig/model.py +++ b/trufflepig/model.py @@ -487,7 +487,8 @@ def compute_log_vote_weights(target_frame): """ logger.info('Computing sample weights') - return 1 + np.log(1 + target_frame.votes) + # the second target column is the (adjusted) vote count + return 1 + np.log(1 + target_frame.iloc[:, 1]) def train_pipeline(post_frame, pipeline=None, diff --git a/trufflepig/preprocessing.py b/trufflepig/preprocessing.py index 14a4bdb..75c4d8f 100644 --- a/trufflepig/preprocessing.py +++ b/trufflepig/preprocessing.py @@ -6,9 +6,11 @@ import pandas as pd import numpy as np import scipy.stats as spst +from steem.amount import Amount import trufflepig.filters.stylemeasures as tfsm import trufflepig.filters.textfilters as tftf +import trufflepig.bchain.getaccountdata as tfga logger = logging.getLogger(__name__) @@ -21,11 +23,11 @@ # maybe I need a bot just to rate spiritual content # for simplicity let's ignore them for now, # sorry, no religious truffles in the near future! - 'bible', 'faith', 'spiritual', 'christianity') + 'bible', 'faith', 'spiritual', 'christianity', 'steemchurch') # Stay out of the whale wars! -FILTER_AUTHORS = ('haejin',) +FILTER_AUTHORS = ('haejin', 'ew-and-patterns') def filter_duplicates(frame): @@ -389,7 +391,10 @@ def preprocess(post_df, ncores=4, chunksize=500, return post_df -def load_or_preprocess(post_frame, filename, *args, overwrite=False, store=True, +def load_or_preprocess(post_frame, filename, *args, + overwrite=False, store=True, + steem_args_for_upvote=None, + bots=tfga.BITBOTS, **kwargs): """ Tries to load a preprocessed frame; if not found, preprocessing starts.
@@ -400,6 +405,8 @@ def load_or_preprocess(post_frame, filename, *args, overwrite=False, store=True, Filename of data to load args: *args Arguments passed to normal preprocessing + steem_args_for_upvote: dict + Steem arguments; leave None to skip the bid bot correction overwrite: bool If preprocessing should be started and overwrite existing file store: bool @@ -418,6 +425,16 @@ def load_or_preprocess(post_frame, filename, *args, overwrite=False, store=True, else: logger.info('File {} not found, will start preprocessing'.format(filename)) post_frame = preprocess(post_frame, *args, **kwargs) + if steem_args_for_upvote: + logger.info('Looking for bought upvotes!') + min_datetime = post_frame.created.min() + max_datetime = post_frame.created.max() + pd.Timedelta(days=8) + upvote_payments = tfga.get_upvote_payments_to_bots(steem_args_for_upvote, + min_datetime=min_datetime, + max_datetime=max_datetime, + bots=bots) + post_frame = compute_bidbot_correction(post_frame, + upvote_payments) if store: directory = os.path.dirname(filename) if not os.path.isdir(directory): @@ -425,3 +442,54 @@ def load_or_preprocess(post_frame, filename, *args, overwrite=False, store=True, logger.info('Storing file {} to disk'.format(filename)) post_frame.to_pickle(filename, compression='gzip') return post_frame + + +def compute_bidbot_correction(post_frame, upvote_payments, sbd_punishment_factor=1.3, + steem_punishment_factor=1.2): + post_frame['sbd_bought_reward'] = 0. + post_frame['steem_bought_reward'] = 0. + post_frame['bought_votes'] = 0 + + post_frame.set_index(['author', 'permalink'], inplace=True) + + for (author, permalink), payments in upvote_payments.items(): + if (author, permalink) in post_frame.index: + sbd = 0 + steem = 0 + votes = 0 + for payment in payments.values(): + amount = Amount(payment) + value = amount.amount + asset = amount.asset + votes += 1 + if asset == 'SBD': + sbd += value + elif asset == 'STEEM': + steem += value + else: + raise RuntimeError('Unknown asset: {}'.format(asset)) + post_frame.loc[(author, permalink), + ['sbd_bought_reward', + 'steem_bought_reward', + 'bought_votes']] = sbd, steem, votes + + post_frame.reset_index(inplace=True) + post_frame['adjusted_reward'] = post_frame.reward - \ + post_frame.sbd_bought_reward * sbd_punishment_factor - \ + post_frame.steem_bought_reward * steem_punishment_factor + post_frame.loc[post_frame.adjusted_reward < 0, 'adjusted_reward'] = 0 + post_frame['adjusted_votes'] = post_frame.votes - post_frame.bought_votes + post_frame.loc[post_frame.adjusted_votes < 0, 'adjusted_votes'] = 0 + + num_articles = (post_frame.bought_votes > 0).sum() + percent = num_articles / len(post_frame) * 100 + total_steem = post_frame.steem_bought_reward.sum() + total_sbd = post_frame.sbd_bought_reward.sum() + total_votes = post_frame.bought_votes.sum() + logger.info('Found {} articles ({:.2f}%) with bought upvotes, ' 'total {:.3f} STEEM {:.3f} SBD, and {} bought votes!'.format(num_articles, percent, total_steem, total_sbd, total_votes)) + return post_frame
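To see the new correction end to end: `get_upvote_payments` returns a mapping of `(author, permalink)` to the individual transfer amounts, and `compute_bidbot_correction` subtracts these, scaled by the punishment factors, from the raw rewards and votes. A minimal usage sketch; the two posts, amounts, and transaction ids below are made up, and the punishment factors are the defaults from above:

```python
import pandas as pd

import trufflepig.preprocessing as tppp

# Hypothetical frame with just the columns the correction needs
post_frame = pd.DataFrame({'author': ['alice', 'bob'],
                           'permalink': ['post-1', 'post-2'],
                           'reward': [50.0, 5.0],
                           'votes': [400, 60]})

# Same shape as a get_upvote_payments result:
# (author, permalink) -> {trx_id: amount string}
upvote_payments = {('alice', 'post-1'): {'trx1': '10.000 SBD',
                                         'trx2': '5.000 STEEM'}}

post_frame = tppp.compute_bidbot_correction(post_frame, upvote_payments)

# alice: 50 - 10 * 1.3 - 5 * 1.2 = 31.0 SBD and 400 - 2 = 398 votes remain
print(post_frame[['author', 'adjusted_reward', 'adjusted_votes']])
```

Posts that never show up in a transfer memo, like bob's here, keep their raw values, so `adjusted_reward` simply equals `reward` for them.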