Skip to content

Commit

Permalink
Merge f930b1a into 264bd20
Browse files Browse the repository at this point in the history
  • Loading branch information
SmokinCaterpillar committed Mar 24, 2018
2 parents 264bd20 + f930b1a commit 8a8b702
Show file tree
Hide file tree
Showing 12 changed files with 173 additions and 92 deletions.
14 changes: 0 additions & 14 deletions integration_tests/bchain/getdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,3 @@ def test_scrape_recent_date(steem):
stop_after=50,
ncores=1)
assert len(frame)


def test_cheetah_exclusion(steem):
p = Post('@neuehorizonte/das-betrugmodell-unseres-'
'finanzsystem-und-der-ausweg-prof-franz-hrmann--azk-20180225t104415261z',
steem)
assert tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)


def test_not_cheetah_exclusion(steem):
p = Post('@@smcaterpillar/trufflepig-introducing-the-artificial-'
'intelligence-for-content-curation-and-minnow-support',
steem)
assert not tpbg.exclude_if_voted_by(p.active_votes, tpbg.EXCLUSION_VOTERS_SET)
20 changes: 20 additions & 0 deletions integration_tests/persist_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os
import pandas as pd

from trufflepig.testutils.pytest_fixtures import temp_dir
from trufflepig.testutils.random_data import create_n_random_posts
import trufflepig.preprocessing as tppp
import trufflepig.persist as tppe


def test_store_load_frame_test(temp_dir):
    """Round-trip a preprocessed post frame through the sqlite persistence layer.

    Stores a random, preprocessed frame under table 'test' and checks that
    loading it back yields an identical DataFrame.
    """
    filename = os.path.join(temp_dir, 'test.sqlite')

    original = pd.DataFrame(create_n_random_posts(42))
    original = tppp.preprocess(original)

    tppe.to_sqlite(original, filename, 'test')
    restored = tppe.from_sqlite(filename, 'test')

    pd.testing.assert_frame_equal(original, restored)
4 changes: 4 additions & 0 deletions tests/filters/textfilters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ def test_is_in_tags_typerror():
assert result


def test_voted_by():
    """A single vote cast by a member of the voter set must be detected."""
    votes = [{'voter': 'cheetah'}]
    assert tptf.voted_by(votes, {'cheetah'})


def test_filter_headdings():
text= """# heading nheadings
heyho
Expand Down
13 changes: 8 additions & 5 deletions tests/preprocessing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

def test_preprocessing():
post_frame = pd.DataFrame(POSTS)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.8,
max_errors_per_word=0.5)
filtered = tppp.preprocess(post_frame, ncores=1, min_en_prob=0.5,
max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered)

Expand All @@ -17,7 +18,8 @@ def test_preprocessing_parallel():
post_frame = pd.DataFrame([POSTS[0] for _ in range(100)])
post_frame['permalink'] = ['kkk'+str(irun % 50) for irun in range(100)]
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=20,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 40

Expand All @@ -26,9 +28,10 @@ def test_preprocessing_random_parallel():
posts = create_n_random_posts(50)
post_frame = pd.DataFrame(posts)
filtered = tppp.preprocess(post_frame, ncores=5, chunksize=10,
min_en_prob=0.8, max_errors_per_word=0.5)
min_en_prob=0.5, max_errors_per_word=0.5,
min_max_num_words=(10, 99999))

assert len(filtered) > 30
assert len(filtered) > 20


def test_bid_bot_correction():
Expand Down
73 changes: 15 additions & 58 deletions trufflepig/bchain/getdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,17 @@
from json import JSONDecodeError

from trufflepig.utils import progressbar, none_retry, error_retry
import trufflepig.persist as tppe


logger = logging.getLogger(__name__)


MIN_CHARACTERS = 500

EXCLUSION_VOTERS_SET = {'cheetah'}
FILENAME_TEMPLATE = 'steemit_posts__{time}.sqlite'

FILENAME_TEMPLATE = 'steemit_posts__{time}.gz'
TABLENAME = 'steemit_posts'


################################### Block Utils #################################
Expand Down Expand Up @@ -176,34 +177,13 @@ def extract_authors_and_permalinks(operations):
return authors_and_permalinks


def exclude_if_voted_by(active_votes, voted_by):
    """Checks if a post has been voted by someone like cheetah.

    Parameters
    ----------
    active_votes: list of dict
        List of all votes cast; each dict carries a 'voter' key.
    voted_by: set
        Set of voters that lead to exclusion, e.g. cheetah

    Returns
    -------
    bool if post should be excluded
    """
    for vote in active_votes:
        if vote['voter'] in voted_by:
            return True
    return False


def get_post_data(authors_and_permalinks, steem, exclusion_voters):
def get_post_data(authors_and_permalinks, steem):
""" Queries posts from `steem`
Parameters
----------
authors_and_permalinks: set of tuples of authors and permalink strings
steem: Steem object
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
Returns
-------
Expand Down Expand Up @@ -237,15 +217,14 @@ def get_post_data(authors_and_permalinks, steem, exclusion_voters):
steem.reconnect()
continue

if exclude_if_voted_by(p.active_votes, exclusion_voters):
logger.debug('Excluding {} by {} because voted by '
'{}'.format(permalink, author, exclusion_voters))
continue
# Add positive votes and subtract negative
votes = sum(1 if x['percent'] > 0 else -1 for x in p.active_votes)
votes = max(votes, 0)

post = {
'title': p.title,
'reward': p.reward.amount,
'votes': len([x for x in p.active_votes if x['percent'] > 0]),
'votes':votes,
'active_votes': p.active_votes,
'created': p.created,
'tags': p.tags,
Expand All @@ -259,16 +238,13 @@ def get_post_data(authors_and_permalinks, steem, exclusion_voters):


def get_all_posts_from_block(block_num, steem,
exclusion_voters,
exclude_authors_and_permalinks=None):
""" Gets all posts from one block
Parameters
----------
block_num: int
steem: MPSteem
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
exclude_authors_and_permalinks: set of tuples of strings
Exclude these authors and permalinks to get less duplicates
Expand All @@ -284,8 +260,7 @@ def get_all_posts_from_block(block_num, steem,
if exclude_authors_and_permalinks:
authors_and_permalinks -= exclude_authors_and_permalinks
if authors_and_permalinks:
return get_post_data(authors_and_permalinks, steem,
exclusion_voters), authors_and_permalinks
return get_post_data(authors_and_permalinks, steem), authors_and_permalinks
else:
logger.debug('Could not find any posts for block {}'.format(block_num))
else:
Expand All @@ -297,7 +272,6 @@ def get_all_posts_from_block(block_num, steem,


def get_all_posts_between(start_datetime, end_datetime, steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
stop_after=None):
""" Queries all posts found in blocks between start and end
Expand All @@ -306,8 +280,6 @@ def get_all_posts_between(start_datetime, end_datetime, steem,
start_datetime: datetime
end_datetime: datetime
steem: Steem
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
stop_after: int or None
For debugging and shorter tests, stop after only a few iterations
Expand All @@ -330,7 +302,6 @@ def get_all_posts_between(start_datetime, end_datetime, steem,
for idx, block_num in enumerate(range(start_num, end_num+1)):
posts_in_block, authors_and_permalinks = get_all_posts_from_block(block_num,
steem,
exclusion_voters,
exclude_authors_and_permalinks)
exclude_authors_and_permalinks |= authors_and_permalinks
posts.extend(posts_in_block)
Expand All @@ -350,15 +321,14 @@ def config_mp_logging(level=logging.INFO):
logging.basicConfig(level=level)


def _get_all_posts_for_blocks_parallel(block_nums, steem, exclusion_voters,
def _get_all_posts_for_blocks_parallel(block_nums, steem,
stop_after=None):
"""Helper wrapper for multiprocessing"""
posts = []
exclude_authors_and_permalinks = set()
for block_num in block_nums:
posts_in_block, authors_and_permalinks = get_all_posts_from_block(block_num,
steem,
exclusion_voters,
exclude_authors_and_permalinks)
exclude_authors_and_permalinks |= authors_and_permalinks
posts.extend(posts_in_block)
Expand All @@ -368,7 +338,6 @@ def _get_all_posts_for_blocks_parallel(block_nums, steem, exclusion_voters,


def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
stop_after=None, ncores=8,
chunksize=20, timeout=1200):
"""As above but in parallel with `ncores` jobs of `chunksize`.
Expand All @@ -395,7 +364,6 @@ def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
for idx, chunk in enumerate(chunks):
result = pool.apply_async(_get_all_posts_for_blocks_parallel,
args=(chunk, steem,
exclusion_voters,
stop_after))
async_results.append(result)
if stop_after is not None and idx >= stop_after:
Expand All @@ -420,7 +388,6 @@ def get_all_posts_between_parallel(start_datetime, end_datetime, steem,


def load_or_scrape_full_day(date, steem, directory,
exclusion_voters=EXCLUSION_VOTERS_SET,
overwrite=False,
store=True, stop_after=None, ncores=1):
""" Loads posts of a full day or queries them from steem blockchain
Expand All @@ -432,8 +399,6 @@ def load_or_scrape_full_day(date, steem, directory,
steem: Steem object
directory: str
Directory to load posts from
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
overwrite: bool
If stored posts should be replaced
store: bool
Expand All @@ -456,30 +421,30 @@ def load_or_scrape_full_day(date, steem, directory,
filename = os.path.join(directory,filename)
if os.path.isfile(filename) and not overwrite:
logger.info('Found file {} will load it'.format(filename))
post_frame = pd.read_pickle(filename, compression='gzip')
post_frame = tppe.from_sqlite(filename=filename,
tablename=TABLENAME)
else:
logger.info('File {} not found, will start scraping'.format(filename))

if ncores == 1:
posts = get_all_posts_between(start_datetime, end_datetime, steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after)
else:
posts = get_all_posts_between_parallel(start_datetime, end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after,
ncores=ncores)

post_frame = pd.DataFrame(data=posts, columns=sorted(posts[0].keys()))
if store:
logger.info('Storing file {} to disk'.format(filename))
post_frame.to_pickle(filename, compression='gzip')
tppe.to_sqlite(post_frame,
filename=filename,
tablename=TABLENAME)
return post_frame


def load_or_scrape_training_data(steem, directory,
exclusion_voters=EXCLUSION_VOTERS_SET,
days=20, offset_days=8,
ncores=8,
current_datetime=None,
Expand All @@ -491,8 +456,6 @@ def load_or_scrape_training_data(steem, directory,
----------
steem: Steem object
directory: str
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
days: int
Number of consecutive days to load or scrape
offset_days: int
Expand Down Expand Up @@ -522,7 +485,6 @@ def load_or_scrape_training_data(steem, directory,
next_date = (start_datetime + pd.Timedelta(days=day)).date()
frame = load_or_scrape_full_day(next_date, steem,
directory,
exclusion_voters=exclusion_voters,
overwrite=False,
store=store,
stop_after=stop_after,
Expand All @@ -536,7 +498,6 @@ def load_or_scrape_training_data(steem, directory,


def scrape_hour_data(steem,
exclusion_voters=EXCLUSION_VOTERS_SET,
hours=24,
offset_hours=24,
current_datetime=None,
Expand All @@ -546,8 +507,6 @@ def scrape_hour_data(steem,
Parameters
----------
steem: Steem or kwargs
exclusion_voters: set
If post is voted by any of them it is excluded (cheetah list!)
hours: int
Number of consecutive hours to scrape
offset_hours: int
Expand All @@ -574,13 +533,11 @@ def scrape_hour_data(steem,
posts = get_all_posts_between(start_datetime,
end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after)
else:
posts = get_all_posts_between_parallel(start_datetime,
end_datetime,
steem,
exclusion_voters=exclusion_voters,
stop_after=stop_after,
ncores=ncores)

Expand Down
10 changes: 6 additions & 4 deletions trufflepig/bchain/postoncall.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

import trufflepig.bchain.posts as tfbp
import trufflepig.bchain.getdata as tfgd
import trufflepig.filters.textfilters as tftf
from trufflepig.utils import error_retry
import trufflepig.preprocessing as tppp


logger = logging.getLogger(__name__)
Expand All @@ -18,7 +20,7 @@

def post_on_call(post_frame, account, steem, topN_link,
overview_permalink,
exclusion_set=tfgd.EXCLUSION_VOTERS_SET,
filter_voters=tppp.FILTER_VOTERS,
sleep_time=20.1):
""" Replies to users calling @trufflepig
Expand All @@ -28,7 +30,7 @@ def post_on_call(post_frame, account, steem, topN_link,
account: str
steem: Steem object
topN_link: str
exclusion_set: set of str
filter_voters: set of str
sleep_time: float
Bot can only post every 20 seconds,
should only be lowered for debugging
Expand All @@ -45,8 +47,8 @@ def post_on_call(post_frame, account, steem, topN_link,
comment.commit.no_broadcast = steem.commit.no_broadcast
# Wait a bit Steemit nodes hate comments in quick succession
time.sleep(sleep_time)
if not tfgd.exclude_if_voted_by(row.active_votes, {account}):
if row.passed and not tfgd.exclude_if_voted_by(row.active_votes, exclusion_set):
if not tftf.voted_by(row.active_votes, {account}):
if row.passed and not tftf.voted_by(row.active_votes, filter_voters):

logger.info('Voting and commenting on https://steemit.com/@{author}/{permalink}'
''.format(author=row.author, permalink=row.permalink))
Expand Down
3 changes: 3 additions & 0 deletions trufflepig/filters/textfilters.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,6 @@ def is_in_filter_tags(tags, filter_tags):
logger.exception('Could not identify tags {}, '
'will return True anyway'.format(tags))
return True

def voted_by(active_votes, voter_set):
    """Return True if any vote in `active_votes` was cast by a voter in `voter_set`.

    Parameters
    ----------
    active_votes: list of dict
        Votes cast on a post; each dict carries at least a 'voter' key.
    voter_set: set of str
        Voter names to test membership against (e.g. {'cheetah'}).

    Returns
    -------
    bool
        True as soon as one vote's 'voter' is in `voter_set`, else False.
    """
    return any(vote['voter'] in voter_set for vote in active_votes)

0 comments on commit 8a8b702

Please sign in to comment.