Skip to content

Commit

Permalink
New methods to get posts from the bchain
Browse files Browse the repository at this point in the history
  • Loading branch information
Robert Meyer committed Feb 11, 2018
1 parent 11bf0ff commit 9f53a5b
Show file tree
Hide file tree
Showing 4 changed files with 332 additions and 2 deletions.
8 changes: 8 additions & 0 deletions integration_tests/bchain/getdata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,11 @@ def test_find_offset(steem, bchain):
latest_block_num = bchain.get_current_block_num()
offset, datetime = tpbg.find_nearest_block_num(target, steem, latest_block_num)
assert 0 < offset < latest_block_num


def test_get_all_posts_between(steem):
    """Integration check: querying the last minute of the chain yields posts."""
    # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0. Build the
    # current UTC time as a timezone-naive Timestamp instead, which matches
    # what `datetime.utcnow()` used to provide.
    end = pd.Timestamp.now(tz='UTC').tz_localize(None)
    start = end - pd.Timedelta(minutes=1)
    posts = tpbg.get_all_posts_between(start, end, steem)
    assert posts
16 changes: 16 additions & 0 deletions scripts/get_all_posts_from_today.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import logging
import pandas as pd
from steem import Steem

from trufflepig import config

import trufflepig.bchain.getdata as tpbg

logging.basicConfig(level=logging.INFO)

steem = Steem(nodes=[config.NODE_URL])

# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0. Build the
# current UTC time as a timezone-naive Timestamp instead, which matches what
# `datetime.utcnow()` used to provide.
now = pd.Timestamp.now(tz='UTC').tz_localize(None)
end = now
# Query window: the 24 hours leading up to now
start = end - pd.Timedelta(hours=24)
posts = tpbg.get_all_posts_between(start, end, steem)
112 changes: 110 additions & 2 deletions trufflepig/bchain/getdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,21 @@

import pandas as pd
from steem.blockchain import Blockchain
from steem.post import Post
import json
from json import JSONDecodeError

from trufflepig.utils import progressbar


# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Minimum body length (in characters) a comment operation must have to be
# picked up by `extract_authors_and_permalinks`
MIN_CHARACTERS = 1024


def steem2bchain(steem):
    """Builds a `Blockchain` object on top of the given Steem object

    Parameters
    ----------
    steem: Steem object

    Returns
    -------
    Blockchain object created from `steem`

    """
    bchain = Blockchain(steem)
    return bchain


def get_block_headers_between_offset_start(start_datetime, end_datetime,
end_offset_num, steem):
Expand Down Expand Up @@ -46,7 +57,8 @@ def get_block_headers_between_offset_start(start_datetime, end_datetime,
return headers


def find_nearest_block_num(target_datetime, steem, latest_block_num,
def find_nearest_block_num(target_datetime, steem,
latest_block_num=None,
max_tries=5000,
block_num_tolerance=5):
""" Finds nearest block number to `target_datetime`
Expand All @@ -57,6 +69,7 @@ def find_nearest_block_num(target_datetime, steem, latest_block_num,
steem: Steem object
latest_block_num: int
latest block number in bchain
leave None to get from steem directly
max_tries: int
number of maximum tries
block_num_tolerance: int
Expand All @@ -68,6 +81,9 @@ def find_nearest_block_num(target_datetime, steem, latest_block_num,
datetime: datetime of matching block
"""
if latest_block_num is None:
latest_block_num = steem2bchain(steem).get_current_block_num()

current_block_num = latest_block_num
best_largest_block_num = latest_block_num

Expand Down Expand Up @@ -109,4 +125,96 @@ def get_block_headers_between(start_datetime, end_datetime, steem):
end_offset_num, _ = find_nearest_block_num(end_datetime, steem, latest_block_num)
return get_block_headers_between_offset_start(start_datetime, end_datetime,
steem=steem,
end_offset_num=end_offset_num)
end_offset_num=end_offset_num)


def extract_authors_and_permalinks(operations, min_characters=None):
    """ Collects (author, permlink) pairs of tagged posts from operations

    An operation is kept only if it is a ``comment`` operation with a
    non-empty title, a body of at least `min_characters` characters and
    non-empty JSON metadata that decodes to an object whose ``tags`` entry
    has a first element.

    Parameters
    ----------
    operations: iterable of dict
        each item provides ``operation['op']`` as a (name, payload) pair
    min_characters: int
        minimum body length, leave None to use `MIN_CHARACTERS`

    Returns
    -------
    list of (author, permlink) tuples in order of occurrence,
    duplicates are not removed

    """
    if min_characters is None:
        min_characters = MIN_CHARACTERS

    authors_and_permalinks = []
    for operation in operations:
        op = operation['op']
        if op[0] != 'comment':
            continue

        payload = op[1]
        title = payload['title']
        body = payload['body']
        raw_metadata = payload['json_metadata']
        if title == '' or raw_metadata == '' or len(body) < min_characters:
            continue

        try:
            metadata = json.loads(raw_metadata)
        except JSONDecodeError:
            logger.debug('Could not decode metadata for %s', op)
            continue

        try:
            tags = metadata['tags']
        except KeyError:
            logger.debug('No tags for for %s', op)
            continue
        except TypeError:
            # metadata decoded to something that is not a mapping
            logger.debug('Type Error for for %s', op)
            continue

        try:
            _ = tags[0]
        except IndexError:
            logger.debug('Tags empty for %s', op)
            continue
        except (TypeError, KeyError):
            # The metadata is free-form JSON, so `tags` may be null, a number
            # or an object; treat those like missing tags instead of crashing
            logger.debug('Tags malformed for %s', op)
            continue

        authors_and_permalinks.append((payload['author'], payload['permlink']))
    return authors_and_permalinks


def get_post_data(authors_and_permalinks, steem):
    """ Loads the posts identified by the given authors and permalinks

    Posts that cannot be loaded are skipped (best effort) and the failure
    is logged as a warning.

    Parameters
    ----------
    authors_and_permalinks: iterable of (author, permalink) tuples
    steem: Steem object

    Returns
    -------
    list of dict, one per successfully loaded post, with the keys
    title, reward, votes, created, tags, body, author and permalink

    """
    posts = []
    for author, permalink in authors_and_permalinks:
        try:
            p = Post('@{}/{}'.format(author, permalink), steem)
        except Exception as e:
            # Deliberately broad: a single broken post must not abort the
            # whole scrape. Report through the module logger like the rest
            # of this module instead of printing to stdout.
            logger.warning('Could not load post @%s/%s: %r',
                           author, permalink, e)
            continue

        posts.append({
            'title': p.title,
            'reward': p.reward.amount,
            'votes': len(p.active_votes),
            'created': p.created,
            'tags': p.tags,
            'body': p.body,
            'author': author,
            'permalink': permalink
        })
    return posts


def get_all_posts_from_block(block_num, steem):
    """ Returns the data of all posts found in a single block

    Parameters
    ----------
    block_num: int
        number of the block to inspect
    steem: Steem object

    Returns
    -------
    list of post dicts as produced by `get_post_data`,
    empty if the block has no operations or no matching posts

    """
    # NOTE(review): the second argument is presumably the `virtual_only`
    # flag of the steem API - confirm against the steem-python docs
    ops = steem.get_ops_in_block(block_num, False)
    if not ops:
        logger.warning('Could not find any operations for block {}'.format(block_num))
        return []

    candidates = extract_authors_and_permalinks(ops)
    if not candidates:
        logger.debug('Could not find any posts for block {}'.format(block_num))
        return []

    return get_post_data(candidates, steem)


def get_all_posts_between(start_datetime, end_datetime, steem):
    """ Scrapes all posts from the blocks between two points in time

    The block numbers nearest to both datetimes are looked up and every
    block in between, including both ends, is queried for posts.

    Parameters
    ----------
    start_datetime: datetime
    end_datetime: datetime
    steem: Steem object

    Returns
    -------
    list of post dicts as produced by `get_all_posts_from_block`

    """
    start_num, _ = find_nearest_block_num(start_datetime, steem)
    end_num, _ = find_nearest_block_num(end_datetime, steem)

    # The range below includes `end_num`, so the number of iterations is one
    # more than the difference. Using the plain difference made the bar hit
    # 100% one block early, and gave a total of 0 (division by zero inside
    # the progressbar) when both datetimes resolve to the same block.
    total = end_num - start_num + 1
    posts = []
    logger.info('Querying all posts between '
                '{} (block {}) and {} (block {})'.format(start_datetime,
                                                         start_num,
                                                         end_datetime,
                                                         end_num))
    for idx, block_num in enumerate(range(start_num, end_num + 1)):
        posts.extend(get_all_posts_from_block(block_num, steem))
        progressbar(idx, total, percentage_step=1, logger=logger)

    logger.info('Scraped {} posts'.format(len(posts)))
    return posts

198 changes: 198 additions & 0 deletions trufflepig/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import datetime
import logging

import numpy as np



class _Progressbar(object):
    """Implements a progress bar.

    This class is supposed to be a singleton. Do not
    import the class itself but use the `progressbar` function from this module.

    """
    def __init__(self):
        self._start_time = None   # Time of start/reset
        self._start_index = None   # Index of start/reset
        # Infinity makes ``index <= self._current_index`` true on the very
        # first call, which triggers the initial reset
        self._current_index = float('inf')   # Current index
        self._percentage_step = None   # Percentage step for bar update
        self._total = None  # Total steps of the bar (float) not to be mistaken for length
        self._total_minus_one = None  # (int) the above minus 1
        self._length = None  # Length of the percentage bar in `=` signs
        self._norm_factor = None   # Normalization factor
        self._current_interval = None  # The current interval,
        # to check if bar needs to be updated

    def _reset(self, index, total, percentage_step, length):
        """Resets to the progressbar to start a new one"""
        self._start_time = datetime.datetime.now()
        self._start_index = index
        self._current_index = index
        self._percentage_step = percentage_step
        self._total = float(total)
        self._total_minus_one = total - 1
        self._length = length
        self._norm_factor = total * percentage_step / 100.0
        self._current_interval = self._interval(index)

    def _interval(self, index):
        """Returns the update interval `index` falls into.

        A zero normalization factor (`total` or `percentage_step` of 0) would
        divide by zero; everything is put into interval 0 in that case, so only
        resets and the end of the bar produce output.

        """
        if not self._norm_factor:
            return 0
        return int((index + 1.0) / self._norm_factor)

    def _get_remaining(self, index):
        """Calculates remaining time as a string"""
        elapsed = (datetime.datetime.now() - self._start_time).total_seconds()
        try:
            # Linear extrapolation of the time spent since the last reset
            remaining_seconds = int((self._total - self._start_index - 1.0) *
                                    elapsed / float(index - self._start_index) -
                                    elapsed)
        except ZeroDivisionError:
            # No progress since the reset yet, nothing to extrapolate from
            return ''
        remaining_delta = datetime.timedelta(seconds=remaining_seconds)
        return ', remaining: ' + str(remaining_delta)

    def __call__(self, index, total, percentage_step=5, logger='print', log_level=logging.INFO,
                 reprint=False, time=True, length=20, fmt_string=None, reset=False):
        """Plots a progress bar to the given `logger` for large for loops.

        To be used inside a for-loop at the end of the loop.

        :param index: Current index of for-loop
        :param total: Total size of for-loop
        :param percentage_step: Percentage step with which the bar should be updated
        :param logger:

            Logger to write to, if string 'print' is given, the print statement is
            used. Use None if you don't want to print or log the progressbar statement.
            Any other string is taken as the name of the logger to fetch.

        :param log_level: Log level with which to log.
        :param reprint:

            If no new line should be plotted but carriage return (works only for printing)

        :param time: If the remaining time should be calculated and displayed
        :param length: Length of the bar in `=` signs.
        :param fmt_string:

            A string which contains exactly one `%s` in order to incorporate the progressbar.
            If such a string is given, ``fmt_string % progressbar`` is printed/logged.

        :param reset:

            If the progressbar should be restarted. The progressbar is also restarted
            automatically if it is called with an index that is not larger than the
            one before or with a different `total`.

        :return:

            The progressbar string or None if the string has not been updated.

        """
        reset = (reset or
                 index <= self._current_index or
                 total != self._total)
        if reset:
            self._reset(index, total, percentage_step, length)

        statement = None
        indexp1 = index + 1.0
        next_interval = self._interval(index)
        ending = index >= self._total_minus_one

        if next_interval > self._current_interval or ending or reset:
            if ending:
                statement = '[' + '=' * self._length + ']100.0%'
            else:
                fraction = indexp1 / self._total
                bars = int(fraction * self._length)
                spaces = self._length - bars
                percentage = fraction * 100.0
                statement = ('[' + '=' * bars +
                             ' ' * spaces + ']' + ' %4.1f' % percentage + '%')
                # Right after a reset there is no elapsed time to
                # extrapolate from, so the estimate is left out
                if time and not reset:
                    statement += self._get_remaining(index)

            if fmt_string:
                statement = fmt_string % statement
            if logger == 'print':
                if reprint:
                    print('\r' + statement, end='', flush=True)
                else:
                    print(statement)
            elif logger is not None:
                if isinstance(logger, str):
                    logger = logging.getLogger(logger)
                logger.log(msg=statement, level=log_level)

        self._current_interval = next_interval
        self._current_index = index

        return statement


# Module-level instance that the `progressbar` function delegates to
_progressbar = _Progressbar()


def progressbar(index, total, percentage_step=10, logger='print', log_level=logging.INFO,
                reprint=True, time=True, length=20, fmt_string=None, reset=False):
    """Plots a progress bar to the given `logger` for large for loops.

    Call it as the last statement of the loop body:

    .. code-block:: python

        for irun in range(42):
            my_costly_job() # Your expensive function
            progressbar(index=irun, total=42, reprint=True) # shows a growing progressbar

    No set-up is needed before the loop; all arguments are handed to the
    module-level progress bar instance, which restarts itself when it is
    used in another for-loop.

    :param index: Current index of for-loop
    :param total: Total size of for-loop
    :param percentage_step: Steps with which the bar should be plotted
    :param logger:

        Logger to write to, using `log_level`. If string 'print' is given, the print
        statement is used. Use ``None`` if you don't want to print or log the
        progressbar statement.

    :param log_level: Log level with which to log.
    :param reprint:

        If no new line should be plotted but carriage return (works only for printing)

    :param time: If the remaining time should be estimated and displayed
    :param length: Length of the bar in `=` signs.
    :param fmt_string:

        A string which contains exactly one `%s` in order to incorporate the progressbar.
        If such a string is given, ``fmt_string % progressbar`` is printed/logged.

    :param reset:

        If the progressbar should be restarted. If progressbar is called with a lower
        index than the one before, the progressbar is automatically restarted.

    :return:

        The progressbar string or `None` if the string has not been updated.

    """
    options = dict(percentage_step=percentage_step,
                   logger=logger,
                   log_level=log_level,
                   reprint=reprint,
                   time=time,
                   length=length,
                   fmt_string=fmt_string,
                   reset=reset)
    return _progressbar(index, total, **options)

0 comments on commit 9f53a5b

Please sign in to comment.