New methods to get posts from the bchain

SmokinCaterpillar · Feb 11, 2018 · 9f53a5b · 9f53a5b
1 parent 11bf0ff
commit 9f53a5b
Show file tree

Hide file tree

Showing 4 changed files with 332 additions and 2 deletions.
diff --git a/integration_tests/bchain/getdata_test.py b/integration_tests/bchain/getdata_test.py
@@ -38,3 +38,11 @@ def test_find_offset(steem, bchain):
     latest_block_num = bchain.get_current_block_num()
     offset, datetime = tpbg.find_nearest_block_num(target, steem, latest_block_num)
     assert 0 < offset <  latest_block_num
+
+
+def test_get_all_posts_between(steem):
+    """Checks that scraping the last minute of the bchain yields posts.
+
+    Parameters
+    ----------
+    steem: Steem object handed to `tpbg.get_all_posts_between`
+
+    """
+    # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0.
+    # A UTC Timestamp with the timezone stripped is the same naive "now".
+    end = pd.Timestamp.now(tz='UTC').tz_localize(None)
+    start = end - pd.Timedelta(minutes=1)
+    posts = tpbg.get_all_posts_between(start, end, steem)
+    assert posts
diff --git a/scripts/get_all_posts_from_today.py b/scripts/get_all_posts_from_today.py
@@ -0,0 +1,16 @@
+import logging
+import pandas as pd
+from steem import Steem
+
+from trufflepig import config
+
+import trufflepig.bchain.getdata as tpbg
+
+# Log at INFO level; `get_all_posts_between` reports its progress via logging.
+logging.basicConfig(level=logging.INFO)
+
+steem = Steem(nodes=[config.NODE_URL])
+
+# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0.
+# A UTC Timestamp with the timezone stripped is the same naive "now".
+now = pd.Timestamp.now(tz='UTC').tz_localize(None)
+end = now
+# Scrape the posts of the last 24 hours.
+start = end - pd.Timedelta(hours=24)
+posts = tpbg.get_all_posts_between(start, end, steem)
diff --git a/trufflepig/bchain/getdata.py b/trufflepig/bchain/getdata.py
@@ -3,10 +3,21 @@
 
 import pandas as pd
 from steem.blockchain import Blockchain
+from steem.post import Post
+import json
+from json import JSONDecodeError
+
+from trufflepig.utils import progressbar
 
 
 logger = logging.getLogger(__name__)
 
+# Minimum `len(body)` a comment operation needs in order to be kept by
+# `extract_authors_and_permalinks`.
+MIN_CHARACTERS = 1024
+
+
+def steem2bchain(steem):
+    """Returns a `Blockchain` object constructed from `steem`."""
+    bchain = Blockchain(steem)
+    return bchain
+
 
 def get_block_headers_between_offset_start(start_datetime, end_datetime,
                                            end_offset_num, steem):
@@ -46,7 +57,8 @@ def get_block_headers_between_offset_start(start_datetime, end_datetime,
     return headers
 
 
-def find_nearest_block_num(target_datetime, steem, latest_block_num,
+def find_nearest_block_num(target_datetime, steem,
+                           latest_block_num=None,
                            max_tries=5000,
                            block_num_tolerance=5):
     """ Finds nearest block number to `target_datetime`
@@ -57,6 +69,7 @@ def find_nearest_block_num(target_datetime, steem, latest_block_num,
     steem: Steem object
     latest_block_num: int
         latest block number in bchain
+        leave None to get from steem directly
     max_tries: int
         number of maximum tries
     block_num_tolerance: int
@@ -68,6 +81,9 @@ def find_nearest_block_num(target_datetime, steem, latest_block_num,
     datetime: datetime of matching block
 
     """
+    if latest_block_num is None:
+        latest_block_num = steem2bchain(steem).get_current_block_num()
+
     current_block_num = latest_block_num
     best_largest_block_num = latest_block_num
 
@@ -109,4 +125,96 @@ def get_block_headers_between(start_datetime, end_datetime, steem):
     end_offset_num, _ = find_nearest_block_num(end_datetime, steem, latest_block_num)
     return get_block_headers_between_offset_start(start_datetime, end_datetime,
                                                   steem=steem,
-                                                  end_offset_num=end_offset_num)
+                                                  end_offset_num=end_offset_num)
+
+
+def extract_authors_and_permalinks(operations):
+    """Filters `operations` for posts and returns their authors and permalinks.
+
+    A `comment` operation is kept if it has a non-empty title, non-empty
+    json metadata, a body of at least `MIN_CHARACTERS` characters and
+    metadata that decodes to something with a non-empty `tags` entry.
+    Everything else is skipped; skipped candidates are logged at DEBUG level.
+
+    Parameters
+    ----------
+    operations: iterable of dict
+        each item holds under `'op'` a pair of operation name and payload
+
+    Returns
+    -------
+    authors_and_permalinks: list of (author, permalink) tuples
+
+    """
+    authors_and_permalinks = []
+    for operation in operations:
+        op = operation['op']
+        if op[0] != 'comment':
+            continue
+
+        payload = op[1]
+        title = payload['title']
+        body = payload['body']
+        if (title == '' or payload['json_metadata'] == ''
+                or len(body) < MIN_CHARACTERS):
+            continue
+
+        try:
+            metadata = json.loads(payload['json_metadata'])
+        except JSONDecodeError:
+            logger.debug('Could not decode metadata for {}'.format(op))
+            continue
+
+        try:
+            tags = metadata['tags']
+        except KeyError:
+            logger.debug('No tags for for {}'.format(op))
+            continue
+        except TypeError:
+            # metadata decoded to something that is not a mapping
+            logger.debug('Type Error for for {}'.format(op))
+            continue
+
+        try:
+            _ = tags[0]
+        except IndexError:
+            logger.debug('Tags empty for {}'.format(op))
+            continue
+        except (KeyError, TypeError):
+            # The metadata is free-form JSON, so `tags` may be null, a
+            # number or an object; a single malformed post must not
+            # abort the scan of the whole block.
+            logger.debug('Malformed tags for {}'.format(op))
+            continue
+
+        authors_and_permalinks.append((payload['author'], payload['permlink']))
+    return authors_and_permalinks
+
+
+def get_post_data(authors_and_permalinks, steem):
+    """Loads the post behind every (author, permalink) pair.
+
+    Parameters
+    ----------
+    authors_and_permalinks: iterable of (author, permalink) tuples
+    steem: Steem object
+
+    Returns
+    -------
+    posts: list of dict
+        one dict per post that could be loaded, with the keys `title`,
+        `reward`, `votes`, `created`, `tags`, `body`, `author`
+        and `permalink`
+
+    """
+    posts = []
+    for author, permalink in authors_and_permalinks:
+        identifier = '@{}/{}'.format(author, permalink)
+        try:
+            p = Post(identifier, steem)
+        except Exception as e:
+            # Best effort: a post that cannot be loaded is skipped. The
+            # failure goes to the module logger rather than to stdout so
+            # it ends up where the rest of this module's output goes.
+            logger.warning('Could not load post {}: {!r}'.format(identifier, e))
+            continue
+
+        posts.append({
+            'title': p.title,
+            'reward': p.reward.amount,
+            'votes': len(p.active_votes),
+            'created': p.created,
+            'tags': p.tags,
+            'body': p.body,
+            'author': author,
+            'permalink': permalink
+        })
+    return posts
+
+
+def get_all_posts_from_block(block_num, steem):
+    """Returns the data of all posts found in block `block_num`.
+
+    Parameters
+    ----------
+    block_num: number of the block to scan
+    steem: Steem object
+
+    Returns
+    -------
+    posts: list as returned by `get_post_data`, empty if the block has no
+        operations or none of them qualifies as a post
+
+    """
+    block_operations = steem.get_ops_in_block(block_num, False)
+    if not block_operations:
+        logger.warning('Could not find any operations for block {}'.format(block_num))
+        return []
+
+    candidates = extract_authors_and_permalinks(block_operations)
+    if not candidates:
+        logger.debug('Could not find any posts for block {}'.format(block_num))
+        return []
+
+    return get_post_data(candidates, steem)
+
+
+def get_all_posts_between(start_datetime, end_datetime, steem):
+    """Scrapes all posts from the blocks between two points in time.
+
+    The blocks nearest to `start_datetime` and `end_datetime` are looked up
+    with `find_nearest_block_num`; both of them and every block in between
+    are scanned.
+
+    Parameters
+    ----------
+    start_datetime: datetime
+    end_datetime: datetime
+    steem: Steem object
+
+    Returns
+    -------
+    posts: list of post dicts as returned by `get_all_posts_from_block`
+
+    """
+    start_num, _ = find_nearest_block_num(start_datetime, steem)
+    end_num, _ = find_nearest_block_num(end_datetime, steem)
+
+    # The range includes both boundary blocks, so the number of steps is
+    # `end_num - start_num + 1`. Taking it from the range keeps the
+    # progressbar total in sync with the loop and prevents a total of 0
+    # (division by zero in the progressbar) when both datetimes map to
+    # the same block.
+    block_nums = range(start_num, end_num + 1)
+    total = len(block_nums)
+    posts = []
+    logger.info('Querying all posts between '
+                '{} (block {}) and {} (block {})'.format(start_datetime,
+                                                         start_num,
+                                                         end_datetime,
+                                                         end_num))
+    for idx, block_num in enumerate(block_nums):
+        posts.extend(get_all_posts_from_block(block_num, steem))
+        progressbar(idx, total, percentage_step=1, logger=logger)
+
+    logger.info('Scraped {} posts'.format(len(posts)))
+    return posts
+
diff --git a/trufflepig/utils.py b/trufflepig/utils.py
@@ -0,0 +1,198 @@
+import datetime
+import logging
+
+import numpy as np
+
+
+
+class _Progressbar(object):
+    """Implements a progress bar.
+
+    This class is supposed to be a singleton. Do not
+    import the class itself but use the `progressbar` function from this module.
+
+    """
+    def __init__(self):
+        self._start_time = None   # Time of start/reset
+        self._start_index = None  # Index of start/reset
+        # Current index; starting at infinity forces a reset on the first call
+        self._current_index = np.inf
+        self._percentage_step = None  # Percentage step for bar update
+        self._total = None  # Total steps of the bar (float) not to be mistaken for length
+        self._total_minus_one = None  # (int) the above minus 1
+        self._length = None  # Length of the percentage bar in `=` signs
+        self._norm_factor = None  # Normalization factor: steps per bar update
+        self._current_interval = None  # The current interval,
+        # to check if bar needs to be updated
+
+    def _interval(self, index):
+        """Returns the number of the update interval `index` falls into.
+
+        A normalization factor of zero (`total` or `percentage_step` is 0)
+        would divide by zero; in that case every index maps to interval 0,
+        so the bar is only emitted on reset and at the end.
+
+        """
+        if not self._norm_factor:
+            return 0
+        return int((index + 1.0) / self._norm_factor)
+
+    def _reset(self, index, total, percentage_step, length):
+        """Resets the progressbar to start a new one"""
+        self._start_time = datetime.datetime.now()
+        self._start_index = index
+        self._current_index = index
+        self._percentage_step = percentage_step
+        self._total = float(total)
+        self._total_minus_one = total - 1
+        self._length = length
+        self._norm_factor = total * percentage_step / 100.0
+        self._current_interval = self._interval(index)
+
+    def _get_remaining(self, index):
+        """Calculates remaining time as a string
+
+        The time elapsed since the last reset is extrapolated linearly to
+        the steps that are left. Returns an empty string if no step has
+        been made since the reset, because no estimate is possible then.
+
+        """
+        elapsed = datetime.datetime.now() - self._start_time
+        total_seconds = elapsed.total_seconds()
+        try:
+            remaining_seconds = int((self._total - self._start_index - 1.0) *
+                                    total_seconds / float(index - self._start_index) -
+                                    total_seconds)
+        except ZeroDivisionError:
+            return ''
+        remaining_delta = datetime.timedelta(seconds=remaining_seconds)
+        return ', remaining: ' + str(remaining_delta)
+
+    def __call__(self, index, total, percentage_step=5, logger='print', log_level=logging.INFO,
+                 reprint=False, time=True, length=20, fmt_string=None,  reset=False):
+        """Plots a progress bar to the given `logger` for large for loops.
+
+        To be used inside a for-loop at the end of the loop.
+
+        :param index: Current index of for-loop
+        :param total: Total size of for-loop
+        :param percentage_step: Percentage step with which the bar should be updated
+        :param logger:
+
+            Logger to write to, if string 'print' is given, the print statement is
+            used. Use None if you don't want to print or log the progressbar statement.
+            Any other string is taken as the name of the logger to use.
+
+        :param log_level: Log level with which to log.
+        :param reprint:
+
+            If no new line should be plotted but carriage return (works only for printing)
+
+        :param time: If the remaining time should be calculated and displayed
+        :param length: Length of the bar in `=` signs.
+        :param fmt_string:
+
+            A string which contains exactly one `%s` in order to incorporate the progressbar.
+            If such a string is given, ``fmt_string % progressbar`` is printed/logged.
+
+        :param reset:
+
+            If the progressbar should be restarted. The progressbar is automatically
+            restarted if it is called with an index that is not larger than the one
+            before or with a different `total`.
+
+        :return:
+
+            The progressbar string or None if the string has not been updated.
+
+
+        """
+        reset = (reset or
+                 index <= self._current_index or
+                 total != self._total)
+        if reset:
+            self._reset(index, total, percentage_step, length)
+
+        statement = None
+        next_interval = self._interval(index)
+        ending = index >= self._total_minus_one
+
+        if next_interval > self._current_interval or ending or reset:
+            if ending:
+                statement = '[' + '=' * self._length + ']100.0%'
+            else:
+                fraction = (index + 1.0) / self._total
+                bars = int(fraction * self._length)
+                spaces = self._length - bars
+                statement = ('[' + '=' * bars +
+                             ' ' * spaces + ']' + ' %4.1f' % (fraction * 100.0) + '%')
+                # Directly after a reset there is nothing to extrapolate
+                # from, so the estimate is only appended on later updates.
+                if time and not reset:
+                    statement += self._get_remaining(index)
+
+            if fmt_string:
+                statement = fmt_string % statement
+            if logger == 'print':
+                if reprint:
+                    print('\r' + statement, end='', flush=True)
+                else:
+                    print(statement)
+            elif logger is not None:
+                if isinstance(logger, str):
+                    logger = logging.getLogger(logger)
+                logger.log(msg=statement, level=log_level)
+
+        self._current_interval = next_interval
+        self._current_index = index
+
+        return statement
+
+
+# Single module-level instance that the `progressbar` function delegates to.
+_progressbar = _Progressbar()
+
+
+def progressbar(index, total, percentage_step=10, logger='print', log_level=logging.INFO,
+                 reprint=True, time=True, length=20, fmt_string=None, reset=False):
+    """Shows the progress of a large for-loop as a bar on the given `logger`.
+
+    Call it as the last statement of the loop body:
+
+    .. code-block:: python
+
+        for irun in range(42):
+            my_costly_job() # Your expensive function
+            progressbar(index=irun, total=42, reprint=True) # shows a growing progressbar
+
+
+    The progressbar does not have to be initialised before the for-loop
+    and resets itself automatically when it is used in another for-loop.
+
+    :param index: Current index of for-loop
+    :param total: Total size of for-loop
+    :param percentage_step: Steps (in percent) with which the bar should be plotted
+    :param logger:
+
+        Logger to write to, logged with `log_level`. If string 'print' is given,
+        the print statement is used. Use ``None`` if you don't want to print or
+        log the progressbar statement.
+
+    :param log_level: Log level with which to log.
+    :param reprint:
+
+        If no new line should be plotted but carriage return (works only for printing)
+
+    :param time: If the remaining time should be estimated and displayed
+    :param length: Length of the bar in `=` signs.
+    :param fmt_string:
+
+        A string which contains exactly one `%s` in order to incorporate the progressbar.
+        If such a string is given, ``fmt_string % progressbar`` is printed/logged.
+
+    :param reset:
+
+        If the progressbar should be restarted. If progressbar is called with a lower
+        index than the one before, the progressbar is automatically restarted.
+
+    :return:
+
+        The progressbar string or `None` if the string has not been updated.
+
+    """
+    # Delegate to the module's shared bar instance; the arguments are passed
+    # in the order of the parameters of its `__call__`.
+    return _progressbar(index, total, percentage_step, logger, log_level,
+                        reprint, time, length, fmt_string, reset)