added scraping of days
Robert Meyer committed Feb 11, 2018
1 parent 6056ed6 commit c2a024a
Showing 5 changed files with 52 additions and 5 deletions.
17 changes: 15 additions & 2 deletions integration_tests/bchain/getdata_test.py
@@ -1,4 +1,7 @@
import pytest
import os
import shutil
import tempfile

import pandas as pd
from steem import Steem
@@ -43,6 +46,16 @@ def test_find_offset(steem, bchain):
def test_get_all_posts_between(steem):
now = pd.datetime.utcnow()
end = now
start = end - pd.Timedelta(minutes=1)
posts = tpbg.get_all_posts_between(start, end, steem)
start = end - pd.Timedelta(minutes=10)
posts = tpbg.get_all_posts_between(start, end, steem, stop_after=25)
assert posts


def test_scrape_date(steem):
yesterday = (pd.datetime.utcnow() - pd.Timedelta(days=1)).date()

directory = tempfile.mkdtemp()
tpbg.scrape_or_load_full_day(yesterday, steem, directory, stop_after=25)

assert len(os.listdir(directory))>0
shutil.rmtree(directory)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
pytest
numpy
pandas
pandas==0.22.0
python-coveralls
pytest-cov
langdetect==1.0.7
4 changes: 4 additions & 0 deletions trufflepig/__init__.py
@@ -0,0 +1,4 @@


__version__ = '0.1.0a'

30 changes: 29 additions & 1 deletion trufflepig/bchain/getdata.py
@@ -1,4 +1,5 @@
import logging
import os
from collections import OrderedDict

import pandas as pd
@@ -14,6 +15,8 @@

MIN_CHARACTERS = 1024

FILENAME_TEMPLATE = 'steemit_posts__{year:04d}-{month:02d}-{day:02d}.pkl'


def steem2bchain(steem):
return Blockchain(steem)
@@ -196,7 +199,8 @@ def get_all_posts_from_block(block_num, steem):
return []


def get_all_posts_between(start_datetime, end_datetime, steem):
def get_all_posts_between(start_datetime, end_datetime, steem,
stop_after=None):
start_num, _ = find_nearest_block_num(start_datetime, steem)
end_num, _ = find_nearest_block_num(end_datetime, steem)

@@ -214,7 +218,31 @@ def get_all_posts_between(start_datetime, end_datetime, steem):
logger.info('Finished block {} '
'(last is {}) found so far {} '
'posts...'.format(block_num, end_num, len(posts)))
if stop_after is not None and len(posts) >= stop_after:
break

logger.info('Scraped {} posts'.format(len(posts)))
return posts


def scrape_or_load_full_day(date, steem, directory, overwrite=False,
store=True,
stop_after=None):
start_datetime = pd.to_datetime(date)
end_datetime = start_datetime + pd.Timedelta(days=1)
if not os.path.isdir(directory):
os.makedirs(directory)
filename = FILENAME_TEMPLATE.format(year=start_datetime.year,
month=start_datetime.month,
day=start_datetime.day)
filename = os.path.join(directory,filename)
if os.path.isfile(filename) and not overwrite:
logger.info('Found file {} will load it'.format(filename))
post_frame = pd.read_pickle(filename, compression='gzip')
else:
posts = get_all_posts_between(start_datetime, end_datetime, steem,
stop_after=stop_after)
post_frame = pd.DataFrame(data=posts, columns=sorted(posts[0].keys()))
if store:
post_frame.to_pickle(filename, compression='gzip')
return post_frame
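
For context, a minimal driver sketch (not part of this commit) of how the new scrape_or_load_full_day could be called end to end. The Steem(nodes=[...]) construction and the tpbg import alias mirror the integration test above and are assumptions, not lines taken from this diff.

import tempfile

import pandas as pd
from steem import Steem

import trufflepig.bchain.getdata as tpbg  # assumed alias, as in the tests
from trufflepig.config import NODE_URL

steem = Steem(nodes=[NODE_URL])
yesterday = (pd.datetime.utcnow() - pd.Timedelta(days=1)).date()
directory = tempfile.mkdtemp()

# Scrapes yesterday's posts, or loads the cached pickle if the file already exists
post_frame = tpbg.scrape_or_load_full_day(yesterday, steem, directory,
                                          stop_after=25)
print(len(post_frame))
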
4 changes: 3 additions & 1 deletion trufflepig/config.py
@@ -1,3 +1,5 @@
import os

NODE_URL = os.environ.get('NODE_URL', 'https://api.steemit.com')
NODE_URL = os.environ.get('NODE_URL', 'https://api.steemit.com')

PROJECT_DIRECTORY = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
