diff --git a/scripts/create_raw_data_fixture.py b/scripts/create_raw_data_fixture.py
new file mode 100644
index 0000000..98bd22f
--- /dev/null
+++ b/scripts/create_raw_data_fixture.py
@@ -0,0 +1,29 @@
+"""Build the raw-data fixture by scraping (or loading) a fixed date window.
+
+The reference date is pinned so that repeated runs request the same days.
+"""
+import os
+
+from trufflepig import config
+import trufflepig.bchain.getdata as tpbg
+
+
+def main():
+    """Call the parallel scrape-or-load helper and return its result."""
+    directory = os.path.join(config.PROJECT_DIRECTORY, 'scraped_data')
+    return tpbg.scrape_or_load_training_data_parallel(
+        [config.NODE_URL],
+        directory,
+        days=20,
+        stop_after=100,
+        ncores=5,
+        # Pinned reference date; given as a string and parsed by the helper.
+        current_datetime='2018-02-11',
+    )
+
+
+if __name__ == '__main__':
+    # Keep the scrape out of import time: with ncores=5 the helper presumably
+    # starts worker processes, and spawn-based platforms re-import this module
+    # in each worker.
+    main()
diff --git a/trufflepig/bchain/getdata.py b/trufflepig/bchain/getdata.py
index 495744d..d69d9bb 100644
--- a/trufflepig/bchain/getdata.py
+++ b/trufflepig/bchain/getdata.py
@@ -233,7 +233,11 @@ def scrape_or_load_full_day(date, steem, directory, overwrite=False,
     start_datetime = pd.to_datetime(date)
     end_datetime = start_datetime + pd.Timedelta(days=1)
     if not os.path.isdir(directory):
-        os.makedirs(directory)
+        try:
+            os.makedirs(directory)
+        except FileExistsError:
+            # race conditions
+            pass
     filename = FILENAME_TEMPLATE.format(year=start_datetime.year,
                                         month=start_datetime.month,
                                         day=start_datetime.day)
@@ -278,6 +282,8 @@ def scrape_or_load_training_data_parallel(node_urls, directory,
 
     if current_datetime is None:
         current_datetime = pd.datetime.utcnow()
+    else:
+        current_datetime = pd.to_datetime(current_datetime)
 
     start_datetime = current_datetime - pd.Timedelta(days=days + offset)
 