Skip to content

Commit

Permalink
now dropping old posts as well
Browse files: browse the repository at this point in the history
  • Loading branch information
Robert Meyer committed Apr 16, 2018
1 parent a0ee3b6 commit 4824ed7
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
8 changes: 8 additions & 0 deletions trufflepig/bchain/getdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ def load_or_scrape_training_data(steem, directory,
current_datetime = pd.to_datetime(current_datetime)

start_datetime = current_datetime - pd.Timedelta(days=days + offset_days)
end_datetime = current_datetime - pd.Timedelta(days=offset_days)

frames = []
for day in range(days):
Expand All @@ -503,10 +504,17 @@ def load_or_scrape_training_data(steem, directory,
# the default indices are duplicates!
frame.reset_index(inplace=True, drop=True)
filter_date = start_datetime.date()

to_drop = frame.loc[frame.created < filter_date, :]
logger.info('Dropping {} posts not created in time '
'window, but before {}'.format(len(to_drop), filter_date))
frame.drop(to_drop.index, inplace=True)

to_drop = frame.loc[frame.created > end_datetime, :]
logger.info('Dropping {} posts not created in time '
'window, but after {}'.format(len(to_drop), end_datetime))
frame.drop(to_drop.index, inplace=True)

return frame


Expand Down
2 changes: 1 addition & 1 deletion trufflepig/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

FILTER_TAGS = ('mitnebcurationtrail', 'informationwar', 'truth', 'conspiracy',
'vaccines', 'contest', 'giveaway', 'deutsch', 'kr', 'kr-newbie',
'nsfw', 'sex', 'daily', 'photofeed',
'nsfw', 'sex', 'daily', 'photofeed', 'gambling',
# other weird stuff
'steemsilvergold', 'horoscope', 'guns', 'investing', 'tib',
# Somehow religious texts do not work in combination with others
Expand Down

0 comments on commit 4824ed7

Please sign in to comment.