In [2]:
from datetime import datetime
from pathlib import Path

import pandas as pd

In [3]:
# Read the six pre-processed text datasets (news, Reddit and tweets, each for
# BTC and ETH according to the file names) in one pass. The order of the
# targets on the left matches the order of the paths on the right.
(
    btc_news,
    eth_news,
    reddit_r_bitcoin,
    reddit_r_ethereum,
    btc_tweets,
    eth_tweets,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/btc_news_processed.parquet.gzip',
        './data/eth_news_processed.parquet.gzip',
        './data/reddit_r_bitcoin_processed.parquet.gzip',
        './data/reddit_r_ethereum_processed.parquet.gzip',
        './data/btc_tweets_processed.parquet.gzip',
        './data/eth_tweets_processed.parquet.gzip',
    )
)

In [4]:
# Read the BTC and ETH target frames; later cells use their
# `*_price_log_difference` columns and their index.
btc_targets, eth_targets = map(
    pd.read_parquet,
    (
        './data/btc_targets.parquet.gzip',
        './data/eth_targets.parquet.gzip',
    ),
)

In [5]:
# Build the BTC label frame: one row per entry of `btc_targets`, same index.
#
# Each index value is cast to int and parsed once with datetime.fromtimestamp
# (i.e. treated as a Unix timestamp in seconds), then split into calendar parts.
# NOTE(review): fromtimestamp() without a tz argument uses the machine's local
# timezone, so day boundaries depend on where this notebook runs -- confirm the
# text datasets' day/month/year columns were derived the same way.
btc_dates = [datetime.fromtimestamp(int(stamp)) for stamp in btc_targets.index]

btc_binary = pd.DataFrame(
    {
        # 1 when the log price difference is strictly positive, otherwise 0
        # (NaN also yields 0 because `NaN > 0` is False).
        'target': [int(delta > 0) for delta in btc_targets.btc_price_log_difference],
        'day': [moment.day for moment in btc_dates],
        'month': [moment.month for moment in btc_dates],
        'year': [moment.year for moment in btc_dates],
    },
    index=btc_targets.index,
)

In [6]:
# Build the ETH label frame: one row per entry of `eth_targets`, same index.
#
# Each index value is cast to int and parsed once with datetime.fromtimestamp
# (i.e. treated as a Unix timestamp in seconds), then split into calendar parts.
# NOTE(review): fromtimestamp() without a tz argument uses the machine's local
# timezone, so day boundaries depend on where this notebook runs -- confirm the
# text datasets' day/month/year columns were derived the same way.
eth_dates = [datetime.fromtimestamp(int(stamp)) for stamp in eth_targets.index]

eth_binary = pd.DataFrame(
    {
        # 1 when the log price difference is strictly positive, otherwise 0
        # (NaN also yields 0 because `NaN > 0` is False).
        'target': [int(delta > 0) for delta in eth_targets.eth_price_log_difference],
        'day': [moment.day for moment in eth_dates],
        'month': [moment.month for moment in eth_dates],
        'year': [moment.year for moment in eth_dates],
    },
    index=eth_targets.index,
)

In [7]:
def _texts_per_day(frame, text_column):
    """Collect all values of `text_column` into one list per calendar day.

    Rows of `frame` are grouped on its `day`, `month` and `year` columns; the
    values of `text_column` in each group are gathered into a Python list.
    The result is a flat frame with columns day, month, year and
    `text_column` (the group keys are restored as columns by reset_index).
    """
    daily = frame.groupby(['day', 'month', 'year'])[text_column]
    return daily.apply(list).reset_index()


# News items contribute their `title`, Reddit posts their `content`, and
# tweets their `content_cleaned` column.
btc_news_grouped = _texts_per_day(btc_news, 'title')
eth_news_grouped = _texts_per_day(eth_news, 'title')
reddit_r_bitcoin_grouped = _texts_per_day(reddit_r_bitcoin, 'content')
reddit_r_ethereum_grouped = _texts_per_day(reddit_r_ethereum, 'content')
btc_tweets_grouped = _texts_per_day(btc_tweets, 'content_cleaned')
eth_tweets_grouped = _texts_per_day(eth_tweets, 'content_cleaned')

In [8]:
def _label_daily_texts(labels, daily_texts, text_column):
    """Left-join the per-day text lists onto the label frame.

    Every row of `labels` is kept; days with no matching row in `daily_texts`
    end up with a missing value in the text column. Because the join is on
    the day/month/year columns, the index of `labels` is not carried over to
    the result. Afterwards `text_column` is renamed to `text` and `target`
    to `label`.
    """
    joined = labels.merge(daily_texts, how='left', on=['day', 'month', 'year'])
    return joined.rename(columns={'target': 'label', text_column: 'text'})


# BTC sources are paired with the BTC labels, ETH sources with the ETH labels.
btc_news_merged = _label_daily_texts(btc_binary, btc_news_grouped, 'title')
eth_news_merged = _label_daily_texts(eth_binary, eth_news_grouped, 'title')
reddit_r_bitcoin_merged = _label_daily_texts(btc_binary, reddit_r_bitcoin_grouped, 'content')
reddit_r_ethereum_merged = _label_daily_texts(eth_binary, reddit_r_ethereum_grouped, 'content')
btc_tweets_merged = _label_daily_texts(btc_binary, btc_tweets_grouped, 'content_cleaned')
eth_tweets_merged = _label_daily_texts(eth_binary, eth_tweets_grouped, 'content_cleaned')

In [9]:
# Persist every merged frame as a gzip-compressed parquet file under
# `data_merged/`, named `<variable name>.parquet.gzip`.
MERGED_DIR = Path('data_merged')

# Create the output directory up front: writing into a directory that does not
# exist fails, which would break a Restart & Run All on a fresh checkout.
MERGED_DIR.mkdir(parents=True, exist_ok=True)

merged_frames = {
    'btc_news_merged': btc_news_merged,
    'eth_news_merged': eth_news_merged,
    'reddit_r_bitcoin_merged': reddit_r_bitcoin_merged,
    'reddit_r_ethereum_merged': reddit_r_ethereum_merged,
    'btc_tweets_merged': btc_tweets_merged,
    'eth_tweets_merged': eth_tweets_merged,
}
for frame_name, merged_frame in merged_frames.items():
    merged_frame.to_parquet(MERGED_DIR / f'{frame_name}.parquet.gzip', compression='gzip')