# Postprocessing NLP data

In [1]:
import numpy as np
import pytz
import pandas as pd
from datetime import datetime

In [2]:
import sys
sys.path.append('../../2_data_processing/numeric_data/')

from functions import unit_root_testing, HeskedTesting

### Load data

In [3]:
# Per-document BTC sentiment scores, one trio (news, tweets, reddit) per model.
# Files are read in the same order as they are listed.

# VADER
btc_news_vader, btc_tweets_vader, btc_reddit_vader = (
    pd.read_parquet(path)
    for path in (
        '../0_vader/btc_news_vader.parquet.gzip',
        '../0_vader/btc_tweets_vader.parquet.gzip',
        '../0_vader/reddit_r_bitcoin_vader.parquet.gzip',
    )
)

# Pretrained Twitter RoBERTa
btc_news_roberta, btc_tweets_roberta, btc_reddit_roberta = (
    pd.read_parquet(path)
    for path in (
        '../1_twitter_roberta_pretrained/btc_news_roberta_pretrained.parquet.gzip',
        '../1_twitter_roberta_pretrained/btc_tweets_roberta_pretrained.parquet.gzip',
        '../1_twitter_roberta_pretrained/reddit_r_bitcoin_roberta_pretrained.parquet.gzip',
    )
)

# BART zero-shot (MNLI)
btc_news_bart, btc_tweets_bart, btc_reddit_bart = (
    pd.read_parquet(path)
    for path in (
        '../2_bart_zero_shot/btc_news_bart_mnli.parquet.gzip',
        '../2_bart_zero_shot/btc_tweets_bart_mnli.parquet.gzip',
        '../2_bart_zero_shot/reddit_r_bitcoin_bart_mnli.parquet.gzip',
    )
)

# Fine-tuned RoBERTa
btc_news_roberta_finetuned, btc_tweets_roberta_finetuned, btc_reddit_roberta_finetuned = (
    pd.read_parquet(path)
    for path in (
        '../3_roberta_finetuned/first_results/btc_news_roberta_finetuned.parquet.gzip',
        '../3_roberta_finetuned/first_results/btc_tweets_roberta_finetuned.parquet.gzip',
        '../3_roberta_finetuned/first_results/reddit_r_bitcoin_roberta_finetuned.parquet.gzip',
    )
)

In [4]:
# Per-document ETH sentiment scores, one trio (news, tweets, reddit) per model.
# Files are read in the same order as they are listed.

# VADER
eth_news_vader, eth_tweets_vader, eth_reddit_vader = (
    pd.read_parquet(path)
    for path in (
        '../0_vader/eth_news_vader.parquet.gzip',
        '../0_vader/eth_tweets_vader.parquet.gzip',
        '../0_vader/reddit_r_ethereum_vader.parquet.gzip',
    )
)

# Pretrained Twitter RoBERTa
eth_news_roberta, eth_tweets_roberta, eth_reddit_roberta = (
    pd.read_parquet(path)
    for path in (
        '../1_twitter_roberta_pretrained/eth_news_roberta_pretrained.parquet.gzip',
        '../1_twitter_roberta_pretrained/eth_tweets_roberta_pretrained.parquet.gzip',
        '../1_twitter_roberta_pretrained/reddit_r_ethereum_roberta_pretrained.parquet.gzip',
    )
)

# BART zero-shot (MNLI)
eth_news_bart, eth_tweets_bart, eth_reddit_bart = (
    pd.read_parquet(path)
    for path in (
        '../2_bart_zero_shot/eth_news_bart_mnli.parquet.gzip',
        '../2_bart_zero_shot/eth_tweets_bart_mnli.parquet.gzip',
        '../2_bart_zero_shot/reddit_r_ethereum_bart_mnli.parquet.gzip',
    )
)

# Fine-tuned RoBERTa
eth_news_roberta_finetuned, eth_tweets_roberta_finetuned, eth_reddit_roberta_finetuned = (
    pd.read_parquet(path)
    for path in (
        '../3_roberta_finetuned/first_results/eth_news_roberta_finetuned.parquet.gzip',
        '../3_roberta_finetuned/first_results/eth_tweets_roberta_finetuned.parquet.gzip',
        '../3_roberta_finetuned/first_results/reddit_r_ethereum_roberta_finetuned.parquet.gzip',
    )
)

### Aggregate by day

In [5]:
# All per-document score frames in a fixed order:
# coin (btc, eth) -> model (vader, roberta, bart, roberta_finetuned) -> source (news, tweets, reddit).
nlp_dfs = [
    # BTC
    btc_news_vader, btc_tweets_vader, btc_reddit_vader,
    btc_news_roberta, btc_tweets_roberta, btc_reddit_roberta,
    btc_news_bart, btc_tweets_bart, btc_reddit_bart,
    btc_news_roberta_finetuned, btc_tweets_roberta_finetuned, btc_reddit_roberta_finetuned,
    # ETH
    eth_news_vader, eth_tweets_vader, eth_reddit_vader,
    eth_news_roberta, eth_tweets_roberta, eth_reddit_roberta,
    eth_news_bart, eth_tweets_bart, eth_reddit_bart,
    eth_news_roberta_finetuned, eth_tweets_roberta_finetuned, eth_reddit_roberta_finetuned,
]

# Column prefix for each frame above, matched by position: the three sources
# repeat once per model (4 models), first for btc and then for eth.
nlp_dfs_names = [
    f'{coin}_{source}'
    for coin in ('btc', 'eth')
    for _ in range(4)
    for source in ('news', 'tweets', 'reddit')
]

In [6]:
def _aggregate_by_day(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """Average one per-document frame per calendar day.

    Rows are grouped on the 'year', 'month' and 'day' columns and averaged.
    The result is indexed by the POSIX timestamp of midnight UTC of each day,
    the listed metadata columns are dropped when present, and every remaining
    column is prefixed with ``prefix + '_'``.
    """
    # numeric_only=True: without it pandas >= 2.0 raises on non-numeric columns
    # instead of silently skipping them as pandas 1.x did.
    daily = df.groupby(['year', 'month', 'day']).mean(numeric_only=True)

    # The group keys form a (year, month, day) MultiIndex; turn each key into a UTC-midnight timestamp.
    daily['timestamp'] = [
        datetime(year, month, day, tzinfo=pytz.UTC).timestamp()
        for year, month, day in daily.index
    ]
    daily = daily.reset_index(drop=True).set_index('timestamp')

    # Daily means of ids / engagement counters are not sentiment features;
    # errors='ignore' because not every source has every column.
    daily = daily.drop(columns=[
        'tweet_id',
        'like_count',
        'retweet_count',
        'reply_count',
        'user_favourites_count',
        'user_follower_count',
        'score',
        ], errors='ignore')
    return daily.add_prefix(prefix + '_')


# Frames and prefixes are paired by position, so a length mismatch would mislabel columns.
assert len(nlp_dfs) == len(nlp_dfs_names), 'nlp_dfs and nlp_dfs_names must have the same length'

# NOTE: this rebinds nlp_dfs to the aggregated frames, so the cell is not idempotent;
# rebuild nlp_dfs from the raw frames before running it a second time.
nlp_dfs = [_aggregate_by_day(df, name) for df, name in zip(nlp_dfs, nlp_dfs_names)]

In [7]:
# Put all daily frames side by side on the shared timestamp index, then order rows by time.
nlp_data = pd.concat(nlp_dfs, axis='columns').sort_index()

In [8]:
def _daily_count(df: pd.DataFrame, id_column: str, count_name: str) -> pd.DataFrame:
    """Count the non-null ``id_column`` entries per calendar day.

    Rows are grouped on the 'year', 'month' and 'day' columns. The result has
    a single column named ``count_name`` and is indexed by the POSIX timestamp
    of midnight UTC of each day.
    """
    daily = (
        df.groupby(['year', 'month', 'day'])[[id_column]]
        .count()
        .rename(columns={id_column: count_name})
    )
    # The group keys form a (year, month, day) MultiIndex; turn each key into a UTC-midnight timestamp.
    daily['timestamp'] = [
        datetime(year, month, day, tzinfo=pytz.UTC).timestamp()
        for year, month, day in daily.index
    ]
    return daily.reset_index(drop=True).set_index('timestamp')


# News and reddit frames are counted on 'timestamp', tweet frames on 'tweet_id'.
btc_news_count = _daily_count(btc_news_bart, 'timestamp', 'btc_news_count')
btc_tweet_count = _daily_count(btc_tweets_bart, 'tweet_id', 'btc_tweet_count')
btc_reddit_count = _daily_count(btc_reddit_bart, 'timestamp', 'btc_reddit_count')

eth_news_count = _daily_count(eth_news_bart, 'timestamp', 'eth_news_count')
eth_tweet_count = _daily_count(eth_tweets_bart, 'tweet_id', 'eth_tweet_count')
eth_reddit_count = _daily_count(eth_reddit_bart, 'timestamp', 'eth_reddit_count')

# sort_index(): when the sources cover different days the concatenated index is
# not guaranteed to be chronological, and the log-differencing step further down
# differences consecutive rows.
count_data = pd.concat([btc_news_count,
                        btc_tweet_count,
                        btc_reddit_count,
                        eth_news_count,
                        eth_tweet_count,
                        eth_reddit_count], axis=1).sort_index()


In [9]:
# Daily sentiment scores and daily document counts side by side, aligned on the timestamp index.
text_data = pd.concat((nlp_data, count_data), axis='columns')

### Check heteroskedasticity and stationarity

In [10]:
# Heteroskedasticity tests on every column of the raw daily data; the printed
# table reports White, Breusch-Pagan and Goldfeld-Quandt p-values per column.
# NOTE(review): conf=0.01 is presumably the significance level behind the
# red/green colouring and tabsize the column padding — confirm in functions.py.
HeskedTesting.run_all_tests(text_data, conf=0.01, tabsize=50)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

btc_news_vader_score --                            White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_tweets_vader_score --                          White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_reddit_vader_score --                          White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_news_twitter_roberta_pretrained_score --       White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_tweets_twitter_roberta_pretrained_score --     White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_

In [11]:
# Unit-root / stationarity tests on every column of the raw daily data; the
# printed table reports ADF, PP and KPSS p-values per column.
# NOTE(review): conf=0.01 is presumably the significance level behind the
# red/green colouring — confirm in functions.py.
unit_root_testing(text_data, conf=0.01, tabsize=50)

Results of ADF, PP and KPSS tests by column (p-values):

btc_news_vader_score --                            ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.0916[0m
btc_tweets_vader_score --                          ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [31m0.0001[0m
btc_reddit_vader_score --                          ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [31m0.0073[0m
btc_news_twitter_roberta_pretrained_score --       ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [31m0.0026[0m
btc_tweets_twitter_roberta_pretrained_score --     ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.0617[0m
btc_reddit_twitter_roberta_pretrained_score --     ADF: [32m0.0000[0m,           

### Log-difference count and sentiment data

In [12]:
def log_difference_dataframe(df: pd.DataFrame, offset: float = 0.01) -> pd.DataFrame:
    ''' Returns the first-order log difference of every column of ``df``.

    Each output column is ``log(x_t + offset) - log(x_{t-1} + offset)`` and is
    named after its source column with a ``'_d'`` suffix. The result keeps the
    index of ``df``; its first row is NaN because it has no predecessor.
    Differences are taken between consecutive rows, so ``df`` must already be
    in the intended (chronological) order.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric columns to transform.
    offset : float, default 0.01
        Constant added before the logarithm so that zeros stay finite.

    Notes
    -----
    Values with ``x + offset <= 0`` produce NaN (or -inf at exactly zero) and
    the corresponding numpy warnings are deliberately suppressed.
    '''
    # Silence "divide by zero" / "invalid value" warnings from log of non-positive numbers.
    with np.errstate(divide='ignore', invalid='ignore'):
        # One vectorised pass over the whole frame instead of inserting columns one by one.
        df_differenced = np.log(df + offset).diff()

    return df_differenced.add_suffix('_d')

In [13]:
# Log-difference every column of both frames; output columns carry a '_d' suffix.
# NOTE(review): nlp_data holds daily mean sentiment scores. Any score <= -0.01
# makes log(x + 0.01) NaN, and the helper suppresses the warning — confirm the
# scores are non-negative or inspect the NaN share of nlp_data_differenced.
count_data_differenced = log_difference_dataframe(count_data)
nlp_data_differenced = log_difference_dataframe(nlp_data)

In [14]:
# "Stationary" variant: sentiment scores in levels, counts log-differenced.
text_data_stationary = pd.concat(
    (nlp_data, count_data_differenced),
    axis='columns',
)

# "Fully differenced" variant: both sentiment scores and counts log-differenced.
text_data_fully_differenced = pd.concat(
    (nlp_data_differenced, count_data_differenced),
    axis='columns',
)

### Re-check heteroskedasticity and stationarity

In [15]:
# Same heteroskedasticity tests (White, Breusch-Pagan, Goldfeld-Quandt p-values
# per column, as printed below), now on the log-differenced data.
HeskedTesting.run_all_tests(text_data_fully_differenced, conf=0.01, tabsize=50)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

btc_news_vader_score_d --                          White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0027[0m,                   Goldfeld-Quandt: [31m0.0010[0m
btc_tweets_vader_score_d --                        White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_reddit_vader_score_d --                        White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_news_twitter_roberta_pretrained_score_d --     White: [31m0.0024[0m,                          Breusch-Pagan: [31m0.0008[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_tweets_twitter_roberta_pretrained_score_d --   White: [31m0.0000[0m,                          Breusch-Pagan: [31m0.0000[0m,                   Goldfeld-Quandt: [31m0.0000[0m
btc_

In [16]:
# Same unit-root tests (ADF, PP and KPSS p-values per column, as printed below),
# now on the log-differenced data.
unit_root_testing(text_data_fully_differenced, conf=0.01, tabsize=50)

Results of ADF, PP and KPSS tests by column (p-values):

btc_news_vader_score_d --                          ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.3959[0m
btc_tweets_vader_score_d --                        ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.8614[0m
btc_reddit_vader_score_d --                        ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.2464[0m
btc_news_twitter_roberta_pretrained_score_d --     ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.0894[0m
btc_tweets_twitter_roberta_pretrained_score_d --   ADF: [32m0.0000[0m,                            PP: [32m0.0000[0m,                              KPSS: [32m0.7064[0m
btc_reddit_twitter_roberta_pretrained_score_d --   ADF: [32m0.0000[0m,           

### Save data

In [17]:
def _columns_with_prefix(frame, prefix):
    """Return the sub-frame of `frame` whose column names start with `prefix`."""
    wanted = [name for name in frame if name.startswith(prefix)]
    return frame[wanted]


# Split each variant (raw, stationary, fully differenced) into its BTC and ETH columns.
btc_raw_text_data = _columns_with_prefix(text_data, 'btc')
eth_raw_text_data = _columns_with_prefix(text_data, 'eth')

btc_stationary_text_data = _columns_with_prefix(text_data_stationary, 'btc')
eth_stationary_text_data = _columns_with_prefix(text_data_stationary, 'eth')

btc_fully_differenced_text_data = _columns_with_prefix(text_data_fully_differenced, 'btc')
eth_fully_differenced_text_data = _columns_with_prefix(text_data_fully_differenced, 'eth')

In [18]:
# Write every per-coin frame into the working directory as '<name>.parquet.gzip'
# (gzip-compressed parquet), in the order listed.
frames_to_save = {
    'btc_raw_text_data': btc_raw_text_data,
    'eth_raw_text_data': eth_raw_text_data,
    'btc_stationary_text_data': btc_stationary_text_data,
    'eth_stationary_text_data': eth_stationary_text_data,
    'btc_fully_differenced_text_data': btc_fully_differenced_text_data,
    'eth_fully_differenced_text_data': eth_fully_differenced_text_data,
}

for file_stem, frame in frames_to_save.items():
    frame.to_parquet(file_stem + '.parquet.gzip', compression='gzip')