Tweets between [01/06/2022 - 13/07/2022] (dates in DD/MM/YYYY)

In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import tqdm

In [2]:
def read_return_file(folder_path):
    """List the entries of a folder as a DataFrame of paths and names.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to list. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per directory entry, sorted by entry name, with columns
        'File Path' (``folder_path`` joined with the entry name) and
        'File Name' (the bare entry name).

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder_path`` cannot be listed.
    """
    file_names = sorted(os.listdir(folder_path))
    # os.path.join adds a separator only when folder_path does not already end
    # with one, so both 'Datasets' and 'Datasets/' produce usable paths.
    file_paths = [os.path.join(folder_path, name) for name in file_names]
    return pd.DataFrame({'File Path': file_paths, 'File Name': file_names})

In [3]:
#Daily Dataset
# Builds `dataset` from five sources restricted to the inclusive window
# [start_date, end_date].
# NOTE(review): the sources are combined on their 0..n-1 index after
# reset_index, not on the date value itself — this assumes every source holds
# the same rows in the same order; TODO confirm.
start_date = datetime.datetime(year=2022, month=6, day=1)
end_date = datetime.datetime(year=2022, month=7, day=13)

#Loading BTC-Bitinfocharts
bitinfo_data = (
    pd.read_excel('Datasets/btc-bitinfocharts.xlsx', index_col=0)
    .loc[lambda frame: (frame['Date'] >= start_date) & (frame['Date'] <= end_date)]
    .reset_index(drop=True)
)

#Loading BTC-GoogleTrends
googletrend_data = (
    pd.read_excel('Datasets/btc-googletrends.xlsx', index_col=False)
    .loc[lambda frame: (frame['date'] >= start_date) & (frame['date'] <= end_date)]
    .reset_index(drop=True)
)

#Loading Sentiment Score
# Not filtered by date in this cell; presumably the file already covers
# exactly the window above — verify.
sentiment_score = pd.read_csv(
    'Datasets/sentiment_scores_average.csv', index_col=0, engine='python'
)

#Loading Sentiment Score Inf.
# Columns get an `_I` suffix so they do not collide with the overall scores.
sentiment_score_inf = pd.read_csv(
    'Datasets/sentiment_scores_influencer_average.csv', index_col=0, engine='python'
).rename(columns={'POS': 'POS_I', 'NEU': 'NEU_I', 'NEG': 'NEG_I'})

#Loading Transactional Volume and Bitcoin price
# Here the window is applied to the index rather than to a column.
btc_data = (
    pd.read_excel('Datasets/btc_price_volume.xlsx', index_col=0)
    .loc[lambda frame: (frame.index >= start_date) & (frame.index <= end_date)]
    .reset_index(drop=True)
    [['Volume', 'Close', 'Uptick']]
    .rename(columns={'Volume': 'volume', 'Close': 'price', 'Uptick': 'uptick'})
)

#Combining Dataset
# Start from an empty frame; the first assigned column fixes the row index and
# every later assignment is aligned to it.
dataset = pd.DataFrame()
dataset['date'] = bitinfo_data['Date']
dataset['tweet_count'] = bitinfo_data['Bitcoin - Tweets']
dataset['tweet_gt'] = googletrend_data['bitcoin']
dataset[sentiment_score.columns] = sentiment_score
dataset[sentiment_score_inf.columns] = sentiment_score_inf
dataset[btc_data.columns] = btc_data


dataset.shape


(43, 12)

In [46]:
#Hourly Dataset
# Builds `dataset_long` from the `_long` variants of the five sources,
# restricted to the inclusive window [start_date, end_date].
# NOTE(review): the sources are combined on their 0..n-1 index after
# reset_index, not on the timestamp itself — this assumes every source holds
# the same rows in the same order; TODO confirm.
start_date = datetime.datetime(year=2022, month=6, day=1)
end_date = datetime.datetime(year=2022, month=7, day=14)

#Loading BTC-Bitinfocharts
bitinfo_data = pd.read_excel(
    'Datasets/btc-bitinfocharts_long.xlsx', index_col=0
).loc[lambda frame: (frame.index >= start_date) & (frame.index <= end_date)]
# Keep the filtered index before it is dropped; it becomes the `date` column.
date = bitinfo_data.index.copy()
bitinfo_data = bitinfo_data.reset_index(drop=True)

#Loading BTC-GoogleTrends
googletrend_data = (
    pd.read_excel('Datasets/btc-googletrends_long.xlsx', index_col=False)
    .loc[lambda frame: (frame['date'] >= start_date) & (frame['date'] <= end_date)]
    .reset_index(drop=True)
)

#Loading Sentiment Score
# Read without index_col, so only the three score columns are copied below.
# Not filtered by date in this cell; presumably the file already covers
# exactly the window above — verify.
sentiment_score = pd.read_csv(
    'Datasets/sentiment_scores_average_long.csv', engine='python'
)

#Loading Sentiment Score Inf.
# Columns get an `_I` suffix so they do not collide with the overall scores.
sentiment_score_inf = pd.read_csv(
    'Datasets/sentiment_scores_influencer_average_long.csv', index_col=0, engine='python'
).rename(columns={'POS': 'POS_I', 'NEU': 'NEU_I', 'NEG': 'NEG_I'})

#Loading Transactional Volume and Bitcoin price
btc_data = (
    pd.read_excel('Datasets/btc_price_volume_long.xlsx', index_col=0)
    .loc[lambda frame: (frame.index >= start_date) & (frame.index <= end_date)]
    .reset_index(drop=True)
    [['Volume', 'Close', 'Uptick']]
    .rename(columns={'Volume': 'volume', 'Close': 'price', 'Uptick': 'uptick'})
)

#Combining Dataset
# Start from an empty frame; the first assigned column fixes the row index and
# every later assignment is aligned to it.
dataset_long = pd.DataFrame()
dataset_long['date'] = date
dataset_long['tweet_count'] = bitinfo_data['Count']
dataset_long['tweet_gt'] = googletrend_data['bitcoin']
dataset_long[['POS', 'NEU', 'NEG']] = sentiment_score[['POS', 'NEU', 'NEG']]
dataset_long[sentiment_score_inf.columns] = sentiment_score_inf
dataset_long[btc_data.columns] = btc_data


dataset_long.shape


(1032, 12)

In [49]:
# Show the combined hourly frame; pandas abbreviates the rendering to the
# leading and trailing rows, with `...` marking the omitted middle.
dataset_long

Unnamed: 0,date,tweet_count,tweet_gt,POS,NEU,NEG,POS_I,NEU_I,NEG_I,volume,price,uptick
0,2022-06-01 00:00:00,6194,69,0.193426,0.693996,0.112578,0.080970,0.823753,0.095278,0,31950.976562,0
1,2022-06-01 01:00:00,5991,74,0.247741,0.627124,0.125135,0.300600,0.557735,0.141665,0,31899.189453,1
2,2022-06-01 02:00:00,6458,71,0.207945,0.648826,0.143229,0.214851,0.606469,0.178679,36503552,31810.035156,0
3,2022-06-01 03:00:00,8221,74,0.271222,0.604784,0.123994,0.301883,0.554401,0.143717,0,31641.478516,1
4,2022-06-01 04:00:00,7626,79,0.236132,0.650888,0.112979,0.259684,0.622797,0.117519,489062400,31587.060547,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1027,2022-07-13 19:00:00,10502,88,0.177887,0.628710,0.193403,0.201823,0.631359,0.166818,0,19661.580078,0
1028,2022-07-13 20:00:00,9748,92,0.214906,0.594381,0.190713,0.220060,0.600004,0.179936,0,19659.623047,1
1029,2022-07-13 21:00:00,8324,80,0.231588,0.604233,0.164178,0.195565,0.607123,0.197312,465577984,19911.724609,0
1030,2022-07-13 22:00:00,7300,80,0.233875,0.591331,0.174794,0.204996,0.622610,0.172394,0,19845.537109,1
