In [3]:
import pandas as pd
import numpy as np
import json

from pathlib import Path

from config import config
from src.preprocessing.merge_csv_in_df import merge_csv_by_date
from src.llm.loaders.gnews_loader import load_and_process_news_data
from src.llm.token_estimator import estimate_tokens_and_cost
from src.llm.langchain_news_analyzer import CryptoNewsSentimentAnalyzer
from src.llm.bart_news_analyzer import CryptoNewsAnalyzer
from src.utils.llm_utils import add_daily_aggregates
from src.models.reddit_analyzer import RedditAnalyzer
from src.processing.reddit_db_parser import get_daily_reddit_data


df_btc = pd.read_csv(
    config.DATA_DIR / "processed" / "crypto_prices" / "btc_original.csv",
    parse_dates=["date"],
    index_col="date",
)
df_eth = pd.read_csv(
    config.DATA_DIR / "processed" / "crypto_prices" / "eth_original.csv",
    parse_dates=["date"],
    index_col="date",
)

# Google Trends


we have google trends searches for the words:

- bitcoin
- blockchain
- cryptocurrency
- ethereum
- investing


In [50]:
GOOGLE_TRENDS = config.DATA_DIR / "raw" / "google_trends"

df_btc = merge_csv_by_date(
    GOOGLE_TRENDS / "bitcoin.csv",
    df_btc,
    "gTrendsBitcoin",
    is_monthly=True,
    rows_to_skip=1,
)
df_btc = merge_csv_by_date(
    GOOGLE_TRENDS / "blockchain.csv",
    df_btc,
    "gTrendsBlockchain",
    is_monthly=True,
    rows_to_skip=1,
)
df_btc = merge_csv_by_date(
    GOOGLE_TRENDS / "cryptocurrency.csv",
    df_btc,
    "gTrendsCryptocurrency",
    is_monthly=True,
    rows_to_skip=1,
)
df_btc = merge_csv_by_date(
    GOOGLE_TRENDS / "investing.csv",
    df_btc,
    "gTrendsInvesting",
    is_monthly=True,
    rows_to_skip=1,
)

df_eth = merge_csv_by_date(
    GOOGLE_TRENDS / "ethereum.csv",
    df_eth,
    "gTrendsEthereum",
    is_monthly=True,
    rows_to_skip=1,
)
df_eth = merge_csv_by_date(
    GOOGLE_TRENDS / "blockchain.csv",
    df_eth,
    "gTrendsBlockchain",
    is_monthly=True,
    rows_to_skip=1,
)
df_eth = merge_csv_by_date(
    GOOGLE_TRENDS / "cryptocurrency.csv",
    df_eth,
    "gTrendsCryptocurrency",
    is_monthly=True,
    rows_to_skip=1,
)
df_eth = merge_csv_by_date(
    GOOGLE_TRENDS / "investing.csv",
    df_eth,
    "gTrendsInvesting",
    is_monthly=True,
    rows_to_skip=1,
)

# Gold


In [51]:
df_btc = merge_csv_by_date(config.DATA_DIR / "raw" / "gold.csv", df_btc, "goldValue")
df_eth = merge_csv_by_date(config.DATA_DIR / "raw" / "gold.csv", df_eth, "goldValue")

# SP 500


In [52]:
df_btc = merge_csv_by_date(config.DATA_DIR / "raw" / "sp500.csv", df_btc, "sp500Value")
df_eth = merge_csv_by_date(config.DATA_DIR / "raw" / "sp500.csv", df_eth, "sp500Value")

In [53]:
pd.set_option("display.max_columns", None)
df_btc.head()

Unnamed: 0_level_0,open,high,low,close,volume,marketCap,logPriceChange,priceMovement,localMin_7,localMax_7,localMin_14,localMax_14,localMin_21,localMax_21,dayOfWeek_Sin,dayOfWeek_Cos,EMA_12,EMA_26,RSI_14,BB_Middle,BB_Upper,BB_Lower,OBV,AO,KAMA,PPO,PPO_Signal,PPO_Histogram,PVO,PVO_Signal,PVO_Histogram,ROC,RSI,Stoch_RSI_K,Stoch_RSI_D,Stoch_K,Stoch_D,TSI,Ultimate_Oscillator,WilliamsR,ADI,CMF,EMV,FI,MFI,NVI,VPT,BBM,BBW,DCM,DCW,KCM,KCW,UI,Aroon_down,Aroon_up,CCI,DPO,Ichimoku_A,Ichimoku_B,Ichimoku_Base,Ichimoku_Conversion,KST,MACD,MACD_Signal,MI,TRIX,Vortex_down,Vortex_up,WMA,CR,PSAR_down,PSAR_up,gTrendsBitcoin,gTrendsBlockchain,gTrendsCryptocurrency,gTrendsInvesting,goldValue,sp500Value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1
2017-01-01,963.658,1003.08,958.699,998.325,147775008.0,16050410000.0,0.035254,1,0,1,0,1,0,1,-0.781831,0.62349,927.913019,871.434686,83.084032,871.46615,1027.196258,715.736042,2922337000.0,139.503888,955.043802,9.281661,6.612502,2.669159,14.687295,17.065823,-2.378528,15.474703,83.084032,56.506121,62.292639,93.416834,94.168225,62.537845,82.395616,-2.234913,2957941000.0,0.562083,268.189478,2396025000.0,78.950576,1007.506291,147527500000.0,974.5448,0.0539,889.9595,226.241,891.830567,107.944512,82.395616,0.0,100.0,111.969673,55.459,745.59775,704.6545,881.415,932.752,21126.993631,56.478334,46.010969,27.806359,0.435426,0.523588,1.376167,958.571873,66.168065,0,1,6,16,1,37,,
2017-01-02,998.617,1031.39,996.702,1021.75,222184992.0,16429020000.0,0.023193,1,0,1,0,1,0,1,0.0,1.0,942.349478,882.569153,84.887313,883.52585,1046.419181,720.632519,3144522000.0,143.060506,965.552321,9.859751,7.261952,2.597799,16.843606,17.02138,-0.177774,10.820795,84.887313,62.800521,59.793982,95.188902,93.928779,63.913117,82.121062,-4.018458,3056633000.0,0.583475,304.980903,2797262000.0,80.534253,1007.506291,227017500000.0,983.7106,0.09394,904.1145,254.551,904.203846,110.878559,82.121062,0.0,100.0,131.923933,82.6259,746.1165,704.6545,898.401,946.907,22315.365979,59.780324,48.76484,27.991673,0.455892,0.493168,1.42638,972.880582,69.774625,0,1,6,16,1,37,,
2017-01-03,1021.6,1044.08,1021.6,1043.84,185168000.0,16786370000.0,0.021389,1,0,1,0,1,0,1,0.781831,0.62349,957.963404,894.515142,86.363588,896.6438,1066.653829,726.633771,3329690000.0,147.687241,989.282139,10.251627,7.859887,2.39174,16.391334,16.89537,-0.504037,16.134229,86.363588,68.222461,62.509701,97.882847,95.496194,65.565071,83.501748,-0.098089,3237848000.0,0.602965,318.138014,2981991000.0,81.475127,1009.668268,193285800000.0,997.7792,0.129162,910.941,266.278,917.502527,108.882506,83.501748,0.0,100.0,139.18114,108.60445,746.543,704.6545,904.746,970.489,23941.222052,63.448262,51.701525,28.176019,0.477714,0.49612,1.463899,989.391727,72.727175,0,1,6,16,1,37,1160.400024,2257.830078
2017-01-04,1044.4,1159.42,1044.4,1154.73,344945984.0,18571870000.0,0.10096,1,0,1,0,1,0,1,0.974928,-0.222521,988.235188,913.790317,91.076165,915.4759,1110.400038,720.551762,3674636000.0,167.045691,1049.890184,10.61374,8.410658,2.203083,22.662037,18.048704,4.613333,28.8498,91.076165,79.549401,70.190795,98.147193,97.07298,69.471661,85.562083,-1.441875,3554663000.0,0.673299,449.937853,8020429000.0,83.325859,1009.668268,398319500000.0,1036.4776,0.25011,969.1915,380.457,940.09562,142.543219,85.562083,0.0,100.0,214.597295,223.78505,752.1115,704.6545,962.416,1031.8375,26693.551368,74.444872,56.250194,28.82168,0.504156,0.422958,1.423311,1023.428,82.15233,0,1,6,16,1,37,1163.800049,2270.75
2017-01-05,1156.73,1191.1,910.417,1013.38,510199008.0,16300250000.0,-0.130575,0,0,0,0,0,0,0,0.433884,-0.900969,992.103621,921.16733,61.771598,926.89955,1116.585647,737.213453,3164437000.0,177.824844,1048.171858,10.54542,8.83761,1.70781,31.635758,20.766115,10.869643,11.653684,61.771598,56.209877,67.993913,81.46285,92.49763,57.008442,64.793651,-54.071487,3418777000.0,0.535435,192.256444,-3427722000.0,67.351462,1009.668268,517025500000.0,1046.405,0.214551,987.982,406.236,947.075085,235.894532,64.793651,0.0,100.0,91.704144,87.51825,754.89125,704.777,978.256,1050.7585,27561.826641,70.936291,59.187413,30.071714,0.528651,0.605755,1.089394,1027.063418,23.543196,1,0,6,16,1,37,1179.699951,2269.0


# Google News


In [54]:
df_news_btc = load_and_process_news_data(
    config.DATA_DIR / "raw" / "news_articles" / "bitcoin_articles.json"
)

df_news_eth = load_and_process_news_data(
    config.DATA_DIR / "raw" / "news_articles" / "ethereum_articles.json"
)

In [55]:
df_news_btc = df_news_btc.reset_index()
df_news_btc = df_news_btc.drop_duplicates(subset=["complete_text"], keep="first")

df_news_eth = df_news_eth.drop_duplicates(subset=["complete_text"], keep="first")
df_news_eth = df_news_eth.reset_index()

df_news_btc.head()

Unnamed: 0,date,title,subtitle,complete_text,word_count
0,2017-01-01,"A Look At Bitcoin Bubbles, When Will the Next ...",There have been four noteworthy bubbles in bit...,"A Look At Bitcoin Bubbles, When Will the Next ...",35
1,2017-01-01,"Bitcoin Price Tops $1,000 in First Day of 2017...",The price of bitcoin inched upward over the co...,"Bitcoin Price Tops $1,000 in First Day of 2017...",36
2,2017-01-01,What Will the Bitcoin Price Be in 2017?,CoinDesk's Charles Bovaird asks the experts fo...,What Will the Bitcoin Price Be in 2017? CoinDe...,25
3,2017-01-01,"Bitcoin Breaks $1,000 as Exchanges Break Volum...",Global bitcoin exchanges report record-setting...,"Bitcoin Breaks $1,000 as Exchanges Break Volum...",29
4,2017-01-01,"Bitcoin Price Tops $1,000 in First Day of 2017...",The price of bitcoin inched upward over the co...,"Bitcoin Price Tops $1,000 in First Day of 2017...",36


In [56]:
news_btc_cost = estimate_tokens_and_cost(
    df_news_btc["complete_text"].tolist(),
    cost_per_million_tokens=0.4,
    word_in_tokens=1.8,
)
news_eth_cost = estimate_tokens_and_cost(
    df_news_eth["complete_text"].tolist(),
    cost_per_million_tokens=0.4,
    word_in_tokens=1.8,
)

print("BTC News Cost")
print(f"Total Texts Processed: {news_btc_cost['total_texts']}")
print(f"Total Tokens Estimated: {news_btc_cost['total_tokens']}")
print(f"Estimated Cost: ${news_btc_cost['estimated_cost']:.2f} \n")

print("ETH News Cost")
print(f"Total Texts Processed: {news_eth_cost['total_texts']}")
print(f"Total Tokens Estimated: {news_eth_cost['total_tokens']}")
print(f"Estimated Cost: ${news_eth_cost['estimated_cost']:.2f}")

BTC News Cost
Total Texts Processed: 20741
Total Tokens Estimated: 1221579
Estimated Cost: $0.49 

ETH News Cost
Total Texts Processed: 16937
Total Tokens Estimated: 984198
Estimated Cost: $0.39


In [57]:
# analyzer_btc = CryptoNewsSentimentAnalyzer(verbose=True, output_file_path=config.DATA_DIR / "temp" / "news_sentiment_btc.csv")
# result_df_btc = analyzer_btc.analyze_articles_in_range(
#     df_news_btc, start_date="2022-04-28", end_date="2022-12-31"
# )

In [58]:
results_df_btc = pd.read_csv(
    config.DATA_DIR / "processed" / "news_articles" / "news_sentiment_btc.csv",
    parse_dates=["date"],
)
results_df_btc = add_daily_aggregates(results_df_btc).set_index("date")

df_btc = df_btc.merge(results_df_btc, left_index=True, right_index=True, how="inner")
df_btc = df_btc.rename(columns={"average_score": "gnews_average_score"})
df_btc["gnews_average_score"] = df_btc["gnews_average_score"].replace(0, np.nan)

for column in ["gnews_average_score", "goldValue", "sp500Value"]:
    rolling_mean = df_btc[column].rolling(10, min_periods=1).mean()
    df_btc[column] = df_btc[column].fillna(rolling_mean)

    # drop the first row if it is nan
    if pd.isna(df_btc[column].iloc[0]):
        df_btc.loc[df_btc.index[0], column] = df_btc[column].dropna().iloc[0]

df_btc["gnews_average_score"] = df_btc["gnews_average_score"].round(2)

In [59]:
df_btc.head()

Unnamed: 0_level_0,open,high,low,close,volume,marketCap,logPriceChange,priceMovement,localMin_7,localMax_7,localMin_14,localMax_14,localMin_21,localMax_21,dayOfWeek_Sin,dayOfWeek_Cos,EMA_12,EMA_26,RSI_14,BB_Middle,BB_Upper,BB_Lower,OBV,AO,KAMA,PPO,PPO_Signal,PPO_Histogram,PVO,PVO_Signal,PVO_Histogram,ROC,RSI,Stoch_RSI_K,Stoch_RSI_D,Stoch_K,Stoch_D,TSI,Ultimate_Oscillator,WilliamsR,ADI,CMF,EMV,FI,MFI,NVI,VPT,BBM,BBW,DCM,DCW,KCM,KCW,UI,Aroon_down,Aroon_up,CCI,DPO,Ichimoku_A,Ichimoku_B,Ichimoku_Base,Ichimoku_Conversion,KST,MACD,MACD_Signal,MI,TRIX,Vortex_down,Vortex_up,WMA,CR,PSAR_down,PSAR_up,gTrendsBitcoin,gTrendsBlockchain,gTrendsCryptocurrency,gTrendsInvesting,goldValue,sp500Value,gnews_average_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
2017-01-01,963.658,1003.08,958.699,998.325,147775008.0,16050410000.0,0.035254,1,0,1,0,1,0,1,-0.781831,0.62349,927.913019,871.434686,83.084032,871.46615,1027.196258,715.736042,2922337000.0,139.503888,955.043802,9.281661,6.612502,2.669159,14.687295,17.065823,-2.378528,15.474703,83.084032,56.506121,62.292639,93.416834,94.168225,62.537845,82.395616,-2.234913,2957941000.0,0.562083,268.189478,2396025000.0,78.950576,1007.506291,147527500000.0,974.5448,0.0539,889.9595,226.241,891.830567,107.944512,82.395616,0.0,100.0,111.969673,55.459,745.59775,704.6545,881.415,932.752,21126.993631,56.478334,46.010969,27.806359,0.435426,0.523588,1.376167,958.571873,66.168065,0,1,6,16,1,37,1160.400024,2257.830078,8.75
2017-01-02,998.617,1031.39,996.702,1021.75,222184992.0,16429020000.0,0.023193,1,0,1,0,1,0,1,0.0,1.0,942.349478,882.569153,84.887313,883.52585,1046.419181,720.632519,3144522000.0,143.060506,965.552321,9.859751,7.261952,2.597799,16.843606,17.02138,-0.177774,10.820795,84.887313,62.800521,59.793982,95.188902,93.928779,63.913117,82.121062,-4.018458,3056633000.0,0.583475,304.980903,2797262000.0,80.534253,1007.506291,227017500000.0,983.7106,0.09394,904.1145,254.551,904.203846,110.878559,82.121062,0.0,100.0,131.923933,82.6259,746.1165,704.6545,898.401,946.907,22315.365979,59.780324,48.76484,27.991673,0.455892,0.493168,1.42638,972.880582,69.774625,0,1,6,16,1,37,,,9.0
2017-01-03,1021.6,1044.08,1021.6,1043.84,185168000.0,16786370000.0,0.021389,1,0,1,0,1,0,1,0.781831,0.62349,957.963404,894.515142,86.363588,896.6438,1066.653829,726.633771,3329690000.0,147.687241,989.282139,10.251627,7.859887,2.39174,16.391334,16.89537,-0.504037,16.134229,86.363588,68.222461,62.509701,97.882847,95.496194,65.565071,83.501748,-0.098089,3237848000.0,0.602965,318.138014,2981991000.0,81.475127,1009.668268,193285800000.0,997.7792,0.129162,910.941,266.278,917.502527,108.882506,83.501748,0.0,100.0,139.18114,108.60445,746.543,704.6545,904.746,970.489,23941.222052,63.448262,51.701525,28.176019,0.477714,0.49612,1.463899,989.391727,72.727175,0,1,6,16,1,37,1160.400024,2257.830078,8.25
2017-01-04,1044.4,1159.42,1044.4,1154.73,344945984.0,18571870000.0,0.10096,1,0,1,0,1,0,1,0.974928,-0.222521,988.235188,913.790317,91.076165,915.4759,1110.400038,720.551762,3674636000.0,167.045691,1049.890184,10.61374,8.410658,2.203083,22.662037,18.048704,4.613333,28.8498,91.076165,79.549401,70.190795,98.147193,97.07298,69.471661,85.562083,-1.441875,3554663000.0,0.673299,449.937853,8020429000.0,83.325859,1009.668268,398319500000.0,1036.4776,0.25011,969.1915,380.457,940.09562,142.543219,85.562083,0.0,100.0,214.597295,223.78505,752.1115,704.6545,962.416,1031.8375,26693.551368,74.444872,56.250194,28.82168,0.504156,0.422958,1.423311,1023.428,82.15233,0,1,6,16,1,37,1163.800049,2270.75,5.0
2017-01-05,1156.73,1191.1,910.417,1013.38,510199008.0,16300250000.0,-0.130575,0,0,0,0,0,0,0,0.433884,-0.900969,992.103621,921.16733,61.771598,926.89955,1116.585647,737.213453,3164437000.0,177.824844,1048.171858,10.54542,8.83761,1.70781,31.635758,20.766115,10.869643,11.653684,61.771598,56.209877,67.993913,81.46285,92.49763,57.008442,64.793651,-54.071487,3418777000.0,0.535435,192.256444,-3427722000.0,67.351462,1009.668268,517025500000.0,1046.405,0.214551,987.982,406.236,947.075085,235.894532,64.793651,0.0,100.0,91.704144,87.51825,754.89125,704.777,978.256,1050.7585,27561.826641,70.936291,59.187413,30.071714,0.528651,0.605755,1.089394,1027.063418,23.543196,1,0,6,16,1,37,1179.699951,2269.0,7.75


In [60]:
df_btc.to_csv(config.DATA_DIR / "processed" / "crypto_prices" / "btc.csv")

In [61]:
# analyzer_eth = CryptoNewsAnalyzer(
#     device="cpu",
#     batch_size=8,
#     verbose=True,
#     output_file_path=config.DATA_DIR / "temp" / "news_sentiment_eth.csv",
# )
# start_date = "2017-01-01"
# end_date = "2022-12-31"

# result_df_eth = analyzer_eth.analyze_articles(df_news_eth, start_date, end_date)

In [62]:
results_df_eth = pd.read_csv(
    config.DATA_DIR / "processed" / "news_articles" / "news_sentiment_eth.csv",
    parse_dates=["date"],
    usecols=["date", "average_score"],
    index_col="date",
)

df_eth = df_eth.merge(results_df_eth, left_index=True, right_index=True, how="inner")
df_eth = df_eth.rename(columns={"average_score": "gnews_average_score"})

df_eth["gnews_average_score"] = df_eth["gnews_average_score"].replace(0, np.nan)

for column in ["gnews_average_score", "goldValue", "sp500Value"]:
    rolling_mean = df_eth[column].rolling(10, min_periods=1).mean()

    df_eth[column] = df_eth[column].fillna(rolling_mean)
    if pd.isna(df_eth[column].iloc[0]):
        df_eth.loc[df_eth.index[0], column] = df_eth[column].dropna().iloc[0]

df_eth["gnews_average_score"] = df_eth["gnews_average_score"].round(2)

In [63]:
df_eth.to_csv(config.DATA_DIR / "processed" / "crypto_prices" / "eth.csv")

# Reddit


In [2]:
from src.models.reddit_analyzer import RedditAnalyzer
from config import config

with RedditAnalyzer(config.DATABASE_URL) as analyzer:
    print(f"Total posts: {analyzer.get_total_posts()}")
    print(f"Total comments: {analyzer.get_total_comments()}")

Total posts: 942980
Total comments: 11161264


In [6]:
from src.processing.reddit_db_parser import get_daily_reddit_data

df_reddit_btc = get_daily_reddit_data(start_date="2017-01-01", end_date="2022-12-31", subreddit="bitcoin")
df_reddit_eth = get_daily_reddit_data(start_date="2017-01-01", end_date="2022-12-31", subreddit="ethereum")

Number of days with no posts: 68
Number of days with no comments: 1
Number of days with no posts: 163
Number of days with no comments: 15


In [8]:
df_reddit_btc.head()
df_reddit_eth.tail()

Unnamed: 0,date,postNumber,commentNumber
2186,2022-12-27,56.0,307.0
2187,2022-12-28,53.0,407.0
2188,2022-12-29,48.0,1201.0
2189,2022-12-30,42.0,217.0
2190,2022-12-31,35.0,0.0
