In [1]:
import sys
import hopsworks
import os
sys.path.append("../..")

import utils.reddit_scraper as utils
import pandas as pd
from datetime import datetime

Loading CryptoBERT...


# Configuration

In [None]:
from datetime import datetime, timedelta
import time

# Subreddit name -> ticker symbol used to label the posts scraped from it.
SUBREDDITS = {"solana": "SOL"}

# Bounds of the backfill window as 'YYYY-MM-DD' strings (the format parsed by
# get_weekly_ranges).
START_DATE, END_DATE = "2020-01-01", "2025-12-31"

def get_weekly_ranges(start, end, step_days=1):
    """Split the window from ``start`` to ``end`` into consecutive date ranges.

    Despite its name, this function has always stepped one day at a time;
    that remains the default. Pass ``step_days=7`` for true weekly ranges.

    Args:
        start: First date of the window, formatted 'YYYY-MM-DD'.
        end: Last date of the window, formatted 'YYYY-MM-DD'.
        step_days: Length of each range in days. Must be at least 1.

    Returns:
        A list of ``(range_start, range_end)`` tuples of 'YYYY-MM-DD' strings.
        Consecutive ranges share their boundary date, and the final range is
        clamped to ``end``. The list is empty when ``start`` is not before
        ``end``.

    Raises:
        ValueError: If ``step_days`` is smaller than 1, or if a date string
            does not match 'YYYY-MM-DD'.
    """
    # A zero or negative step would never advance and loop forever.
    if step_days < 1:
        raise ValueError(f"step_days must be >= 1, got {step_days}")

    date_format = '%Y-%m-%d'
    step = timedelta(days=step_days)
    current = datetime.strptime(start, date_format)
    end_dt = datetime.strptime(end, date_format)

    ranges = []
    while current < end_dt:
        # Clamp the last range so it never extends past the requested end.
        range_end = min(current + step, end_dt)
        ranges.append((
            current.strftime(date_format),
            range_end.strftime(date_format),
        ))
        current = range_end

    return ranges

# Generate date ranges
date_ranges = get_weekly_ranges(START_DATE, END_DATE)

# get_weekly_ranges steps one day at a time, so report the count as date
# ranges rather than weeks.
print(f"üìÖ Will fetch {len(date_ranges)} date ranges of data")
# An empty window (START_DATE not before END_DATE) yields no ranges to index.
if date_ranges:
    print(f"   From {date_ranges[0][0]} to {date_ranges[-1][1]}")

In [None]:
# Fetch all posts, one date range at a time

all_posts = []

for subreddit in SUBREDDITS:
    print(f"\n==================== Subreddit: r/{subreddit} ====================")
    for i, (start, end) in enumerate(date_ranges, 1):
        print(f"\nüìÜ [{i}/{len(date_ranges)}] Fetching {start} to {end}...")

        # The scraper module is imported under the alias `utils`; the name
        # `reddit_scraper` is never bound, so calling through it raises
        # NameError on a fresh kernel.
        weekly_posts = utils.fetch_pushshift_posts(
            subreddit=subreddit,
            start_date=start,
            end_date=end,
            limit=20,
        )

        if weekly_posts:
            all_posts.extend(weekly_posts)
            print(f"   ‚úÖ Added {len(weekly_posts)} posts (Total: {len(all_posts)})")
        else:
            print("   ‚ö†Ô∏è No posts found for this period")

        # Rate limiting (be nice to API)
        time.sleep(1)

print("\nüéâ Backfill complete!")
print(f"üìä Total posts fetched: {len(all_posts)}")

In [None]:
# Fail with a clear message instead of an opaque KeyError on the column
# selection below when the fetch step returned nothing.
if not all_posts:
    raise ValueError("No posts were fetched; nothing to save.")

# Build the final frame in one chain. Assigning into a column-subset frame
# (as the step-by-step version did) can trigger SettingWithCopyWarning on
# pandas versions without copy-on-write; `.assign` always returns a new frame.
df = (
    pd.DataFrame(all_posts)
    .loc[:, ['subreddit', 'title', 'selftext', 'score', 'num_comments', 'created_utc']]
    .assign(
        # Replace the subreddit name with its ticker symbol.
        crypto=lambda posts: posts['subreddit'].map(SUBREDDITS),
        # `created_utc` is converted from epoch seconds.
        created_utc=lambda posts: pd.to_datetime(posts['created_utc'], unit='s'),
    )
    .loc[:, ['crypto', 'title', 'selftext', 'score', 'num_comments', 'created_utc']]
)

# Save
output_file = 'reddit_posts_backfill.csv'
df.to_csv(output_file, index=False)

print("\n‚úÖ SUCCESS!")
print(f"üìä Total posts: {len(df)}")
print(f"üíæ Saved to: {output_file}")
print("\nPosts par crypto:")
print(df['crypto'].value_counts())

In [None]:
import pandas as pd

# Reload the saved backfill. The CSV stores `created_utc` as datetime strings
# (it was converted before saving), so `parse_dates` handles the parsing.
df = pd.read_csv('reddit_posts_backfill.csv', parse_dates=['created_utc'])

# Enforce a datetime dtype without `unit='s'`: the values are not epoch
# seconds at this point, and passing a unit would fail if the column had
# been left as strings.
df['created_utc'] = pd.to_datetime(df['created_utc'])

# Treat Reddit's placeholder bodies and missing bodies as empty text.
df['selftext'] = df['selftext'].replace(['[deleted]', '[removed]'], '').fillna('')

df['crypto'] = df['crypto'].astype(str)
# Fill missing titles first so astype(str) does not produce the literal 'nan'.
df['title'] = df['title'].fillna('').astype(str)
df['selftext'] = df['selftext'].astype(str)

df.dtypes

In [None]:
# Connect to Hopsworks and get a handle on the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# get_or_create keeps this cell re-runnable: it reuses the feature group when
# it already exists, whereas create_feature_group followed by save fails on a
# second run.
# NOTE(review): `created_utc` is the only primary key, so two posts created in
# the same second may collide — confirm whether a post id should be kept
# upstream and used as the key instead.
fg = fs.get_or_create_feature_group(
    name="reddit_posts_backfill",
    version=1,
    description="Reddit posts backfill data",
    primary_key=["created_utc"],
    event_time="created_utc",
    online_enabled=False,
)

# insert() registers the feature group on first use and writes the rows.
fg.insert(df)

In [2]:
# Open a Hopsworks session and get the project's feature store handle.
project = hopsworks.login()
fs = project.get_feature_store()

# Look up version 1 of the backfill feature group and read it into a DataFrame.
fg = fs.get_feature_group(name="reddit_posts_backfill", version=1)
df = fg.read()

# Show the first rows as the cell output.
df.head()

2026-01-04 00:17:56,076 INFO: Initializing external client
2026-01-04 00:17:56,079 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-04 00:17:58,198 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279131
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.96s) 


Unnamed: 0,crypto,title,selftext,score,num_comments,created_utc
0,SOL,Is anyone staking Sol via Marinade&gt;Saber&gt...,,4,9,2022-02-28 17:44:51+00:00
1,SOL,SOLANA rocks...,What do you guys think about SOL? From the pe...,0,50,2022-08-08 14:48:47+00:00
2,SOL,"Star Atlas' ""Never Alone"": An Immersive Journe...",,12,1,2023-05-12 12:08:07+00:00
3,SOL,Microsoft App Store approved a scammer app dis...,A fake @Ledger Live app on the official @Micro...,1,1,2023-11-05 07:00:00+00:00
4,SOL,SugarRush - An open-source free web app projec...,,1,1,2022-08-23 18:11:23+00:00


# Create the aggregated sentiment feature

In [3]:
# Build the sentiment table from the posts with the project helper.
df_sentiment = utils.create_sentiment_table(df)

# Save first so the result is on disk before anything else runs.
df_sentiment.to_csv('reddit_sentiment_backfill.csv', index=False)

# Only a cell's last expression is displayed, so the preview goes at the end;
# placed before the save it showed nothing.
df_sentiment.head()

Using device: cuda
   Progress: 0/28750 (0%)
   Progress: 320/28750 (1%)
   Progress: 640/28750 (2%)
   Progress: 960/28750 (3%)
   Progress: 1280/28750 (4%)
   Progress: 1600/28750 (5%)
   Progress: 1920/28750 (6%)
   Progress: 2240/28750 (7%)
   Progress: 2560/28750 (8%)
   Progress: 2880/28750 (10%)
   Progress: 3200/28750 (11%)
   Progress: 3520/28750 (12%)
   Progress: 3840/28750 (13%)
   Progress: 4160/28750 (14%)
   Progress: 4480/28750 (15%)
   Progress: 4800/28750 (16%)
   Progress: 5120/28750 (17%)
   Progress: 5440/28750 (18%)
   Progress: 5760/28750 (20%)
   Progress: 6080/28750 (21%)
   Progress: 6400/28750 (22%)
   Progress: 6720/28750 (23%)
   Progress: 7040/28750 (24%)
   Progress: 7360/28750 (25%)
   Progress: 7680/28750 (26%)
   Progress: 8000/28750 (27%)
   Progress: 8320/28750 (28%)
   Progress: 8640/28750 (30%)
   Progress: 8960/28750 (31%)
   Progress: 9280/28750 (32%)
   Progress: 9600/28750 (33%)
   Progress: 9920/28750 (34%)
   Progress: 10240/28750 (35%)
   Pr

In [4]:
# Preview the first rows of the sentiment table built in the previous step.
df_sentiment.head()

Unnamed: 0,timestamp,sentiment
0,2022-02-28 17:44:51,0
1,2022-08-08 14:48:47,0
2,2023-05-12 12:08:07,0
3,2023-11-05 07:00:00,0
4,2022-08-23 18:11:23,0


In [2]:
# Reload the sentiment rows saved to CSV, parsing `timestamp` as datetimes,
# then aggregate them with the project helper.
df_sentiment = pd.read_csv(
    'reddit_sentiment_backfill.csv',
    parse_dates=['timestamp'],
)
df_agg_sentiment = utils.agregate_sentiment_table(df_sentiment)

# Show the first aggregated rows as the cell output.
df_agg_sentiment.head()

Unnamed: 0,date,mean_sentiment,count
0,2020-01-03,0.0,1
1,2020-01-11,0.0,1
2,2020-01-19,0.0,1
3,2020-02-05,0.333333,3
4,2020-02-06,0.0,1


In [3]:
# Write the aggregated sentiment table to CSV; index=False leaves out the
# DataFrame index column.
df_agg_sentiment.to_csv('reddit_aggregated_sentiment_backfill.csv', index=False)