In [1]:
import duckdb
import json
import threading
import time
from create_database import (
    create_lookup_table,
    create_subreddit_tables,
    create_threads_table,
)
from stats import (
    create_row_counts_table,
    get_depth_distribution,
    get_number_of_threads,
    get_thread_score_distribution,
    get_subreddit_distribution,
    table_stats,
    calculate_weighted_average,
    get_thread_lengths,
    get_author_distribution,
    log_with_resources,
)

from filter_database import make_threads_unique, filter_threads

# Module-level stop flag for the background monitor. The monitoring loop keeps
# running while this is truthy; rebinding it to False asks the loop to finish.
monitoring_active = True


def continuous_resource_monitor(interval=1800, poll_interval=0.5):
    """Log resource usage every ``interval`` seconds until monitoring is stopped.

    The loop runs while the module-level ``monitoring_active`` flag is truthy.
    Instead of a single uninterruptible ``time.sleep(interval)``, the wait is
    split into slices of at most ``poll_interval`` seconds so the flag is
    re-checked frequently. Without this, a caller that clears the flag and then
    joins the thread could block for up to a full ``interval`` (30 minutes with
    the default) before the loop noticed.

    Args:
        interval: Seconds between two consecutive log entries.
        poll_interval: Upper bound, in seconds, on how long the loop sleeps
            before re-reading ``monitoring_active``. Should be positive.
    """
    while monitoring_active:
        log_with_resources("Monitoring during execution")
        # A monotonic deadline keeps the logging cadence independent of
        # wall-clock adjustments and of how many slices were slept.
        wake_at = time.monotonic() + interval
        while monitoring_active:
            remaining = wake_at - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_interval, remaining))


# Run the resource monitor in the background, logging every 10 seconds.
# daemon=True means this thread will not keep the process alive once the
# main thread exits.
monitor_thread = threading.Thread(
    target=continuous_resource_monitor,
    kwargs={"interval": 10},
    daemon=True,
)
monitor_thread.start()

# Open the on-disk DuckDB database and record resource usage before and after
# applying the session settings.
con = duckdb.connect("../data/database.db")
log_with_resources("initial resources")
# NOTE(review): verify_parallelism looks like a development/verification
# pragma — confirm it is intended for regular runs.
for session_setting in ("SET threads TO 20;", "PRAGMA verify_parallelism;"):
    con.execute(session_setting)
log_with_resources("threads set to 20")

[2025-04-08 08:53:22] Monitoring during execution | CPU: 100.0% | Mem: 176.4 MB | Threads: 42
[2025-04-08 08:53:23] initial resources | CPU: 3.0% | Mem: 177.4 MB | Threads: 57
[2025-04-08 08:53:24] threads set to 20 | CPU: 1.0% | Mem: 178.6 MB | Threads: 61


[2025-04-08 08:53:33] Monitoring during execution | CPU: 0.0% | Mem: 185.2 MB | Threads: 61
[2025-04-08 08:53:44] Monitoring during execution | CPU: 0.0% | Mem: 185.2 MB | Threads: 61
[2025-04-08 08:53:55] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:54:06] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:54:17] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:54:28] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:54:39] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:54:50] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:55:01] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:55:12] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | Threads: 61
[2025-04-08 08:55:23] Monitoring during execution | CPU: 0.0% | Mem: 186.3 MB | 

In [4]:
# Build the row-counts table through the open DuckDB connection; the saved
# output reports that the table was created and written to a file.
# NOTE(review): this cell's execution count (4) is higher than the next cell's
# (3), so the saved outputs were not produced in top-to-bottom order — verify
# with Restart & Run All.
create_row_counts_table(con)

[2025-04-08 08:55:56] Row counts table created and saved to file. | CPU: 0.0% | Mem: 186.3 MB | Threads: 61


In [3]:
# Build the lookup table through the open DuckDB connection.
# NOTE(review): the saved output for this cell is dated 2025-04-07 (a day
# before the other cells' output) and ends in "RuntimeError: Query
# interrupted", so it does not show a completed run — re-run and confirm the
# table exists before relying on the cells that follow.
create_lookup_table(con)

Valid hierarchical tables: ['posts', 'comments_to_posts', 'comments_to_comments_1', 'comments_to_comments_2', 'comments_to_comments_3', 'comments_to_comments_4', 'comments_to_comments_5', 'comments_to_comments_6', 'comments_to_comments_7', 'comments_to_comments_8', 'comments_to_comments_9', 'comments_to_comments_10', 'comments_to_comments_11', 'comments_to_comments_12', 'comments_to_comments_13', 'comments_to_comments_14', 'comments_to_comments_15', 'comments_to_comments_16', 'comments_to_comments_17', 'comments_to_comments_18', 'comments_to_comments_19', 'comments_to_comments_20', 'comments_to_comments_21', 'comments_to_comments_22', 'comments_to_comments_23', 'comments_to_comments_24', 'comments_to_comments_25', 'comments_to_comments_26', 'comments_to_comments_27', 'comments_to_comments_28', 'comments_to_comments_29', 'comments_to_comments_30', 'comments_to_comments_31', 'comments_to_comments_32', 'comments_to_comments_33', 'comments_to_comments_34', 'comments_to_comments_35', 'comme

[2025-04-07 16:41:09] Monitoring during execution | CPU: 286.9% | Mem: 1527.0 MB | Threads: 61
[2025-04-07 16:41:20] Monitoring during execution | CPU: 1441.8% | Mem: 8004.5 MB | Threads: 61
[2025-04-07 16:41:31] Monitoring during execution | CPU: 1185.6% | Mem: 7724.1 MB | Threads: 61
[2025-04-07 16:41:42] Monitoring during execution | CPU: 529.3% | Mem: 6486.1 MB | Threads: 61
[2025-04-07 16:41:53] Monitoring during execution | CPU: 0.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:04] Monitoring during execution | CPU: 0.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:15] Monitoring during execution | CPU: 1.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:26] Monitoring during execution | CPU: 0.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:37] Monitoring during execution | CPU: 0.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:48] Monitoring during execution | CPU: 0.0% | Mem: 6646.7 MB | Threads: 61
[2025-04-07 16:42:59] Monitoring during execution | CPU: 0.0

RuntimeError: Query interrupted

In [None]:
# Statistics for the full lookup table, followed by one weighted-average
# computation per named input.
table_stats("lookup_table", con)
for metric_name in ("thread_lengths", "thread_widths", "all_widths"):
    last_weighted_average = calculate_weighted_average(f"{metric_name}_lookup_table")
# End the cell on the final call's value, exactly as a cell whose last line is
# that call would, so any notebook display of it is unchanged.
last_weighted_average

In [None]:
# Build the threads table, passing "all_threads" as its name argument.
# NOTE(review): the following cells operate on "threads" rather than
# "all_threads" — presumably make_threads_unique derives "threads" from this
# table; confirm against filter_database.
create_threads_table(con, "all_threads")

In [None]:
# De-duplicate the "threads" table, then filter it.
# NOTE(review): later cells read a "filtered_threads" table, presumably
# written by filter_threads — confirm. num_authors=3 is presumably an
# author-count threshold; verify its exact meaning in filter_database.
make_threads_unique(con, "threads")
filter_threads(con, "threads", num_authors=3)

In [None]:
# Run every thread-level statistic on both the unfiltered and the filtered
# thread tables. Call order: for each statistic, "threads" first, then
# "filtered_threads".
thread_tables = ("threads", "filtered_threads")
thread_stat_functions = (
    get_depth_distribution,
    get_thread_lengths,
    get_number_of_threads,
    get_thread_score_distribution,
    get_subreddit_distribution,
    get_author_distribution,
)
for stat_function in thread_stat_functions:
    for table_name in thread_tables:
        last_stat_result = stat_function(table_name, con)
# End the cell on the final call's value, exactly as a cell whose last line is
# that call would, so any notebook display of it is unchanged.
last_stat_result

In [None]:
# Load the previously saved statistics and pick the five subreddits with the
# largest values in the saved thread subreddit distribution.
with open("../data/saved_stats.json", "r") as f:
    existing_data = json.load(f)
distribution = existing_data["subreddit_distribution_threads"]
# sorted() is stable, so ties keep the order in which they appear in the file.
subreddits = sorted(distribution, key=distribution.get, reverse=True)[:5]

weighted_average_prefixes = ("thread_lengths", "thread_widths", "all_widths")
per_table_stat_functions = (
    get_depth_distribution,
    get_thread_lengths,
    get_number_of_threads,
    get_thread_score_distribution,
    get_author_distribution,
)

# For each selected subreddit: build its tables, then repeat the lookup-table
# and thread-table statistics on the subreddit-specific names.
for subreddit in subreddits:
    create_subreddit_tables(con, subreddit)
    table_stats(f"{subreddit}_lookup", con)
    for prefix in weighted_average_prefixes:
        calculate_weighted_average(f"{prefix}_{subreddit}_lookup")
    for stat_function in per_table_stat_functions:
        # Unfiltered table first, then its filtered counterpart.
        for table_name in (f"{subreddit}_threads", f"filtered_{subreddit}_threads"):
            stat_function(table_name, con)

In [5]:
# Shut down: stop the background monitor, then commit and close the database.
# Clearing the flag asks continuous_resource_monitor to leave its loop; the
# loop only re-reads the flag when it wakes from sleeping.
monitoring_active = False
monitor_thread.join()  # block until the monitor thread has noticed the flag and exited
log_with_resources("Script finished")
con.commit()
con.close()

[2025-04-08 08:56:40] Script finished | CPU: 0.0% | Mem: 186.4 MB | Threads: 60
