In [1]:
import duckdb
import threading
import time
import duckdb
import json
import threading
import time
import os
import pandas as pd
from create_database import (
    add_initial_tables,
    add_comments_to_comments_tables_old,
    add_posts_table,
    add_comments_working_table,
    add_initial_comments_tables,
    add_comments_to_comments_tables,
    cascading_comment_deletion,
    create_lookup_table,
    create_subreddit_tables,
    create_threads_table,
)
from stats import (
    create_row_counts_table,
    get_depth_distribution,
    get_number_of_threads,
    get_thread_score_distribution,
    get_subreddit_distribution,
    table_stats,
    calculate_weighted_average,
    get_thread_lengths,
    get_author_distribution,
    log_with_resources,
)

from filter_database import make_threads_unique, filter_threads, filter_by_score
from get_samples import create_subset_tables

monitoring_active = True


def continuous_resource_monitor(interval=1800):
    while monitoring_active:
        log_with_resources("Monitoring during execution")
        time.sleep(interval)


# Start the background monitoring thread
monitor_thread = threading.Thread(target=continuous_resource_monitor, args=(10,))
monitor_thread.daemon = True  # will exit when main thread exits
monitor_thread.start()

if os.path.exists("../data/database_sample.db"):
    os.remove("../data/database_sample.db")
if os.path.exists("../data/saved_stats.json"):
    os.remove("../data/saved_stats.json")
con = duckdb.connect("../data/database_sample.db")
log_with_resources("initial resources")
con.execute("SET threads TO 5;")
con.execute("PRAGMA verify_parallelism;")
con.execute("PRAGMA memory_limit='32GB';")
log_with_resources("threads set to 20")

[2025-05-03 10:59:07] Monitoring during execution | CPU: 10.0% | Mem: 136.1 MB | Threads: 57
[2025-05-03 10:59:07] initial resources | CPU: 0.0% | Mem: 136.1 MB | Threads: 57
[2025-05-03 10:59:07] threads set to 20 | CPU: 0.0% | Mem: 138.8 MB | Threads: 46


[2025-05-03 10:59:17] Monitoring during execution | CPU: 10.0% | Mem: 164.2 MB | Threads: 31
[2025-05-03 10:59:27] Monitoring during execution | CPU: 0.0% | Mem: 157.2 MB | Threads: 31
[2025-05-03 10:59:37] Monitoring during execution | CPU: 0.0% | Mem: 157.2 MB | Threads: 31
[2025-05-03 10:59:47] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 10:59:58] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:08] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:18] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:28] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:38] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:48] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB | Threads: 31
[2025-05-03 11:00:58] Monitoring during execution | CPU: 0.0% | Mem: 157.1 MB |

con = duckdb.connect("../database_backup.db")
# print all tables in the database
print(con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())
create_threads_table(con, "unfiltered_threads")

In [2]:
def save_thread_to_csv(thread_row, con):
    """
    Takes a thread row and saves its associated data to two CSV files:
    posts_samples.csv and comments_samples.csv

    Args:
        thread_row: A DataFrame returned from get_random_thread_details
        db_path: Path to the DuckDB database
    """

    # Convert thread_row to DataFrame if it isn't already
    if not isinstance(thread_row, pd.DataFrame):
        thread_row = pd.DataFrame([thread_row])

    # Get all columns from the thread
    columns = thread_row.columns.tolist()

    # Initialize DataFrames for posts and comments
    posts_data = None
    comments_data = []

    # Process each column
    for col in columns:
        id_value = thread_row[col].iloc[0]  # Get the first (and only) value

        # Skip if the ID is null or if it's not an ID column
        if pd.isna(id_value) or col in [
            "id",
            "created_utc",
            "score",
            "num_comments",
        ]:
            continue

        # Get the full row from the corresponding table
        try:
            row_data = con.execute(
                f"SELECT * FROM {col} WHERE id = '{id_value}'"
            ).fetchdf()

            if not row_data.empty:
                if col == "posts":
                    # Save posts data
                    posts_data = row_data
                else:
                    # Accumulate comments data
                    comments_data.append(row_data)
        except Exception as e:
            print(f"Error processing {col} with ID {id_value}: {e}")

    # Save posts data if we have any
    if posts_data is not None:
        # Read existing data if file exists
        if os.path.exists("../data/posts_samples.csv"):
            existing_posts = pd.read_csv("../data/posts_samples.csv")
            posts_data = pd.concat([existing_posts, posts_data], ignore_index=True)

        # Write all data with headers
        posts_data.to_csv("../data/posts_samples.csv", index=False)

    # Save comments data if we have any
    if comments_data:
        # Concatenate all new comment rows
        new_comments = pd.concat(comments_data, ignore_index=True)

        # Read existing data if file exists
        if os.path.exists("../data/comments_samples.csv"):
            existing_comments = pd.read_csv("../data/comments_samples.csv")
            new_comments = pd.concat(
                [existing_comments, new_comments], ignore_index=True
            )

        # Write all data with headers
        new_comments.to_csv("../data/comments_samples.csv", index=False)

if os.path.exists("../data/database_sample.db"):
    os.remove("../data/database_sample.db")
if os.path.exists("../data/posts_samples.csv"):
    os.remove("../data/posts_samples.csv")
if os.path.exists("../data/comments_samples.csv"):
    os.remove("../data/comments_samples.csv")
if os.path.exists("../data/saved_stats.json"):
    os.remove("../data/saved_stats.json")

con = duckdb.connect("../database_backup.db")
for _ in range(3):
    thread = get_random_thread_details("unfiltered_threads", con)
    save_thread_to_csv(thread, "../database_backup.db")
"""for _ in range(1):
    thread = get_random_thread_details("threads", con)
    save_thread_to_csv(thread, "../database_backup.db")
for _ in range(4):
    thread = get_random_thread_details("filtered_threads", con)
    save_thread_to_csv(thread, "../database_backup.db")"""
con.close()

# Add ab_ as a prefix to the column parent_id in the comments_samples.csv file
df = pd.read_csv("../data/comments_samples.csv")
df["parent_id"] = df["parent_id"].str[3:]
df.to_csv("../data/comments_samples.csv", index=False)

In [3]:
con = duckdb.connect("../data/database_sample.db")
add_posts_table(con, "../data/posts_samples.csv")
add_comments_working_table(con, "../data/comments_samples.csv")
add_initial_comments_tables(con)
add_comments_to_comments_tables(con)

[2025-05-03 10:59:07] posts table created successfully. | CPU: 0.0% | Mem: 143.2 MB | Threads: 46
Batch 1: Inserted 60 rows (Total: 60)
Finished loading 60 total rows
Inserted matching comments into comments_to_posts table successfully.
Successfully moved and deleted matching comments
Inserted matching comments into comments_to_comments_1 table successfully.
Successfully moved and deleted matching comments
Found 6 comments for level 2
Created level 2 table and deleted processed rows
Found 4 comments for level 3
Created level 3 table and deleted processed rows
Found 4 comments for level 4
Created level 4 table and deleted processed rows
Found 4 comments for level 5
Created level 5 table and deleted processed rows
Found 3 comments for level 6
Created level 6 table and deleted processed rows
Found 3 comments for level 7
Created level 7 table and deleted processed rows
Found 3 comments for level 8
Created level 8 table and deleted processed rows
Found 3 comments for level 9
Created level 9

In [4]:
# cascading_comment_deletion(con, 10)

In [5]:
create_row_counts_table(con)
df = con.execute("SELECT * FROM row_counts").fetchdf()

df = df.sort_values(by="row_count", ascending=False)
# Pretty-print the DataFrame
print(df.to_string(index=False))
"""
# Row count of comments to posts
total_rows = df.loc[df["table_name"] == "comments_to_posts", "row_count"].values[0]
print(f"Total number of threads: {total_rows}")
# Find the table after which there are only 5% of the rows left
for i in range(1, len(df) - 1):
    if (
        df.loc[df["table_name"] == f"comments_to_comments_{i}", "row_count"].values[0]
        / total_rows
        < 0.3
    ):
        # Get the last character of the table name
        table_number = i
        break
print(
    f"Table after which there are only 5% of the rows left: {f"comments_to_comments_{table_number}"}"
)
# Drop the tables after this table
i = table_number + 1
while True:
    try:
        con.execute(f"DROP TABLE comments_to_comments_{i}")
        i += 1
    except Exception as e:
        print(f"comments_to_comments_{i} does not exist")
        break

cascading_comment_deletion(con, table_number)
create_row_counts_table(con)
df = con.execute("SELECT * FROM row_counts").fetchdf()

df = df.sort_values(by="row_count", ascending=False)
# Pretty-print the DataFrame
print(df.to_string(index=False))
"""

[2025-05-03 10:59:08] Row counts table created and saved to file. | CPU: 0.0% | Mem: 147.3 MB | Threads: 46
             table_name  row_count
                  posts         14
      comments_to_posts         12
 comments_to_comments_1          9
 comments_to_comments_2          6
 comments_to_comments_4          4
 comments_to_comments_3          4
 comments_to_comments_5          4
 comments_to_comments_7          3
 comments_to_comments_9          3
 comments_to_comments_6          3
 comments_to_comments_8          3
comments_to_comments_10          2


'\n# Row count of comments to posts\ntotal_rows = df.loc[df["table_name"] == "comments_to_posts", "row_count"].values[0]\nprint(f"Total number of threads: {total_rows}")\n# Find the table after which there are only 5% of the rows left\nfor i in range(1, len(df) - 1):\n    if (\n        df.loc[df["table_name"] == f"comments_to_comments_{i}", "row_count"].values[0]\n        / total_rows\n        < 0.3\n    ):\n        # Get the last character of the table name\n        table_number = i\n        break\nprint(\n    f"Table after which there are only 5% of the rows left: {f"comments_to_comments_{table_number}"}"\n)\n# Drop the tables after this table\ni = table_number + 1\nwhile True:\n    try:\n        con.execute(f"DROP TABLE comments_to_comments_{i}")\n        i += 1\n    except Exception as e:\n        print(f"comments_to_comments_{i} does not exist")\n        break\n\ncascading_comment_deletion(con, table_number)\ncreate_row_counts_table(con)\ndf = con.execute("SELECT * FROM row_counts

In [6]:
create_lookup_table(con)

[2025-05-03 10:59:08] lookup_table created successfully. | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_posts dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_1 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_2 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_3 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_4 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_5 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_6 dropped | CPU: 10.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:08] Temp table comments_to_comments_7 dropped | CPU: 0.0% | Mem: 164.0 MB | Threads: 46
[2025-05-03 10:59:09] Temp table comments_to_comments_8 d

In [7]:
table_stats("lookup_table", con)

[2025-05-03 10:59:09] Processing table lookup_table with 14 rows. | CPU: 0.0% | Mem: 164.0 MB | Threads: 46


[2025-05-03 10:59:09] Finished parallel processing of rows. | CPU: 0.0% | Mem: 164.5 MB | Threads: 31
[2025-05-03 10:59:09] Thread lengths and widths statistics for lookup_table saved to file. | CPU: 0.0% | Mem: 164.5 MB | Threads: 31


In [8]:
calculate_weighted_average("thread_lengths_lookup_table")
calculate_weighted_average("thread_widths_lookup_table")
calculate_weighted_average("all_widths_lookup_table")

[2025-05-03 10:59:09] Weighted average for thread_lengths_lookup_table calculated and saved to file | CPU: 0.0% | Mem: 149.7 MB | Threads: 31
[2025-05-03 10:59:09] Weighted average for thread_widths_lookup_table calculated and saved to file | CPU: 0.0% | Mem: 149.7 MB | Threads: 31
[2025-05-03 10:59:10] Weighted average for all_widths_lookup_table calculated and saved to file | CPU: 10.0% | Mem: 149.7 MB | Threads: 31


In [9]:
create_threads_table(con, "all_threads")

[2025-05-03 10:59:10] Created all_threads table successfully. | CPU: 0.0% | Mem: 161.5 MB | Threads: 31


In [10]:
make_threads_unique(con, "threads")

Columns in lookup_table: ['posts', 'comments_to_posts', 'comments_to_comments_1', 'comments_to_comments_2', 'comments_to_comments_3', 'comments_to_comments_4', 'comments_to_comments_5', 'comments_to_comments_6', 'comments_to_comments_7', 'comments_to_comments_8', 'comments_to_comments_9', 'comments_to_comments_10']
[2025-05-03 10:59:10] Constructed dynamic SQL parts for non-NULL counts and columns. | CPU: 0.0% | Mem: 161.5 MB | Threads: 31
[2025-05-03 10:59:10] Parallel query execution complete. Retrieved 12 rows. | CPU: 0.0% | Mem: 161.5 MB | Threads: 31
[2025-05-03 10:59:10] Created/replaced table threads. | CPU: 0.0% | Mem: 161.5 MB | Threads: 31
[2025-05-03 10:59:10] Inserted unioned results into threads. | CPU: 0.0% | Mem: 161.9 MB | Threads: 31


In [11]:
"""
add_initial_tables(con, "../data/posts.csv", "../data/comments.csv")
add_comments_to_comments_tables_old(con, "../data/comments.csv")
for table in con.execute("SHOW TABLES").fetchdf()["name"]:
    print(f"Table: {table}")
    print(con.execute(f"SELECT COUNT(*) FROM {table}").fetchdf())
    print("\n")

create_row_counts_table(con)
df = con.execute("SELECT * FROM row_counts").fetchdf()

df = df.sort_values(by="row_count", ascending=False)
# Row count of comments to posts
total_rows = df.loc[df["table_name"] == "comments_to_posts", "row_count"].values[0]
print(f"Total number of threads: {total_rows}")
# Find the table after which there are only 5% of the rows left
for i in range(1, len(df) - 1):
    if (
        df.loc[df["table_name"] == f"comments_to_comments_{i}", "row_count"].values[0]
        / total_rows
        < 0.05
    ):
        # Get the last character of the table name
        table_number = i
        break
print(
    f"Table after which there are only 5% of the rows left: {f"comments_to_comments_{table_number}"}"
)
# Drop the tables after this table
i = table_number + 1
while True:
    try:
        con.execute(f"DROP TABLE comments_to_comments_{i}")
        i += 1
    except Exception as e:
        print(f"comments_to_comments_{i} does not exist")
        break

cascading_comment_deletion(con, table_number)
create_row_counts_table(con)

create_lookup_table(con)
create_threads_table(con=con, threads_table="all_threads")

table_stats("lookup_table", con)
calculate_weighted_average("thread_lengths_lookup_table")
calculate_weighted_average("thread_widths_lookup_table")
calculate_weighted_average("all_widths_lookup_table")

make_threads_unique(con, "threads")"""

filter_threads(con, "threads", "threads", num_authors=None)

get_depth_distribution("threads", con)
get_thread_lengths("threads", con)
get_number_of_threads("threads", con)
get_thread_score_distribution("threads", con)
get_subreddit_distribution("threads", con)
get_author_distribution("threads", con)
calculate_weighted_average("depth_distribution_threads")
calculate_weighted_average("author_distribution_threads")
calculate_weighted_average("thread_score_distribution_threads")
calculate_weighted_average("thread_lengths_threads")

# Create subsets with 2,3,4,5 authors
filter_threads(con, "threads", "threads_2_authors", num_authors=2)
filter_threads(con, "threads", "threads_3_authors", num_authors=3)
filter_threads(con, "threads", "threads_4_authors", num_authors=4)
filter_threads(con, "threads", "threads_5_authors", num_authors=5)
for i in range(2, 6):
    get_depth_distribution(f"threads_{i}_authors", con)
    get_thread_lengths(f"threads_{i}_authors", con)
    get_number_of_threads(f"threads_{i}_authors", con)
    get_thread_score_distribution(f"threads_{i}_authors", con)
    get_subreddit_distribution(f"threads_{i}_authors", con)
    calculate_weighted_average(f"depth_distribution_threads_{i}_authors")
    calculate_weighted_average(f"thread_score_distribution_threads_{i}_authors")
    calculate_weighted_average(f"thread_lengths_threads_{i}_authors")

filter_by_score(con, "threads")
for table in ["threads_viral", "threads_non_viral"]:
    get_depth_distribution(table, con)
    get_thread_lengths(table, con)
    get_number_of_threads(table, con)
    get_thread_score_distribution(table, con)
    get_subreddit_distribution(table, con)
    calculate_weighted_average(f"depth_distribution_{table}")
    calculate_weighted_average(f"author_distribution_{table}")
    calculate_weighted_average(f"thread_score_distribution_{table}")
    calculate_weighted_average(f"thread_lengths_{table}")

with open("../data/saved_stats.json", "r") as f:
    existing_data = json.load(f)
distribution = existing_data["subreddit_distribution_threads"]
subreddits = [
    key
    for key, value in sorted(distribution.items(), key=lambda x: x[1], reverse=True)[:5]
]
for subreddit in subreddits:
    create_subreddit_tables(con, subreddit)
    table_stats(f"{subreddit}_lookup", con)
    calculate_weighted_average(f"thread_lengths_{subreddit}_lookup")
    calculate_weighted_average(f"thread_widths_{subreddit}_lookup")
    calculate_weighted_average(f"all_widths_{subreddit}_lookup")
    get_depth_distribution(f"{subreddit}_threads", con)
    get_thread_lengths(f"{subreddit}_threads", con)
    get_number_of_threads(f"{subreddit}_threads", con)
    get_thread_score_distribution(f"{subreddit}_threads", con)
    get_author_distribution(f"{subreddit}_threads", con)
    calculate_weighted_average(f"depth_distribution_{subreddit}_threads")
    calculate_weighted_average(f"author_distribution_{subreddit}_threads")
    calculate_weighted_average(f"thread_score_distribution_{subreddit}_threads")
    calculate_weighted_average(f"thread_lengths_{subreddit}_threads")

[2025-05-03 10:59:13] Deleted 2 rows with deleted content from threads. | CPU: 0.0% | Mem: 153.5 MB | Threads: 31
[2025-05-03 10:59:13] Depth distribution for threads saved to file. | CPU: 0.0% | Mem: 158.0 MB | Threads: 31
[2025-05-03 10:59:13] Thread lengths for threads calculated and saved to file. | CPU: 0.0% | Mem: 158.0 MB | Threads: 31
[2025-05-03 10:59:13] Number of threads for threads saved to file. | CPU: 0.0% | Mem: 158.0 MB | Threads: 31
[2025-05-03 10:59:14] Thread score distribution for threads saved to file. | CPU: 0.0% | Mem: 161.8 MB | Threads: 31
[2025-05-03 10:59:14] Subreddit distribution for threads saved to file. | CPU: 0.0% | Mem: 162.4 MB | Threads: 31
[2025-05-03 10:59:14] Author distribution for threads calculated and saved to file. | CPU: 0.0% | Mem: 172.1 MB | Threads: 31
[2025-05-03 10:59:14] Weighted average for depth_distribution_threads calculated and saved to file | CPU: 0.0% | Mem: 172.1 MB | Threads: 31
[2025-05-03 10:59:14] Weighted average for autho

In [11]:
create_subset_tables(con, "threads")

In [12]:
filter_threads(con, "threads", "threads", num_authors=None)

[2025-04-28 15:11:08] Created empty filtered_threads table. | CPU: 0.0% | Mem: 156.0 MB | Threads: 34
[2025-04-28 15:11:09] Prefetched ID-to-author and ID-to-content mappings. | CPU: 0.0% | Mem: 156.3 MB | Threads: 34
[2025-04-28 15:11:09] Fetched 12 rows from threads. | CPU: 0.0% | Mem: 156.3 MB | Threads: 34
[2025-04-28 15:11:09] Partitioned rows into 1 chunks. | CPU: 0.0% | Mem: 156.3 MB | Threads: 34
[2025-04-28 15:11:09] Filtered rows: 2 valid threads remain. | CPU: 0.0% | Mem: 155.5 MB | Threads: 34
[2025-04-28 15:11:09] Inserted 2 valid rows into filtered_threads. | CPU: 0.0% | Mem: 155.8 MB | Threads: 34


In [13]:
create_subset_tables(con, "filtered_threads")

In [14]:
get_depth_distribution("threads", con)

[2025-04-28 15:11:09] Depth distribution for threads saved to file. | CPU: 0.0% | Mem: 162.9 MB | Threads: 34
[2025-04-28 15:11:09] Depth distribution for filtered_threads saved to file. | CPU: 0.0% | Mem: 164.6 MB | Threads: 34


In [15]:
get_thread_lengths("threads", con)

[2025-04-28 15:11:09] Thread lengths for threads calculated and saved to file. | CPU: 0.0% | Mem: 164.6 MB | Threads: 34
[2025-04-28 15:11:10] Thread lengths for filtered_threads calculated and saved to file. | CPU: 0.0% | Mem: 164.6 MB | Threads: 34


In [16]:
get_number_of_threads("threads", con)

[2025-04-28 15:11:10] Number of threads for threads saved to file. | CPU: 0.0% | Mem: 164.6 MB | Threads: 34
[2025-04-28 15:11:10] Number of threads for filtered_threads saved to file. | CPU: 0.0% | Mem: 164.6 MB | Threads: 34


In [17]:
get_thread_score_distribution("threads", con)

[2025-04-28 15:11:10] Constructed dynamic SQL parts for score calculation. | CPU: 0.0% | Mem: 164.8 MB | Threads: 34
[2025-04-28 15:11:10] Parallel partition queries complete; merging results. | CPU: 0.0% | Mem: 163.3 MB | Threads: 34
[2025-04-28 15:11:10] Thread score distribution for threads saved to file. | CPU: 10.0% | Mem: 160.5 MB | Threads: 34
[2025-04-28 15:11:10] Constructed dynamic SQL parts for score calculation. | CPU: 10.0% | Mem: 160.5 MB | Threads: 34
[2025-04-28 15:11:11] Parallel partition queries complete; merging results. | CPU: 0.0% | Mem: 160.5 MB | Threads: 34
[2025-04-28 15:11:11] Thread score distribution for filtered_threads saved to file. | CPU: 0.0% | Mem: 160.5 MB | Threads: 34


{72951: 1, 47: 1}

In [18]:
get_subreddit_distribution("threads", con)

[2025-04-28 15:11:11] Subreddit distribution for threads saved to file. | CPU: 0.0% | Mem: 162.2 MB | Threads: 34
[2025-04-28 15:11:11] Subreddit distribution for filtered_threads saved to file. | CPU: 0.0% | Mem: 162.7 MB | Threads: 34


In [19]:
get_author_distribution("threads", con)

[2025-04-28 15:11:11] Constructed comment columns list for author distribution. | CPU: 0.0% | Mem: 162.7 MB | Threads: 34
[2025-04-28 15:11:11] Parallel partition queries complete; merging results. | CPU: 0.0% | Mem: 162.0 MB | Threads: 34
[2025-04-28 15:11:11] Author distribution for threads calculated and saved to file. | CPU: 0.0% | Mem: 157.6 MB | Threads: 34
[2025-04-28 15:11:12] Constructed comment columns list for author distribution. | CPU: 0.0% | Mem: 155.8 MB | Threads: 34
[2025-04-28 15:11:12] Parallel partition queries complete; merging results. | CPU: 0.0% | Mem: 155.8 MB | Threads: 34
[2025-04-28 15:11:12] Author distribution for filtered_threads calculated and saved to file. | CPU: 0.0% | Mem: 155.8 MB | Threads: 34


{3: 2}

In [20]:
subreddits = ["AskReddit", "memes", "politics"]
for subreddit in subreddits:
    create_subreddit_tables(con, subreddit)
    table_stats(f"{subreddit}_lookup", con)
    calculate_weighted_average(f"thread_lengths_{subreddit}_lookup")
    calculate_weighted_average(f"thread_widths_{subreddit}_lookup")
    calculate_weighted_average(f"all_widths_{subreddit}_lookup")
    get_depth_distribution(f"{subreddit}_threads", con)
    get_thread_lengths(f"{subreddit}_threads", con)
    get_number_of_threads(f"{subreddit}_threads", con)
    get_thread_score_distribution(f"{subreddit}_threads", con)
    get_author_distribution(f"{subreddit}_threads", con)

[2025-04-28 15:11:12] Created tables for subreddit: AskReddit successfully. | CPU: 0.0% | Mem: 156.7 MB | Threads: 34
[2025-04-28 15:11:12] Thread lengths and widths statistics for AskReddit_lookup saved to file. | CPU: 0.0% | Mem: 156.8 MB | Threads: 34
[2025-04-28 15:11:12] Weighted average for thread_lengths_AskReddit_lookup calculated and saved to file | CPU: 0.0% | Mem: 156.8 MB | Threads: 34
[2025-04-28 15:11:12] Weighted average for thread_widths_AskReddit_lookup calculated and saved to file | CPU: 0.0% | Mem: 156.8 MB | Threads: 34
[2025-04-28 15:11:12] Weighted average for all_widths_AskReddit_lookup calculated and saved to file | CPU: 0.0% | Mem: 156.7 MB | Threads: 34
[2025-04-28 15:11:13] Depth distribution for AskReddit_threads saved to file. | CPU: 10.0% | Mem: 163.3 MB | Threads: 34
[2025-04-28 15:11:13] Depth distribution for filtered_AskReddit_threads saved to file. | CPU: 0.0% | Mem: 163.5 MB | Threads: 34
[2025-04-28 15:11:13] Thread lengths for AskReddit_threads cal

In [21]:
# Add up the row counts for all tables starting with "comments"
comments_tables = con.execute(
    "SELECT table_name FROM information_schema.tables WHERE table_name LIKE 'comments_%'"
).fetchdf()
total_comments = 0
for table in comments_tables["table_name"]:
    count = con.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    total_comments += count
print(f"Total number of comments across all tables: {total_comments}")

Total number of comments across all tables: 60


In [12]:
# Look at database_sample.db
for table in con.execute("SHOW TABLES").fetchdf()["name"]:
    print(f"Table: {table}")
    print(con.execute(f"SELECT * FROM {table}").fetchdf())
    print("\n")

Table: AskReddit_ids
       id
0  gbj767


Table: AskReddit_lookup
    posts comments_to_posts comments_to_comments_1 comments_to_comments_2  \
0  gbj767         [fp6o3g0]              [fp8jcvm]              [fp9l2d4]   

  comments_to_comments_3 comments_to_comments_4 comments_to_comments_5  \
0              [fp9m0mt]              [fpamy5s]              [fpaoglu]   

  comments_to_comments_6 comments_to_comments_7 comments_to_comments_8  \
0              [fpaq7zt]              [fpaqgwk]              [fpareob]   

  comments_to_comments_9 comments_to_comments_10  
0              [fpasr0k]               [fpatq96]  


Table: AskReddit_threads
    posts comments_to_posts comments_to_comments_1 comments_to_comments_2  \
0  gbj767           fp6o3g0                fp8jcvm                fp9l2d4   

  comments_to_comments_3 comments_to_comments_4 comments_to_comments_5  \
0                fp9m0mt                fpamy5s                fpaoglu   

  comments_to_comments_6 comments_to_comments_7

In [13]:
monitoring_active = False
monitor_thread.join()  # optional, if you want to ensure it has stopped before exiting
log_with_resources("Script finished")
con.commit()
con.close()

[2025-05-03 11:01:18] Script finished | CPU: 0.0% | Mem: 157.1 MB | Threads: 30
