In [1]:
import duckdb
from create_database import (
    add_initial_tables,
    add_comments_to_comments_tables,
    create_lookup_table,
    create_subreddit_tables,
    create_threads_table,
)
from stats import (
    create_row_counts_table,
    create_filtered_row_counts,
    analyze_thread_score_distribution,
    get_subreddit_distribution,
    table_stats,
    calculate_weighted_average,
    get_thread_lengths,
)
from filter_database import make_threads_unique, filter_threads

# Work against the full backup database only. The sample database is opened
# by a later cell, after the sample CSVs have been regenerated, so it must
# not be opened (and implicitly created) here.
con = duckdb.connect("../database_backup.db")
try:
    # print all tables in the database
    print(con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())
    create_threads_table(con, "unfiltered_threads")
finally:
    # Later cells open their own connections, so release this one now
    # instead of leaving it dangling when `con` is rebound.
    con.close()

In [2]:
import duckdb
import pandas as pd
import os
from get_samples import get_random_thread_details


def _append_rows_to_csv(rows, csv_path):
    """Append ``rows`` to the CSV at ``csv_path``, creating the file if needed."""
    if os.path.exists(csv_path):
        # Re-read and rewrite the whole file so a single header row covers
        # both the existing and the new rows.
        rows = pd.concat([pd.read_csv(csv_path), rows], ignore_index=True)
    rows.to_csv(csv_path, index=False)


def save_thread_to_csv(thread_row, db_path, output_dir="../data"):
    """
    Save the database rows referenced by one thread to the sample CSV files.

    Every column of ``thread_row`` (except a few known non-table columns) is
    treated as the name of a table, and the value in its first row as the
    ``id`` to look up in that table. The row found in ``posts`` is appended
    to ``posts_samples.csv``; rows found in any other table are appended to
    ``comments_samples.csv``.

    Args:
        thread_row: A DataFrame returned from get_random_thread_details
            (anything else is wrapped into a one-row DataFrame). Only the
            first row is used; an empty DataFrame is a no-op.
        db_path: Path to the DuckDB database
        output_dir: Directory holding the two CSV files. Defaults to
            "../data", the location used throughout this notebook.
    """
    # Convert thread_row to DataFrame if it isn't already
    if not isinstance(thread_row, pd.DataFrame):
        thread_row = pd.DataFrame([thread_row])

    # Nothing to look up; also avoids an IndexError on ``.iloc[0]`` below.
    if thread_row.empty:
        return

    # Columns that are not table names and must not be looked up.
    non_table_columns = {"id", "created_utc", "score", "num_comments"}

    posts_data = None
    comments_data = []

    con = duckdb.connect(db_path)
    try:
        for col in thread_row.columns:
            id_value = thread_row[col].iloc[0]  # Only the first row is used

            # Skip if the ID is null or if it's not an ID column
            if pd.isna(id_value) or col in non_table_columns:
                continue

            # Best effort per table: report the failure and keep going.
            try:
                # Table names cannot be bound as parameters, so quote the
                # identifier (doubling embedded quotes); the id itself is
                # passed as a bound parameter rather than spliced into SQL.
                quoted_table = '"' + str(col).replace('"', '""') + '"'
                row_data = con.execute(
                    f"SELECT * FROM {quoted_table} WHERE id = ?",
                    [str(id_value)],
                ).fetchdf()

                if not row_data.empty:
                    if col == "posts":
                        posts_data = row_data
                    else:
                        comments_data.append(row_data)
            except Exception as e:
                print(f"Error processing {col} with ID {id_value}: {e}")
    finally:
        con.close()

    # Save posts data if we have any
    if posts_data is not None:
        _append_rows_to_csv(posts_data, os.path.join(output_dir, "posts_samples.csv"))

    # Save comments data if we have any
    if comments_data:
        _append_rows_to_csv(
            pd.concat(comments_data, ignore_index=True),
            os.path.join(output_dir, "comments_samples.csv"),
        )

In [3]:
# Start from a clean slate: remove the previous sample database, sample CSVs
# and saved stats so that this run does not append to stale data.
for stale_path in (
    "../data/database_sample.db",
    "../data/posts_samples.csv",
    "../data/comments_samples.csv",
    "../data/saved_stats.json",
):
    if os.path.exists(stale_path):
        os.remove(stale_path)

# (threads table to sample from, number of random threads to take)
sampling_plan = (
    ("unfiltered_threads", 10),
    ("threads", 2),
    ("filtered_threads", 2),
)

con = duckdb.connect("../database_backup.db")
try:
    for threads_table, sample_count in sampling_plan:
        for _ in range(sample_count):
            thread = get_random_thread_details(threads_table, con)
            save_thread_to_csv(thread, "../database_backup.db")
finally:
    # Close the connection even if sampling or saving raises part-way.
    con.close()

Random thread selected:
posts: gcyn92
comments_to_posts: fpe6ssu


Full information for posts (ID: gcyn92):
created_utc: 1588541380
id: gcyn92
name: t3_gcyn92
title: Fortnite is not running (PC)? I get this error, how do I fix this?
selftext: NULL
subreddit: FortNiteBR
score: 2
upvote_ratio: 1.0
num_comments: 10
archived: True
author: fariamim322
distinguished: NULL
media: NULL

--------------------------------------------------------------------------------

Full information for comments_to_posts (ID: fpe6ssu):
created_utc: 1588541500
id: fpe6ssu
body:
    Go to your file browser  On the left, at "This PC", right click and go to properties  You'll get
    some info about your PC  What does "System Type" say?  If it says anything other that 64 bit...
    Your PC doesn't support Fortnite  Fortnite requires a 64-bit version of Windows. If you don't
    have that, your PC cannot run it. You'll need a new one

score: 1
author: NomNomNomNation
parent_id: gcyn92

----------------------------

In [2]:
# Rebind `con` to the sample database; every cell below passes this
# connection to the helper functions.
con = duckdb.connect("../data/database_sample.db")

In [3]:
# Populate the sample database from the two sample CSVs written earlier in
# this notebook.
add_initial_tables(con, "../data/posts_samples.csv", "../data/comments_samples.csv")
# Per the recorded output, this creates one comments_to_comments_<level>
# table per nesting level until a level yields no comments.
add_comments_to_comments_tables(con, "../data/comments_samples.csv")

Found 7 comments for level 2
Created comments_to_comments_2 table in database
Found 5 comments for level 3
Created comments_to_comments_3 table in database
Found 5 comments for level 4
Created comments_to_comments_4 table in database
Found 5 comments for level 5
Created comments_to_comments_5 table in database
Found 3 comments for level 6
Created comments_to_comments_6 table in database
Found 3 comments for level 7
Created comments_to_comments_7 table in database
Found 3 comments for level 8
Created comments_to_comments_8 table in database
Found 3 comments for level 9
Created comments_to_comments_9 table in database
Found 2 comments for level 10
Created comments_to_comments_10 table in database
Found 0 comments for level 11
No more nested comments found after level 10


In [4]:
create_row_counts_table(con)

# Fetch the row counts and order them largest-first in a single chain.
df = (
    con.execute("SELECT * FROM row_counts")
    .fetchdf()
    .sort_values(by="row_count", ascending=False)
)

# Plain-text rendering without the index column
print(df.to_string(index=False))

('comments_to_comments_1',)
             table_name  row_count
                  posts         14
      comments_to_posts         14
 comments_to_comments_1         10
 comments_to_comments_2          7
 comments_to_comments_4          5
 comments_to_comments_3          5
 comments_to_comments_5          5
 comments_to_comments_7          3
 comments_to_comments_9          3
 comments_to_comments_6          3
 comments_to_comments_8          3
comments_to_comments_10          2


In [5]:
# Build `lookup_table`; the recorded output lists the hierarchical tables
# (posts, comments_to_posts, comments_to_comments_N) it was built from.
create_lookup_table(con)

Valid hierarchical tables: ['posts', 'comments_to_posts', 'comments_to_comments_1', 'comments_to_comments_2', 'comments_to_comments_3', 'comments_to_comments_4', 'comments_to_comments_5', 'comments_to_comments_6', 'comments_to_comments_7', 'comments_to_comments_8', 'comments_to_comments_9', 'comments_to_comments_10']
lookup_table created successfully.


In [6]:
# Compute statistics for lookup_table, then a weighted average for each of
# the three named statistics.
# NOTE(review): presumably table_stats saves the "thread_lengths_…",
# "thread_widths_…" and "all_widths_…" entries that
# calculate_weighted_average reads by name — confirm in stats.py.
table_stats("lookup_table", con)
calculate_weighted_average("thread_lengths_lookup_table")
calculate_weighted_average("thread_widths_lookup_table")
calculate_weighted_average("all_widths_lookup_table")

In [7]:
# Create the threads table, then de-duplicate into "threads".
# NOTE(review): the first cell passes "unfiltered_threads" to
# create_threads_table and the sampling cell reads "unfiltered_threads";
# confirm that "all_threads" is the intended name here.
create_threads_table(con, "all_threads")
make_threads_unique(con, "threads")

In [8]:
# Apply the thread filter to "threads".
# NOTE(review): presumably this produces the "filtered_threads" table that
# the following cells read — confirm in filter_database.py.
filter_threads(con, "threads")

In [9]:
# Row counts and thread-length statistics for the filtered threads table.
create_filtered_row_counts("filtered_threads", con)
get_thread_lengths("filtered_threads", con)

In [10]:
# Run the score-distribution analysis on both the unfiltered and the
# filtered threads tables so the two can be compared.
analyze_thread_score_distribution("threads", con)
analyze_thread_score_distribution("filtered_threads", con)

In [11]:
# Run the subreddit-distribution analysis on both the unfiltered and the
# filtered threads tables so the two can be compared.
get_subreddit_distribution("threads", con)
get_subreddit_distribution("filtered_threads", con)

In [12]:
subreddits = ["AskReddit", "memes", "politics"]
for subreddit in subreddits:
    # Build the per-subreddit tables before computing any statistics on them.
    create_subreddit_tables(con, subreddit)

    # Lookup-table statistics followed by their three weighted averages.
    lookup_name = f"{subreddit}_lookup"
    table_stats(lookup_name, con)
    for stat_prefix in ("thread_lengths", "thread_widths", "all_widths"):
        calculate_weighted_average(f"{stat_prefix}_{lookup_name}")

    # Statistics for the unfiltered, then the filtered, threads table.
    for threads_table in (f"{subreddit}_threads", f"filtered_{subreddit}_threads"):
        table_stats(threads_table, con)

No data found for thread_lengths_politics_lookup
No data found for thread_widths_politics_lookup
No data found for all_widths_politics_lookup


In [13]:
# Look at database_sample.db: print the full contents of every table.
for table in con.execute("SHOW TABLES").fetchdf()["name"]:
    print(f"Table: {table}")
    # Quote the identifier (doubling embedded quotes) so that table names
    # which are not plain identifiers, e.g. ones derived from a subreddit
    # name that starts with a digit, can still be selected from.
    quoted_table = '"' + table.replace('"', '""') + '"'
    print(con.execute(f"SELECT * FROM {quoted_table}").fetchdf())
    print("\n")

Table: AskReddit_ids
       id
0  gbj767


Table: AskReddit_lookup
    posts comments_to_posts comments_to_comments_1 comments_to_comments_2  \
0  gbj767         [fp6o3g0]              [fp8jcvm]              [fp9l2d4]   

  comments_to_comments_3 comments_to_comments_4 comments_to_comments_5  \
0              [fp9m0mt]              [fpamy5s]              [fpaoglu]   

  comments_to_comments_6 comments_to_comments_7 comments_to_comments_8  \
0              [fpaq7zt]              [fpaqgwk]              [fpareob]   

  comments_to_comments_9 comments_to_comments_10  
0              [fpasr0k]               [fpatq96]  


Table: AskReddit_threads
    posts comments_to_posts comments_to_comments_1 comments_to_comments_2  \
0  gbj767           fp6o3g0                fp8jcvm                fp9l2d4   

  comments_to_comments_3 comments_to_comments_4 comments_to_comments_5  \
0                fp9m0mt                fpamy5s                fpaoglu   

  comments_to_comments_6 comments_to_comments_7

In [14]:
# Commit, then release the connection to the sample database.
con.commit()
con.close()