In [4]:
import duckdb
from python_files.create_database import (
    add_posts_table,
    add_comments_working_table,
    add_initial_comments_tables,
    add_comments_to_comments_tables,
    create_lookup_table,
    create_subreddit_tables,
    create_threads_table,
)
from python_files.stats import (
    create_row_counts_table,
    get_depth_distribution,
    get_thread_score_distribution,
    get_subreddit_distribution,
    table_stats,
    calculate_weighted_average,
    get_thread_lengths,
)
from python_files.filter_database import make_threads_unique, filter_threads

# Open (creating it if absent) the DuckDB database file; every cell below
# receives this connection as `con`.
# NOTE(review): the path is relative to the kernel's working directory --
# confirm the notebook is launched from the expected folder.
con = duckdb.connect("../data_backup/database_test.sdb")

In [2]:
# Load the posts CSV into the database; the helper reports
# "posts table created successfully." when it finishes.
add_posts_table(con, "../posts.csv")

posts table created successfully.


In [2]:
# Load the comments CSV into the database. The helper logs one line per
# inserted batch of 1,000,000 rows together with a running total.
add_comments_working_table(con, "../comments.csv")

Batch 1: Inserted 1000000 rows (Total: 1000000)
Batch 2: Inserted 1000000 rows (Total: 2000000)
Batch 3: Inserted 1000000 rows (Total: 3000000)
Batch 4: Inserted 1000000 rows (Total: 4000000)
Batch 5: Inserted 1000000 rows (Total: 5000000)
Batch 6: Inserted 1000000 rows (Total: 6000000)
Batch 7: Inserted 1000000 rows (Total: 7000000)
Batch 8: Inserted 1000000 rows (Total: 8000000)
Batch 9: Inserted 1000000 rows (Total: 9000000)
Batch 10: Inserted 1000000 rows (Total: 10000000)
Batch 11: Inserted 1000000 rows (Total: 11000000)
Batch 12: Inserted 1000000 rows (Total: 12000000)
Batch 13: Inserted 1000000 rows (Total: 13000000)
Batch 14: Inserted 1000000 rows (Total: 14000000)
Batch 15: Inserted 1000000 rows (Total: 15000000)
Batch 16: Inserted 1000000 rows (Total: 16000000)
Batch 17: Inserted 1000000 rows (Total: 17000000)
Batch 18: Inserted 1000000 rows (Total: 18000000)
Batch 19: Inserted 1000000 rows (Total: 19000000)
Batch 20: Inserted 1000000 rows (Total: 20000000)
Batch 21: Inserted

In [2]:
# Split out the first two levels of the hierarchy: per the helper's log,
# matching comments are inserted into comments_to_posts and then into
# comments_to_comments_1, and the moved rows are deleted afterwards.
# NOTE(review): because rows are deleted after being moved, this step is
# presumably not safe to re-run without reloading the comments -- confirm.
add_initial_comments_tables(con)

Inserted matching comments into comments_to_posts table successfully.
Successfully moved and deleted matching comments
Inserted matching comments into comments_to_comments_1 table successfully.
Successfully moved and deleted matching comments


In [3]:
# Build the deeper reply levels one at a time, starting at level 2: for each
# level the helper logs "Found N comments for level K", then creates the
# level-K table and deletes the processed rows.
# NOTE(review): presumably stops once a level yields no comments -- confirm
# in python_files.create_database.
add_comments_to_comments_tables(con)

Found 20261089 comments for level 2
Created level 2 table and deleted processed rows
Found 12137131 comments for level 3
Created level 3 table and deleted processed rows
Found 7698504 comments for level 4
Created level 4 table and deleted processed rows
Found 4990124 comments for level 5
Created level 5 table and deleted processed rows
Found 3305422 comments for level 6
Created level 6 table and deleted processed rows
Found 2221573 comments for level 7
Created level 7 table and deleted processed rows
Found 1514872 comments for level 8
Created level 8 table and deleted processed rows
Found 1053327 comments for level 9
Created level 9 table and deleted processed rows
Found 761069 comments for level 10
Created level 10 table and deleted processed rows
Found 506245 comments for level 11
Created level 11 table and deleted processed rows
Found 358376 comments for level 12
Created level 12 table and deleted processed rows
Found 265046 comments for level 13
Created level 13 table and deleted p

In [4]:
# Refresh the per-table row counts (presumably this call populates the
# `row_counts` table queried below), then list them largest-first.
create_row_counts_table(con)

df = (
    con.execute("SELECT * FROM row_counts")
    .fetchdf()
    .sort_values(by="row_count", ascending=False)
)
# Plain-text rendering without the pandas index column.
print(df.to_string(index=False))

('comments_to_comments_1',)
             table_name  row_count
                  posts         14
      comments_to_posts         14
 comments_to_comments_1         10
 comments_to_comments_2          7
 comments_to_comments_4          5
 comments_to_comments_3          5
 comments_to_comments_5          5
 comments_to_comments_7          3
 comments_to_comments_9          3
 comments_to_comments_6          3
 comments_to_comments_8          3
comments_to_comments_10          2


In [5]:
# Build lookup_table. The helper first prints the tables it treats as valid
# hierarchical levels ("Valid hierarchical tables: [...]") and then confirms
# that lookup_table was created.
create_lookup_table(con)

Valid hierarchical tables: ['posts', 'comments_to_posts', 'comments_to_comments_1', 'comments_to_comments_2', 'comments_to_comments_3', 'comments_to_comments_4', 'comments_to_comments_5', 'comments_to_comments_6', 'comments_to_comments_7', 'comments_to_comments_8', 'comments_to_comments_9', 'comments_to_comments_10']
lookup_table created successfully.


In [6]:
# Compute statistics for lookup_table, then report a weighted average for
# each of the three derived distributions (thread lengths, thread widths,
# all widths).
# NOTE(review): the thread_lengths_/thread_widths_/all_widths_ names are
# presumably produced by table_stats for the table it is given -- confirm in
# python_files.stats.
# NOTE(review): calculate_weighted_average is called without `con`, unlike
# the other stats helpers; confirm where it reads its data from.
table_stats("lookup_table", con)
calculate_weighted_average("thread_lengths_lookup_table")
calculate_weighted_average("thread_widths_lookup_table")
calculate_weighted_average("all_widths_lookup_table")

In [7]:
# Create the "all_threads" table, then run the uniqueness pass for "threads".
# NOTE(review): the two calls are given different table names; presumably
# make_threads_unique reads all_threads and writes threads -- confirm against
# python_files.filter_database, otherwise one of these names is a mismatch.
create_threads_table(con, "all_threads")
make_threads_unique(con, "threads")

In [8]:
# Apply the thread filter to "threads".
# NOTE(review): the following cells query a "filtered_threads" table, which
# is presumably what this call writes -- confirm.
filter_threads(con, "threads")

In [9]:
# Inspect the filtered threads: depth distribution first, then thread lengths.
get_depth_distribution("filtered_threads", con)
get_thread_lengths("filtered_threads", con)

In [10]:
# Score distribution before and after filtering, so the two can be compared.
get_thread_score_distribution("threads", con)
get_thread_score_distribution("filtered_threads", con)

In [11]:
# Subreddit distribution before and after filtering, so the two can be compared.
get_subreddit_distribution("threads", con)
get_subreddit_distribution("filtered_threads", con)

In [12]:
# Repeat the lookup/threads statistics for a few individual subreddits.
subreddits = ["AskReddit", "memes", "politics"]
for subreddit in subreddits:
    # Build this subreddit's tables, then summarise its lookup table.
    create_subreddit_tables(con, subreddit)
    table_stats(f"{subreddit}_lookup", con)

    # Weighted averages for the three derived distributions, in the same
    # order as for the global lookup table.
    for metric in ("thread_lengths", "thread_widths", "all_widths"):
        calculate_weighted_average(f"{metric}_{subreddit}_lookup")

    # Statistics for the subreddit's unfiltered and filtered thread tables.
    for threads_table in (f"{subreddit}_threads", f"filtered_{subreddit}_threads"):
        table_stats(threads_table, con)

No data found for thread_lengths_politics_lookup
No data found for thread_widths_politics_lookup
No data found for all_widths_politics_lookup


In [7]:
# Print the row count of every table in the database `con` is connected to.
for table in con.execute("SHOW TABLES").fetchdf()["name"]:
    print(f"Table: {table}")
    # Quote the identifier (doubling any embedded double quotes) so that
    # table names which are not bare identifiers -- e.g. ones starting with
    # a digit or colliding with a keyword -- still parse.
    quoted_table = '"' + table.replace('"', '""') + '"'
    print(con.execute(f"SELECT COUNT(*) FROM {quoted_table}").fetchdf())
    print("\n")

Table: comments_to_comments_1
   count_star()
0      36726814


Table: comments_to_comments_10
   count_star()
0        761069


Table: comments_to_comments_100
   count_star()
0           640


Table: comments_to_comments_101
   count_star()
0           626


Table: comments_to_comments_102
   count_star()
0           607


Table: comments_to_comments_103
   count_star()
0           601


Table: comments_to_comments_104
   count_star()
0           592


Table: comments_to_comments_105
   count_star()
0           591


Table: comments_to_comments_106
   count_star()
0           569


Table: comments_to_comments_107
   count_star()
0           564


Table: comments_to_comments_108
   count_star()
0           562


Table: comments_to_comments_109
   count_star()
0           550


Table: comments_to_comments_11
   count_star()
0        506245


Table: comments_to_comments_110
   count_star()
0           532


Table: comments_to_comments_111
   count_star()
0           523


Table: comment

In [8]:
# Commit any pending transaction, then close the connection. Once closed,
# `con` cannot be used again: reconnect before re-running any earlier cell.
con.commit()
con.close()