# Bag of Words pre-processing
This notebook pre-processes Reddit submission and comment JSON data into a format that is easier to use for bag-of-words topic and clustering models.

In [0]:
# Show what is available under the raw data mount: the root folder first,
# then the comments and submissions folders.
raw_data_listing_paths = (
    "/mnt/s3bucket/reddit/raw_data",
    "/mnt/s3bucket/reddit/raw_data/comments",
    "/mnt/s3bucket/reddit/raw_data/submissions",
)
for listing_path in raw_data_listing_paths:
    display(dbutils.fs.ls(listing_path))

path,name,size
dbfs:/mnt/s3bucket/reddit/raw_data/comments/,comments/,0
dbfs:/mnt/s3bucket/reddit/raw_data/submissions/,submissions/,0


path,name,size
dbfs:/mnt/s3bucket/reddit/raw_data/comments/RC_2021-05.bz2,RC_2021-05.bz2,30956683935
dbfs:/mnt/s3bucket/reddit/raw_data/comments/RC_2021-06.bz2,RC_2021-06.bz2,29759838678
dbfs:/mnt/s3bucket/reddit/raw_data/comments/sample_data.bz2,sample_data.bz2,1120604


path,name,size
dbfs:/mnt/s3bucket/reddit/raw_data/submissions/RS_2021-05.bz2,RS_2021-05.bz2,15083739147
dbfs:/mnt/s3bucket/reddit/raw_data/submissions/RS_2021-06.bz2,RS_2021-06.bz2,14656369883


In [0]:
import ihop.import_data as ihopid
# Glob patterns for the raw Pushshift dumps. The RS*/RC* prefixes select the
# monthly dump files only, so other files in the same folders (for example
# comments/sample_data.bz2) are not picked up.
pushshift_submissions = "/mnt/s3bucket/reddit/raw_data/submissions/RS*.bz2"
pushshift_comments = "/mnt/s3bucket/reddit/raw_data/comments/RC*.bz2"

# Suffix appended to every output name.
# NOTE(review): looks like a run date (02-10-2022) -- confirm the intended format.
stamp = "02102022"
# Folder the parquet outputs are written under.
output_root = "/mnt/s3bucket/reddit/bagOfWords"
# Shared prefix of the output file names. This is a plain string: it has no
# placeholders, so it does not need to be an f-string.
output_basename = "2021-05_to_2021-06_joined_submissions_comments"

In [0]:
# Read every comment dump matched by the glob as JSON, using the comment
# schema provided by ihop.import_data. FAILFAST makes the read raise on a
# malformed record.
comments_df = (
    spark.read
    .option("mode", "FAILFAST")
    .option("encoding", "UTF-8")
    .schema(ihopid.SCHEMAS[ihopid.COMMENTS])
    .json(pushshift_comments)
)
display(comments_df)

In [0]:
# Columns kept from each submission record, written as a Spark DDL schema string.
submissions_schema = "author STRING, created_utc STRING, id STRING, score INTEGER, selftext STRING, title STRING, url STRING, subreddit STRING"

# Read every submission dump matched by the glob as JSON. FAILFAST makes the
# read raise on a malformed record.
submissions_df = (
    spark.read
    .option("mode", "FAILFAST")
    .option("encoding", "UTF-8")
    .schema(submissions_schema)
    .json(pushshift_submissions)
)
display(submissions_df)

In [0]:
# Write one joined submissions + comments parquet dataset per exclusion level.
# Each level is the fraction of top users to drop from the comments; 0.0 means
# the top-user filter is skipped entirely.
drop_user_percentages = [0.0, 0.02, 0.05, 0.10]

# Everything below is independent of the exclusion level, so it is built once
# here instead of being rebuilt on every loop iteration.
top_n_df = ihopid.get_top_n_counts(comments_df)

base_filtered_comments = ihopid.remove_deleted_authors(
    ihopid.remove_rows_with_deleted_text(
        ihopid.filter_top_n(comments_df, top_n_df),
        ihopid.COMMENTS,
    )
)

filtered_submissions = ihopid.remove_deleted_authors(
    ihopid.remove_rows_with_deleted_text(
        ihopid.filter_top_n(submissions_df, top_n_df),
        ihopid.SUBMISSIONS,
    )
)

# Optional diagnostics, left disabled:
#print("Submissions stats after filtering")
#ihopid.print_comparison_stats(submissions_df, filtered_submissions, top_n_df)

filtered_submissions = ihopid.prefix_id_column(filtered_submissions)

for drop_user_perc in drop_user_percentages:
    # round() rather than int(): int() truncates, and fraction * 100 is not
    # always exact in floating point (0.29 * 100 is 28.999999999999996, which
    # int() would label as 28). The current levels give the same names as before.
    excluded_percent = round(drop_user_perc * 100)
    output_parquet = f"{output_root}/{output_basename}_{excluded_percent}percentTopUsersExcludedFromComments_{stamp}.parquet"
    print("Output will be written to:", output_parquet)

    # Start from the shared, already-filtered comments and only apply the
    # top-user filter when a non-zero fraction is requested.
    filtered_comments = base_filtered_comments
    if drop_user_perc > 0.0:
        filtered_comments = ihopid.filter_out_top_users(filtered_comments, exclude_top_perc=drop_user_perc)

    joined_df = ihopid.join_submissions_and_comments(filtered_submissions, filtered_comments)
    #display(joined_df)
    print("Writing output to", output_parquet)
    # "overwrite" replaces any existing output at the same path.
    joined_df.write.mode("overwrite").parquet(output_parquet)
    
