Preprocesses the Pushshift comments data into a format usable by Community2Vec, removing deleted comments/users and a specified percentage of the most active users (naive bot removal).

In [0]:
# Root of the mounted bucket as addressed through Spark/DBFS paths, and the
# same location under the "/dbfs" prefix (used further down for the pandas
# CSV output).
spark_mnt_path = "/mnt/s3bucket/community2vec"
dbfs_mnt_path = f"/dbfs{spark_mnt_path}"

# Compressed Pushshift comments dump to preprocess. Swap in the commented-out
# line to run on the RC_2021-06 dump instead of the sample file.
pushshift_json = "/mnt/s3bucket/community2vec/raw_data/sample_data.bz2"
#pushshift_json = "/mnt/s3bucket/community2vec/raw_data/RC_2021-06.bz2"

# Last path component of the input, e.g. "sample_data.bz2"; it seeds the
# output folder names.
pushshift_filename = pushshift_json.rsplit("/", 1)[-1]

# Suffix appended to every output folder name.
# NOTE(review): looks like an MMDDYYYY run date — confirm.
stamp = "01282022"


In [0]:
# Show the contents of the raw_data folder on the mounted bucket so the input
# file can be checked before any processing starts.
raw_data_listing = dbutils.fs.ls("/mnt/s3bucket/community2vec/raw_data")
display(raw_data_listing)

path,name,size
dbfs:/mnt/s3bucket/community2vec/raw_data/sample_data.bz2,sample_data.bz2,1120604


In [0]:
import ihop.import_data as ihopid

# Stage 1: load the Pushshift comments and narrow them down.
# NOTE(review): the exact semantics of the ihop helpers are not visible in
# this notebook; judging by the names and the displayed outputs they
# presumably (a) read the compressed JSON into a Spark DataFrame, (b) count
# comments per subreddit for the top-N subreddits, (c) keep only comments from
# those subreddits, and (d) drop comments by deleted authors — confirm against
# ihop.import_data.
spark_df = ihopid.get_spark_dataframe(pushshift_json, spark, ihopid.COMMENTS)
top_n_df = ihopid.get_top_n_counts(spark_df)
top_n_comments_df = ihopid.filter_top_n(spark_df, top_n_df)
filtered_df = ihopid.remove_deleted_authors(top_n_comments_df)
display(spark_df)
display(top_n_df)
display(filtered_df)

# The subreddit counts do not depend on the exclusion percentage, so they are
# collected to pandas once and the same frame is written into every output
# folder.
pandas_top_n_df = top_n_df.toPandas()

# Input file name up to its first "." (e.g. "sample_data"); loop-invariant
# prefix of each output folder name.
dataset_name = pushshift_filename.split(".")[0]

# Try several different values for excluding percentage of top active users
for exclude_top_user_perc in [0.0, 0.02, 0.05, 0.10]:
    # round() instead of int(): binary floating point makes products such as
    # 0.29 * 100 evaluate to 28.999999999999996, which int() would truncate to
    # 28 and mislabel the output folder. For the values listed above the
    # resulting labels (0, 2, 5, 10) are the same either way.
    percent_label = round(exclude_top_user_perc * 100)
    out_dirname = "_".join([dataset_name, f"{percent_label}percentTopUsersExcluded", stamp])
    print("Output folder on S3:", out_dirname)

    # pandas writes through the local "/dbfs" path; Spark writes through the
    # mount path.
    top_n_out = f"{dbfs_mnt_path}/{out_dirname}/subreddit_counts.csv"
    print("Top N subreddits writing to:", top_n_out)
    context_corpus_out = f"{spark_mnt_path}/{out_dirname}/user_contexts"
    print("User contexts writing to:", context_corpus_out)

    context_word_df = ihopid.aggregate_for_vectorization(filtered_df, exclude_top_perc=exclude_top_user_perc)
    context_word_df.write.mode("overwrite").option("compression", "bzip2").csv(context_corpus_out)

    # NOTE(review): presumably this relies on the Spark write above having
    # already created the output folder, since pandas does not create parent
    # directories — keep the two writes in this order unless confirmed.
    pandas_top_n_df.to_csv(top_n_out, index=False)
    print()
    

id,parent_id,score,link_id,author,subreddit,body,created_utc
dnqik14,t3_73ieyz,3,t3_73ieyz,Dethcola,sandiego,A quarry,1506816000
dnqik15,t1_dnqiiv7,3,t3_73g740,PennyBotV2,RWBY,[Salutations! I'm not sure what you said.](http://imgur.com/9TtaInH),1506816000
dnqik16,t1_dnqc3lu,2,t3_73hlwn,Sir_Firebum,baseball,I got into baseball at about he same time Matt Cain started playing in the majors. Crazy to see him go. I teared up a bit too.,1506816000
dnqik17,t1_dnqdo99,18,t3_73gw9b,deanzynut,2007scape,FUCKING TORY,1506816000
dnqik18,t3_73i6z3,1,t3_73i6z3,OfullOstomacheO,mildlyinteresting,I see a water dragon,1506816000
dnqik19,t1_dnq4z9q,1,t3_73g65l,PlusOn3,Cubers,Wait. The Michigan what? Where is this? Is this like U of M club or a just state of Michigan?,1506816000
dnqik1a,t1_dnqijyp,2,t3_73hvr0,yeee_bot,teenagers,ye fam,1506816000
dnqik1b,t3_73dvyh,1,t3_73dvyh,grrrrreat,4chan4trump,143417804| > United States Anonymous (ID: LIAKFEVH) >>143412250 (OP) oldfag here 2016: Hillary 2012: Obama 2008: Obama 2004: Kerry 2000: Buchanan 1996: Dole 1992: Bush 1988: Bush 1984: Reagan,1506816000
dnqik1c,t3_73hgz4,2,t3_73hgz4,psych4191,CFB,That is some chicken salad outta chicken shit running.,1506816000
dnqik1d,t1_dnqick7,1,t3_73feje,fishboy2000,rugbyunion,Does he even know the rules?,1506816000


subreddit,count
AskReddit,486
CFB,403
CrazyIdeas,261
news,158
ConciseIAmA,147
4chan4trump,136
politics,117
RocketLeagueExchange,96
The_Donald,90
nba,90


subreddit,id,parent_id,score,link_id,author,body,created_utc,count
sandiego,dnqik14,t3_73ieyz,3,t3_73ieyz,Dethcola,A quarry,1506816000,1
RWBY,dnqik15,t1_dnqiiv7,3,t3_73g740,PennyBotV2,[Salutations! I'm not sure what you said.](http://imgur.com/9TtaInH),1506816000,6
baseball,dnqik16,t1_dnqc3lu,2,t3_73hlwn,Sir_Firebum,I got into baseball at about he same time Matt Cain started playing in the majors. Crazy to see him go. I teared up a bit too.,1506816000,47
2007scape,dnqik17,t1_dnqdo99,18,t3_73gw9b,deanzynut,FUCKING TORY,1506816000,22
mildlyinteresting,dnqik18,t3_73i6z3,1,t3_73i6z3,OfullOstomacheO,I see a water dragon,1506816000,33
Cubers,dnqik19,t1_dnq4z9q,1,t3_73g65l,PlusOn3,Wait. The Michigan what? Where is this? Is this like U of M club or a just state of Michigan?,1506816000,3
teenagers,dnqik1a,t1_dnqijyp,2,t3_73hvr0,yeee_bot,ye fam,1506816000,69
4chan4trump,dnqik1b,t3_73dvyh,1,t3_73dvyh,grrrrreat,143417804| > United States Anonymous (ID: LIAKFEVH) >>143412250 (OP) oldfag here 2016: Hillary 2012: Obama 2008: Obama 2004: Kerry 2000: Buchanan 1996: Dole 1992: Bush 1988: Bush 1984: Reagan,1506816000,136
CFB,dnqik1c,t3_73hgz4,2,t3_73hgz4,psych4191,That is some chicken salad outta chicken shit running.,1506816000,403
rugbyunion,dnqik1d,t1_dnqick7,1,t3_73feje,fishboy2000,Does he even know the rules?,1506816000,44
