In [30]:
import os
import jsonlines
import pandas as pd
curio_data_path = "data/Curio/"


def _load_jsonl(filename):
    """Read every record of a JSON-lines file under ``curio_data_path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the records are read, and the records are materialised into a list so the
    result can be iterated more than once (a bare ``jsonlines.Reader`` is
    exhausted after a single pass, which makes re-running a later cell
    silently produce empty results).

    Parameters
    ----------
    filename : str
        File name relative to ``curio_data_path``.

    Returns
    -------
    list
        One decoded JSON value per line of the file.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(curio_data_path + filename, encoding="utf-8") as handle:
        return list(jsonlines.Reader(handle))


users = _load_jsonl("users.json")
posts = _load_jsonl("posts.json")
groups = _load_jsonl("groups.json")
reactions = _load_jsonl("reactions.json")
echopostproposals = _load_jsonl("echopostproposals.json")

In [None]:
# Load the two Reddit reference tables. Both are tab-separated text files,
# which is the default delimiter of ``pd.read_table``.
reddit_votes = pd.read_table("data/reddit/44_million_votes.txt")
reddit_subs = pd.read_table("data/reddit/submission_info.txt")

In [14]:
# Preview the first rows of the submission table to see which columns it has.
reddit_subs.head()

Unnamed: 0,SUBMISSION_ID,SUBREDDIT,TITLE,AUTHOR,#_COMMENTS,NSFW,SCORE,UPVOTED_%,LINK
0,t3_e7gje4,houston,downtown houston at sunset,adamisraelvaladez,32,,952,0.97,/r/houston/comments/e7gje4/downtown_houston_at...
1,t3_d8vv6h,ReagentTesting,Mescaline Liebermann inconsistency,drugsmom,0,,2,1.0,/r/ReagentTesting/comments/d8vv6h/mescaline_li...
2,t3_6gdv4s,legaladvice,What does this mean?,Alreadyownanacxount,3,,0,0.33,/r/legaladvice/comments/6gdv4s/what_does_this_...
3,t3_bsh6pw,analog,untitled | nikon f100 | ilford hp5,24rocketman,43,,1666,0.99,/r/analog/comments/bsh6pw/untitled_nikon_f100_...
4,t3_1xuutf,PS4,Unlimited ps+ trials,noahtessier,1,,0,0.18,/r/PS4/comments/1xuutf/unlimited_ps_trials/


In [21]:
# Preview the last rows of the vote table to see which columns it has.
# NOTE(review): the rendered output includes a USERNAME column and NSFW
# subreddit names — consider clearing it before sharing the notebook.
reddit_votes.tail()

Unnamed: 0,SUBMISSION_ID,SUBREDDIT,CREATED_TIME,USERNAME,VOTE
44863242,t3_gxn5ln,r/nsfw_gifs,1591432000.0,PDSM0811,upvote
44863243,t3_gxnai3,r/BreedingMaterial,1591433000.0,PDSM0811,upvote
44863244,t3_gxnf8c,r/rule34,1591433000.0,PDSM0811,upvote
44863245,t3_gxni2s,r/hentai,1591434000.0,PDSM0811,upvote
44863246,t3_gxnn09,r/cumtributes_private,1591435000.0,PDSM0811,upvote


In [31]:
# Lookup tables keyed by the "$oid" string of each record's "_id".
group_id_members_map, group_id_name_map = {}, {}
for group in groups:
    gid = group["_id"]["$oid"]
    # group id -> group name
    group_id_name_map[gid] = group["name"]
    # group id -> list of member ids
    group_id_members_map[gid] = [member["$oid"] for member in group["members"]]

# user id -> display name
user_id_name_map = {}
for user in users:
    uid = user["_id"]["$oid"]
    user_id_name_map[uid] = user["displayName"]

In [39]:
from collections import defaultdict

# Per-post metadata keyed by post id. The column names mirror the Reddit
# submission table so the result can be exported in the same layout.
# A defaultdict is used because a reply may be seen before its parent: the
# parent's entry is then created early, holding only its comment counter.
posts_info_map = defaultdict(dict)
for post in posts:
    group_id = post["group"]["$oid"]
    group_name = group_id_name_map[group_id]
    # Skip posts that belong to groups whose name marks them as test groups.
    if "TEST" in group_name or "test" in group_name:
        continue

    post_id = post["_id"]["$oid"]
    post_info = posts_info_map[post_id]
    post_info["SUBMISSION_ID"] = post_id
    post_info["SUBREDDIT"] = group_name
    post_info["group_id"] = group_id
    post_info["TITLE"] = post["contents"]
    post_info["AUTHOR"] = user_id_name_map[post["author"]["$oid"]]
    post_info["NSFW"] = None
    # NOTE(review): the Reddit table spells this column "LINK"; confirm
    # whether the difference in case is intended.
    post_info["Link"] = ""

    # Convert the stored date to a POSIX timestamp (seconds).
    # NOTE(review): this reads "updatedAt", and the value is later exported
    # as CREATED_TIME — confirm this is the intended field.
    post_info["timestamp"] = pd.to_datetime(post["updatedAt"]["$date"]).timestamp()

    # Every post reports a comment count, 0 when nothing replied to it.
    # setdefault keeps a count already accumulated by replies seen earlier.
    post_info.setdefault("#_COMMENTS", 0)

    if "threadParent" in post:
        # This post is a reply: bump the parent's comment counter.
        parent_info = posts_info_map[post["threadParent"]["$oid"]]
        parent_info["#_COMMENTS"] = parent_info.get("#_COMMENTS", 0) + 1

# Parents that were never processed themselves (skipped test group, or absent
# from the input) exist in the map as counter-only placeholders; leave them
# out so the table has no rows without a SUBMISSION_ID.
posts_df = pd.DataFrame.from_records(
    [info for info in posts_info_map.values() if "SUBMISSION_ID" in info]
)

In [45]:
# Seed the vote table: for every post, each member of the post's group that
# has a known display name starts out with a "downvote" record.
# Layout: post id -> member id -> vote record.
posts_users_votes = defaultdict(dict)
for post_id, post_info in posts_info_map.items():
    # Entries lacking SUBMISSION_ID only carry a comment counter; ignore them.
    if "SUBMISSION_ID" not in post_info:
        continue
    assert post_id == post_info["SUBMISSION_ID"]

    created_at = post_info["timestamp"]
    gid = post_info["group_id"]
    subreddit = post_info["SUBREDDIT"]
    member_ids = group_id_members_map[gid]

    posts_users_votes[post_id] = {
        uid: {
            "SUBMISSION_ID": post_id,
            "SUBREDDIT": subreddit,
            "CREATED_TIME": created_at,
            "USERNAME": user_id_name_map[uid],
            "VOTE": "downvote",
        }
        for uid in member_ids
        if uid in user_id_name_map
    }

In [50]:
def _mark_upvote(post_id, user_id):
    """Record that ``user_id`` upvoted ``post_id`` in ``posts_users_votes``.

    Posts that are not in the vote table are ignored. When the user has no
    vote record for the post yet (for example, they are not a member of the
    post's group), a fresh record is built from the post's own metadata. The
    earlier code reused the first member's record object in that case, which
    reported the wrong USERNAME, never set the vote to "upvote", shared later
    mutations between the two users, and raised IndexError when the post had
    no votes at all.

    Users without an entry in ``user_id_name_map`` are skipped, matching how
    the default votes are seeded.
    """
    users_votes = posts_users_votes.get(post_id)
    if users_votes is None:
        return

    vote = users_votes.get(user_id)
    if vote is None:
        if user_id not in user_id_name_map:
            return
        post_info = posts_info_map[post_id]
        vote = {
            "SUBMISSION_ID": post_id,
            "SUBREDDIT": post_info["SUBREDDIT"],
            "CREATED_TIME": post_info["timestamp"],
            "USERNAME": user_id_name_map[user_id],
            "VOTE": "upvote",
        }
        users_votes[user_id] = vote
    vote["VOTE"] = "upvote"


# A reaction on a post counts as an upvote by the reacting user.
for reaction in reactions:
    _mark_upvote(reaction["post"]["$oid"], reaction["author"]["$oid"])

# An echo proposal counts as an upvote on the echoed post by the echoing user.
for echopostproposal in echopostproposals:
    _mark_upvote(
        echopostproposal["echoParent"]["$oid"],
        echopostproposal["echoAuthor"]["$oid"],
    )

# Flatten post -> user -> vote into one record per (post, user) pair.
all_votes = [
    vote
    for users_votes in posts_users_votes.values()
    for vote in users_votes.values()
]
votes_df = pd.DataFrame.from_records(all_votes)

In [54]:
# Export both tables as tab-separated files, without the index column.
for frame, filename in ((posts_df, "post_info.txt"), (votes_df, "all_votes.txt")):
    frame.to_csv(curio_data_path + filename, sep="\t", index=False)