In [55]:
import pandas as pd
import itertools
import json

In [56]:
def read_n_lines(file_path, n_lines=1000, columns=None):
    """
    Read the first n records from a JSONL file into a pandas DataFrame,
    optionally selecting only specific columns.

    The file is always decoded as UTF-8 rather than the platform's default
    encoding. Blank (whitespace-only) lines are skipped and do not count
    towards ``n_lines``.

    Parameters:
    file_path (str): Path to the JSONL file
    n_lines (int or None): Maximum number of records to read (default: 1000);
        None reads every record in the file
    columns (list): List of column names to include (default: None, includes all
        columns). A requested key that is missing from a record is filled with None.

    Returns:
    pandas.DataFrame: DataFrame containing up to n records of JSONL data. When
        ``columns`` is given, the frame has exactly those columns, even if the
        file contains no records.

    Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    selected_data = []

    # Explicit encoding: without it, open() falls back to a locale-dependent
    # default, which can fail on non-ASCII text such as emoji.
    with open(file_path, "r", encoding="utf-8") as file:
        # Drop blank lines (e.g. a trailing newline) before json.loads sees them
        non_blank_lines = (line for line in file if line.strip())

        for line in itertools.islice(non_blank_lines, n_lines):
            record = json.loads(line)

            if columns:
                # Project each record immediately so the full record can be discarded
                record = {col: record.get(col) for col in columns}

            selected_data.append(record)

    # Passing the column list keeps the requested schema when no records were read;
    # an empty/None `columns` keeps the original "all columns" behaviour.
    return pd.DataFrame(selected_data, columns=list(columns) if columns else None)

In [57]:
# Input files, resolved relative to the current working directory.
# NOTE(review): the names look like monthly Reddit submission (RS) and
# comment (RC) dumps for 2020-05 -- confirm the data source.
file_path_posts, file_path_comments = "RS_2020-05", "RC_2020-05"

# Exploration sample: the first 10,000 records of each file, all columns kept.
posts = read_n_lines(file_path=file_path_posts, n_lines=10000)
comments = read_n_lines(file_path=file_path_comments, n_lines=10000)

In [58]:
# List every column name of each frame on one line (posts first, then comments)
for frame in (posts, comments):
    print(list(frame.columns))

['all_awardings', 'allow_live_comments', 'archived', 'author', 'author_created_utc', 'author_flair_background_color', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_patreon_flair', 'author_premium', 'can_gild', 'category', 'content_categories', 'contest_mode', 'created_utc', 'discussion_type', 'distinguished', 'domain', 'edited', 'gilded', 'gildings', 'hidden', 'hide_score', 'id', 'is_created_from_ads_ui', 'is_crosspostable', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id', 'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked', 'media', 'media_embed', 'media_only', 'name', 'no_follow', 'num_comments', 'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'post_

## Explore post annotations

In [59]:
# Preview the first ten loaded post records
posts.head(n=10)

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,promoted_display_name,promoted_url,show_media,sk_ad_network_data,third_party_trackers,third_party_tracking,third_party_tracking_2,author_cakeday,poll_data,rpan_video
0,[],True,True,qoupgod,1561421000.0,,,[],,,...,,,,,,,,,,
1,[],False,True,[deleted],,,,,,,...,,,,,,,,,,
2,[],False,True,Cowtipper105,1488652000.0,,,[],,,...,,,,,,,,,,
3,[],False,True,[deleted],,,,,,,...,,,,,,,,,,
4,[],False,True,Takainvancouver,1588290000.0,,,[],,,...,,,,,,,,,,
5,[],False,True,ezhun,1588291000.0,,,[],,,...,,,,,,,,,,
6,[],False,True,AnimeSubin,1582084000.0,#ddbd37,,"[{'e': 'text', 't': '{Moderator 🛡️}'}]",ccb986ba-606a-11e9-89fb-0e7a9c6b88d2,{Moderator 🛡️},...,,,,,,,,,,
7,[],False,True,20n20discounted,1588114000.0,,,[],,,...,,,,,,,,,,
8,[],True,True,sbags,1385618000.0,transparent,monkey,"[{'a': ':monky:', 'e': 'emoji', 'u': 'https://...",c2680206-2489-11eb-a70b-0e59fcb0359b,:monky:,...,,,,,,,,,,
9,[],False,True,[deleted],,,,,,,...,,,,,,,,,,


In [77]:
# Tally how often each distinct `selftext` value occurs among the loaded posts
selftext_counts = posts["selftext"].value_counts()
selftext_counts

selftext
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [61]:
# Share of loaded posts per `archived` value, expressed in percent
archived_share = posts["archived"].value_counts(normalize=True)
archived_percentage = archived_share * 100
print(archived_percentage)

archived
True     99.95
False     0.05
Name: proportion, dtype: float64


## Explore comment annotations

In [62]:
# Preview the first ten loaded comment records
comments.head(n=10)

Unnamed: 0,all_awardings,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,send_replies,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,top_awarded_type,total_awards_received,treatment_tags,author_cakeday
0,[],,littlewillingness,1531915000.0,,,[],,,,...,True,False,ACIM,t5_2s5jj,r/ACIM,public,,0,[],
1,[],,paidinteeth,1320182000.0,,,[],,,,...,True,False,videos,t5_2qh1e,r/videos,public,,0,[],
2,[],,wild-ish,,,,[],,,,...,True,False,step1,t5_37lrc,r/step1,public,,0,[],
3,[],,ergoergoergoergo,1544712000.0,,,[],,,,...,True,False,centrist,t5_2qx8j,r/centrist,public,,0,[],
4,[],,teenaamariee,1439857000.0,,,[],,http://www.wlsearch.com/u/teenaamariee,dark,...,True,False,Wishlist,t5_2qpbu,r/Wishlist,public,,0,[],
5,[],,cheucklate,1456717000.0,,,[],,,,...,True,False,smallbusiness,t5_2qr34,r/smallbusiness,public,,0,[],
6,[],,steatorrhoea,,,,[],,,,...,True,False,wallstreetbets,t5_2th52,r/wallstreetbets,public,,0,[],
7,[],,OldGrayMare59,1492711000.0,,,[],,,,...,True,False,Wellthatsucks,t5_2xcv7,r/Wellthatsucks,public,,0,[],
8,[],,Dub-Town-Warrior,,,,[],,,,...,True,False,memes,t5_2qjpg,r/memes,public,,0,[],
9,[],,twobadkidsin412,1466547000.0,,,[],,,,...,True,False,selfie,t5_2w2f5,r/selfie,public,,0,[],


In [63]:
# Most frequent comment bodies; placeholder values such as "[deleted]" show up here
body_counts = comments["body"].value_counts()
body_counts

body
[deleted]                                                                                                                                                                                                                                 433
[removed]                                                                                                                                                                                                                                 353
W                                                                                                                                                                                                                                          49
Nice                                                                                                                                                                                                                                       16
Thank you!                                 

In [64]:
# Number of loaded comments per `author` value, most frequent first
author_counts = comments["author"].value_counts()
author_counts

author
[deleted]              1001
AutoModerator           162
TrueIndologyBot          42
Aveexi                   27
grandwizardchatngga      16
                       ... 
IrNinjaBob                1
roguerenegade24           1
095805                    1
VikramV17                 1
TonyTurnUp                1
Name: count, Length: 7982, dtype: int64

In [65]:
# Number of loaded comments per distinct `controversiality` value
controversiality_counts = comments["controversiality"].value_counts()
controversiality_counts

controversiality
0    9824
1     176
Name: count, dtype: int64

In [66]:
# Fields kept for each record type; everything else is dropped while reading.
post_columns = [
    "created_utc",
    "id",
    "name",
    "title",
    "selftext",
    "subreddit",
    "score",
    "upvote_ratio",
    "num_comments",
    "archived",
    "author",
    "distinguished",
    "media",
]
comment_columns = [
    "created_utc",
    "id",
    "parent_id",
    "body",
    "score",
    "controversiality",
    "author",
]

# Re-read the same 10,000-record samples, replacing the wide `posts` and
# `comments` frames with narrow ones limited to the fields above.
posts = read_n_lines(file_path_posts, n_lines=10000, columns=post_columns)
comments = read_n_lines(file_path_comments, n_lines=10000, columns=comment_columns)

In [67]:
# Check the trimmed posts frame (first five rows)
posts.head(n=5)

Unnamed: 0,created_utc,id,name,title,selftext,subreddit,score,upvote_ratio,num_comments,archived,author,distinguished,media
0,1588291200,gb7h1s,t3_gb7h1s,While I was fiending for more gallery dept rep...,[https://item.taobao.com/item.htm?spm=a230r.1...,QualityReps,3,0.67,11,True,qoupgod,,
1,1588291200,gb7h1t,t3_gb7h1t,Why is my tiny pug so angry?,[removed],pugs,1,1.0,0,True,[deleted],,
2,1588291200,gb7h1u,t3_gb7h1u,3 RO membranes??,"Sooo I accidentally order an extra membrane, I...",ReefTank,3,0.8,3,True,Cowtipper105,,
3,1588291200,gb7h1v,t3_gb7h1v,alopecidente,[deleted],LMDShow,1,0.99,0,True,[deleted],,
4,1588291200,gb7h1w,t3_gb7h1w,GST for restaurant food in Vancouver BC,[removed],doordash,1,1.0,0,True,Takainvancouver,,


In [68]:
# Check the trimmed comments frame (first five rows)
comments.head(n=5)

Unnamed: 0,created_utc,id,parent_id,body,score,controversiality,author
0,1588291200,fp43xqf,t3_gaxv94,Theology can be a contentious topic. How much...,5,0,littlewillingness
1,1588291200,fp43xqi,t1_fp40jkl,"You know, it’s weird you say this, because I h...",2,0,paidinteeth
2,1588291200,fp43xql,t1_fp415ps,I'm reading so many threads in how the student...,1,0,wild-ish
3,1588291200,fp43xqm,t1_fp43ag2,no. what im saying is we arent saving anyone. ...,1,0,ergoergoergoergo
4,1588291200,fp43xqn,t1_fp3oud3,Travel the world! I'd be able to cross a few t...,2,0,teenaamariee


In [69]:
# `parent_id` is a Reddit "fullname": a three-character type prefix followed by
# the bare id. Per the Reddit API, "t3_" marks a post (link) and "t1_" a comment.
# Only a "t3_" parent is a post, so `post_id` is filled for those rows only and
# left missing for replies to other comments; otherwise a comment id would be
# stored under a column that claims to hold post ids.
parent_is_post = comments["parent_id"].str.startswith("t3_", na=False)
comments["post_id"] = comments["parent_id"].str[3:].where(parent_is_post)
comments["prefix"] = comments["parent_id"].str[:3]

In [70]:
# Inner join: pair each post with the comments whose `parent_id` equals the
# post's `name`. Columns present in both frames get the
# "_post" / "_comment" suffix.
merged = posts.merge(
    comments,
    how="inner",
    left_on="name",
    right_on="parent_id",
    suffixes=("_post", "_comment"),
)

In [71]:
# Row count of the joined frame (one row per matched post/comment pair)
merged.shape[0]

386

In [72]:
# Same inner join, this time keyed on the bare ids: the post's `id` against the
# `post_id` column of `comments`. The result replaces the previous `merged` frame.
merged = posts.merge(
    comments,
    how="inner",
    left_on="id",
    right_on="post_id",
    suffixes=("_post", "_comment"),
)

In [73]:
# Row count after joining on the bare ids
merged.shape[0]

386

In [74]:
# Number of joined rows per post id, largest first
rows_per_post = merged["id_post"].value_counts()
rows_per_post

id_post
gb7h2e    39
gb7hs7     8
gb7hdv     7
gb7h5c     7
gb7han     6
          ..
gb7hha     1
gb7hhm     1
gb7hhn     1
gb7hht     1
gb7hf9     1
Name: count, Length: 272, dtype: int64