# ⚙️ Install Libraries and Download Dataset
The master's thesis used a much larger dataset of Reddit comments. Here, a single monthly dump (`RC_2019-04`) is used instead to reduce the download time. The rest of the procedure is analogous to the one in the master's thesis.

In [13]:
!pip install tqdm pandas zstandard

!wget -O RC_2019-04.zst https://zenodo.org/record/3608135/files/RC_2019-04.zst?download=1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2023-06-17 20:54:25--  https://zenodo.org/record/3608135/files/RC_2019-04.zst?download=1
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15531201485 (14G) [application/octet-stream]
Saving to: ‘RC_2019-04.zst’


2023-06-17 21:15:48 (11.6 MB/s) - ‘RC_2019-04.zst’ saved [15531201485/15531201485]



Unpack the file

In [None]:
import zstandard as zstd

# Stream-decompress the archive to disk in chunks. Reading the whole archive
# with f.read() and calling the one-shot decompress() would need the compressed
# AND the decompressed data in RAM at once, and the one-shot API only works when
# the frame header records the content size.
# NOTE: the decompressed dump is considerably larger than the archive -
# make sure enough free disk space is available before running this cell.
compressed_path = 'RC_2019-04.zst'
decompressed_path = 'RC_2019-04'

# max_window_size is raised to the maximum because Reddit dumps are commonly
# compressed with a long window (assumption - harmless if this archive is not).
dctx = zstd.ZstdDecompressor(max_window_size=2 ** 31)
with open(compressed_path, "rb") as src, open(decompressed_path, "wb") as dst:
    dctx.copy_stream(src, dst)


# 👩‍💻 Extract comments

You can specify the subreddits whose comments should be collected in the `subreddits` list at the top of the code cell below.

In [None]:
import pandas as pd
import json
import random
from tqdm import tqdm

# Subreddits to keep; comments from any other subreddit are ignored.
# Names are compared exactly (case-sensitive) against each comment's 'subreddit' field.
subreddits = [
    "antiwork",
    "atheism",
    "Conservative",
    "conspiracy",
    "dankmemes",
    "gaybros",
    "leagueoflegends",
    "lgbt",
    "Libertarian",
    "linguistics",
    "MensRights",
    "news",
    "offbeat",
    "PoliticalCompassMemes",
    "politics",
    "teenagers",
    "TrueReddit",
    "TwoXChromosomes",
    "wallstreetbets",
    "worldnews",
]

# Number of comments to randomly select.
# NOTE(review): not read anywhere in this cell - the sampling cell further down
# assigns its own value before use.
num_comments = 5000


# define function to extract relevant fields from comment
def extract_fields(comment):
    """Reduce a raw comment record to the fields kept for later processing.

    Args:
        comment: mapping with at least the keys 'subreddit', 'id', 'link_id',
            'body', 'created_utc', 'parent_id' and 'permalink'.

    Returns:
        A new dict with the keys 'subreddit', 'id', 'submission_id', 'body',
        'created_utc', 'parent_id' and 'permalink' (in that order).

    Raises:
        KeyError: if one of the required keys is missing from ``comment``.
        IndexError: if 'link_id' contains no underscore.
    """
    return {
        'subreddit': comment['subreddit'],
        'id': comment['id'],
        # submission id is the second '_'-separated piece of link_id
        'submission_id': comment['link_id'].split('_')[1],
        'body': comment['body'],
        'created_utc': comment['created_utc'],
        'parent_id': comment['parent_id'],
        # the record stores a site-relative path; prefix it to get a full URL
        'permalink': 'https://www.reddit.com' + comment['permalink'],
    }


import io

import zstandard as zstd

# Comment dump to parse: one JSON object per line. The notebook downloads
# 'RC_2019-04.zst'; a path ending in '.zst' is decompressed on the fly, any
# other path is read as plain text (e.g. an already decompressed copy).
dump_path = 'RC_2019-04.zst'

# one (initially empty) list of extracted comments per selected subreddit
subreddit_groups = {subreddit: [] for subreddit in subreddits}

print("About to parse the whole file (this may take a while)")
with open(dump_path, 'rb') as raw:
    if dump_path.endswith('.zst'):
        # max_window_size is raised to the maximum because Reddit dumps are commonly
        # compressed with a long window (assumption - harmless otherwise).
        byte_stream = zstd.ZstdDecompressor(max_window_size=2 ** 31).stream_reader(raw)
    else:
        byte_stream = raw
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    lines = io.TextIOWrapper(byte_stream, encoding='utf-8')
    for line in tqdm(lines):
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the dump)
            continue
        comment = json.loads(line)
        # dict lookup instead of scanning the subreddit list for every comment
        group = subreddit_groups.get(comment['subreddit'])
        if group is not None:
            group.append(extract_fields(comment))

# persist the grouped comments so the sampling step does not need to re-parse the dump
with open("gathered_comments.json", "w") as r:
    json.dump(subreddit_groups, r)
print("Parsed the whole file")


# 📋 Create a .csv file with a specified number of random comments from the selected subreddits

In [None]:
import json
import random
import pandas as pd
from tqdm import tqdm

# Subreddits to sample from (repeated here so this cell can run on its own
# once gathered_comments.json exists).
subreddits = ["antiwork", "atheism", "Conservative", "conspiracy", "dankmemes", "gaybros", "leagueoflegends",
              "lgbt", "Libertarian", "linguistics", "MensRights", "news", "offbeat", "PoliticalCompassMemes",
              "politics",
              "teenagers", "TrueReddit", "TwoXChromosomes", "wallstreetbets", "worldnews"]

# maximum number of comments to keep per subreddit
num_comments = 3000

# fixed seed so that re-running the cell draws the same comments
seed = 42
rng = random.Random(seed)

# 'with' makes sure the file handle is closed once the JSON is loaded
with open("gathered_comments.json") as gathered_comments_file:
    gathered_comments = json.load(gathered_comments_file)

selected_comments = []
for subreddit in tqdm(subreddits):
    # a subreddit missing from the JSON simply contributes no comments
    comments = gathered_comments.get(subreddit, [])
    if len(comments) < num_comments:
        # fewer comments than requested: keep all of them
        selected_comments.extend(comments)
    else:
        selected_comments.extend(rng.sample(comments, num_comments))

print("Convert the list to a data frame")
# create a DataFrame from the selected comments
df = pd.DataFrame(selected_comments)
print("List converted")

# save the DataFrame to a CSV file
df.to_csv('selected_comments.csv', index=False)
print("file saved")