# ⚙️ Install Libraries and Download Dataset
The master's thesis used a much larger dataset of Reddit comments. Here, a single, smaller dump (`RC_2019-04`) is used instead to reduce the download time. The rest of the procedure is analogous to the master's thesis.

In [None]:
!pip install tqdm pandas zstandard transformers evaluate

!wget -O RC_2019-04.zst https://zenodo.org/record/3608135/files/RC_2019-04.zst?download=1

Unpack the file

In [None]:
import os

import zstandard as zstd

compressed_path = 'RC_2019-04.zst'
decompressed_path = 'RC_2019-04'
partial_path = decompressed_path + '.part'

# Skip the work when the unpacked dump is already on disk (e.g. on a re-run).
if not os.path.exists(decompressed_path):
    # NOTE(review): Reddit dumps are often compressed with a long window; raising the
    # limit to 2**31 lets such archives decode and is harmless otherwise -- confirm.
    dctx = zstd.ZstdDecompressor(max_window_size=2 ** 31)

    # Stream chunk by chunk so neither the archive nor the unpacked data has to fit in memory.
    with open(compressed_path, 'rb') as compressed, open(partial_path, 'wb') as unpacked:
        dctx.copy_stream(compressed, unpacked)

    # Rename only after a complete run so an interrupted unpack never looks finished.
    os.replace(partial_path, decompressed_path)


# 👩‍💻 Extract comments

You can specify the subreddits whose comments should be collected in the `subreddits` list at the top of the code cell.

In [None]:
import pandas as pd
import json
import random
from tqdm import tqdm

# Subreddits whose comments are kept; comments from any other subreddit are skipped.
subreddits = [
    "antiwork",
    "atheism",
    "Conservative",
    "conspiracy",
    "dankmemes",
    "gaybros",
    "leagueoflegends",
    "lgbt",
    "Libertarian",
    "linguistics",
    "MensRights",
    "news",
    "offbeat",
    "PoliticalCompassMemes",
    "politics",
    "teenagers",
    "TrueReddit",
    "TwoXChromosomes",
    "wallstreetbets",
    "worldnews",
]

# Number of comments to randomly select (not referenced by the parsing code in this cell).
num_comments = 5000


def extract_fields(comment):
    """Reduce one parsed comment record to the fields kept for later steps.

    Args:
        comment: Mapping with at least the keys 'subreddit', 'id', 'link_id',
            'body', 'created_utc', 'parent_id' and 'permalink'.

    Returns:
        A new dict with the copied fields, where 'submission_id' is the part of
        'link_id' after its first underscore and 'permalink' is prefixed with
        the Reddit host.

    Raises:
        KeyError: If one of the required keys is missing.
        IndexError: If 'link_id' contains no underscore.
    """
    return {
        'subreddit': comment['subreddit'],
        'id': comment['id'],
        'submission_id': comment['link_id'].split('_')[1],
        'body': comment['body'],
        'created_utc': comment['created_utc'],
        'parent_id': comment['parent_id'],
        'permalink': 'https://www.reddit.com' + comment['permalink'],
    }


def iter_dump_lines(dump_path='RC_2019-04', archive_path='RC_2019-04.zst'):
    """Yield the comment dump line by line (one JSON document per line).

    An already unpacked dump at ``dump_path`` is read directly. Otherwise the
    zstd archive at ``archive_path`` (the file fetched by the wget call at the
    top of the notebook) is decompressed on the fly, so no unpacked copy is
    needed on disk or in memory.
    """
    import io
    import os

    import zstandard as zstd

    if os.path.exists(dump_path):
        with open(dump_path, 'r', encoding='utf-8') as unpacked:
            yield from unpacked
        return

    with open(archive_path, 'rb') as compressed:
        # NOTE(review): max_window_size is raised because Reddit dumps are often
        # compressed with a long window; harmless if this archive is not -- confirm.
        reader = zstd.ZstdDecompressor(max_window_size=2 ** 31).stream_reader(compressed)
        yield from io.TextIOWrapper(reader, encoding='utf-8')


# one (initially empty) list of extracted comments per selected subreddit
subreddit_groups = {subreddit: [] for subreddit in subreddits}

print("About to parse the whole file (this may take a while)")
for line in tqdm(iter_dump_lines()):
    comment = json.loads(line)
    # dict lookup doubles as the "is this a selected subreddit?" test
    group = subreddit_groups.get(comment['subreddit'])
    if group is not None:
        group.append(extract_fields(comment))
with open("gathered_comments.json", "w") as r:
    json.dump(subreddit_groups, r)
print("Parsed the whole file")


# 📋 Create a .csv file with a specified number of random comments from the selected subreddits

In [None]:
import json
import random
import pandas as pd
from tqdm import tqdm

subreddits = ["antiwork", "atheism", "Conservative", "conspiracy", "dankmemes", "gaybros", "leagueoflegends",
              "lgbt", "Libertarian", "linguistics", "MensRights", "news", "offbeat", "PoliticalCompassMemes",
              "politics",
              "teenagers", "TrueReddit", "TwoXChromosomes", "wallstreetbets", "worldnews"]

# upper bound on the number of comments kept per subreddit
num_comments = 3000

# fixed seed so that re-running the cell selects the same comments
random_seed = 42
rng = random.Random(random_seed)

# the context manager closes the file handle once the JSON has been loaded
with open("gathered_comments.json") as gathered_comments_file:
    gathered_comments = json.load(gathered_comments_file)

selected_comments = []
for subreddit in tqdm(subreddits):
    subreddit_comments = gathered_comments[subreddit]
    if len(subreddit_comments) < num_comments:
        # fewer comments than requested: keep all of them
        selected_comments.extend(subreddit_comments)
        continue
    selected_comments.extend(rng.sample(subreddit_comments, num_comments))

print("Convert the list to a data frame")
# create a DataFrame from the selected comments
df = pd.DataFrame(selected_comments)
print("List converted")

# save the DataFrame to a CSV file
df.to_csv('selected_comments.csv', index=False)
print("file saved")

# 🧑‍🔬 Filter data

In [None]:
import re
import pandas
from tqdm import tqdm
from transformers import AutoTokenizer

result_file_name = 'filtered_comments.csv'

min_length = 10  # token
max_length = 512  # token

# file written by the sampling step
data = pandas.read_csv("selected_comments.csv")

url_pattern = re.compile(r'https?:\/\/\S+')

tokenizer = AutoTokenizer.from_pretrained('gpt2')

filtered_data = []
for index, row in tqdm(data.iterrows(), total=data.shape[0], desc='Filter Comments'):
    comment = row.body
    # an empty body is read back from the CSV as NaN (a float), which has no string methods
    if not isinstance(comment, str):
        continue

    # turn real and escaped line breaks into spaces
    comment = comment.replace('\n', ' ').replace('\\n', ' ')
    # for comments that open with a quote marker, drop every '>' character
    if comment.startswith('>'):
        comment = comment.replace('>', '')
    # collapse runs of whitespace into single spaces and trim both ends
    comment = ' '.join(comment.split())

    # cheap content checks first, so rejected comments are never tokenized
    if comment == "[removed]" or comment == "[deleted]":
        continue
    # remove comments containing urls
    if url_pattern.search(comment):
        continue

    # check length
    token_count = len(tokenizer.tokenize(comment))
    if token_count > max_length or token_count < min_length:
        continue

    filtered_data.append({'subreddit': row.subreddit, 'id': row.id, 'submission_id': row.submission_id,
                          'body': comment, 'created_utc': row.created_utc, 'parent_id': row.parent_id,
                          'permalink': row.permalink})

filtered_data_df = pandas.DataFrame(filtered_data)
# index=False matches selected_comments.csv and avoids a stray unnamed index column
filtered_data_df.to_csv(result_file_name, sep=",", encoding="utf-8", index=False)


# 🧑‍🏭 Add perplexities to the filtered data

In [None]:
import pandas
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer

result_file_name = 'filtered_comments_with_ppl.csv'

# token-count bounds for comments passed to the perplexity model
min_token_size = 10
max_token_size = 1024

data = pandas.read_csv("filtered_comments.csv")

perplexity_model = evaluate.load("perplexity", module_type="measurement")
tokenizer = AutoTokenizer.from_pretrained('gpt2')

comment_objects = []
prepared_comments = []
for index, row in tqdm(data.iterrows(), total=data.shape[0], desc='Filter Comments'):
    comment = row.body
    token_size = len(tokenizer.tokenize(comment))
    # skip comments outside the bounds (compare the token count, not the tokenizer object)
    if token_size > max_token_size or token_size < min_token_size:
        continue
    comment_objects.append({'subreddit': row.subreddit, 'id': row.id, 'submission_id': row.submission_id,
                            'body': comment, 'created_utc': row.created_utc, 'parent_id': row.parent_id,
                            'permalink': row.permalink, 'token_size': token_size})
    prepared_comments.append(comment)

perplexity_results = perplexity_model.compute(data=prepared_comments, model_id='gpt2')
# print only the aggregate; the per-comment values go into the CSV below
print("Mean perplexity:", perplexity_results.get('mean_perplexity'))

# perplexities are used in the order of prepared_comments, i.e. the order of comment_objects
filtered_data = []
for comment_object, perplexity in zip(comment_objects, perplexity_results['perplexities']):
    comment_object['perplexity'] = perplexity
    filtered_data.append(comment_object)

filtered_data_df = pandas.DataFrame(filtered_data)
# index=False avoids writing an extra unnamed index column
filtered_data_df.to_csv(result_file_name, sep=",", encoding="utf-8", index=False)
