In [1]:
import nltk
from convokit import Corpus, download
import os
import json
import re
from collections import defaultdict
import csv



TransformerDecoderModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.
UnslothUtteranceSimulatorModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.


In [2]:
# Location of the ChangeMyView utterances file inside the local
# ".convokit/saved-corpora" directory.
# NOTE(review): no other cell visible here reads `path` — confirm it is still needed.
path = "C:/Users/hofin/.convokit/saved-corpora/subreddit-changemyview/utterances.jsonl"

# Lexicon of single-word "othering" descriptors, assembled category by
# category. The order of the terms is the order used in the compiled regex.
othering_words = (
    # Dehumanizing terms (animals, pests, disease metaphors)
    [
        "animal", "beast", "savage", "barbaric", "subhuman", "primitive", "parasite", "vermin",
        "rat", "cockroach", "monster", "brute", "ape", "gorilla", "monkey", "dog",
        "pig", "swine", "goat", "bug", "leech", "tick", "lice", "maggot",
        "worm", "cancer", "tumor", "virus", "plague", "infection", "disease",
    ]
    # Moral judgment / worthlessness
    + [
        "scum", "trash", "filth", "worthless", "inferior", "degenerate", "lowlife", "unworthy",
        "unclean", "impure", "corrupt", "dirty", "disgusting", "vile", "evil", "wicked",
        "sinful", "cursed", "burden", "freeloader", "sponger",
    ]
    # Threatening / dangerous
    + [
        "dangerous", "violent", "aggressive", "hostile", "criminal", "thug", "deviant", "radical",
        "extremist", "fundamentalist", "terrorist", "predator", "rapist", "gangster", "invader",
        "occupier", "conqueror", "infiltrator", "threat",
    ]
    # Exclusion / separation
    + [
        "outsider", "intruder", "alien", "stranger", "foreigner", "enemy", "unwelcome", "illegal",
        "unwanted", "expat", "outcast", "undesirable", "colonizer", "settler",
    ]
    # Diminishing / infantilizing
    + [
        "ignorant", "backward", "naive", "uncivilized", "helpless", "weak", "stupid", "lazy",
        "immature", "childlike", "silly", "emotional", "hysterical", "irrational", "clueless",
        "brainwashed", "sheep", "puppet", "follower",
    ]
)

# Lexicon of group mentions (single words and multi-word phrases), assembled
# category by category. The order of the terms is the order used in the
# compiled regex.
othering_groups = (
    # Religion & secular identities
    [
        "atheists", "non-believers", "secular people",
        "buddhists", "buddhist people",
        "hindus", "hindu people",
        "christians", "christian people", "catholics", "protestants",
        "mormons", "evangelicals", "pagans", "satanists",
        "muslims", "islamic people", "islamists", "muzzies", "ragheads",
        "jews", "jewish people", "zionists", "orthodox jews",
    ]
    # Migration / nationality
    + [
        "immigrants", "migrants", "foreigners", "outsiders", "refugees",
        "asylum seekers", "expats", "nationals", "illegal aliens", "illegals",
        "invaders", "colonizers", "settlers",
    ]
    # Gender & women
    + [
        "women", "girls", "females", "ladies", "wives", "mothers",
        "bitches", "sluts", "whores", "feminists", "feminazis",
    ]
    # LGBTQ+
    + [
        "lgbtq", "gay", "gays", "lesbian", "lesbians",
        "bisexual", "transgender", "trans", "tranny", "trannies",
        "queer", "queers", "dyke", "dykes", "faggot", "faggots",
        "non-binary", "drag queens", "drag kings",
    ]
)

def _build_term_pattern(terms):
    """Compile a case-insensitive regex that matches any entry of `terms`.

    Each term is escaped literally, the terms are joined into one alternation
    in the given order, and the alternation is wrapped in `\\b` word-boundary
    anchors inside a single capturing group.
    """
    alternation = "|".join(re.escape(term) for term in terms)
    return re.compile(r"\b(" + alternation + r")\b", re.IGNORECASE)


# One pattern per lexicon; both are used by the CSV filter functions.
othering_pattern = _build_term_pattern(othering_words)
othering_group_pattern = _build_term_pattern(othering_groups)

# NOTE(review): no visible cell appends to or reads this list — confirm it is still needed.
othering_sentences = []

In [4]:
def filter_and_write_to_csv(corpus, output_file_path, mode="a",
                            group_pattern=None, word_pattern=None):
    """Write utterances that mention a group AND an othering word to a CSV file.

    An utterance is kept when its *text* matches both `group_pattern` and
    `word_pattern`. Matching is done on the text only, not on the string
    representation of the whole utterance, so speaker ids and metadata
    cannot produce matches.

    Args:
        corpus: Object exposing `iter_utterances()`; each yielded utterance is
            read through the attributes `id`, `root`/`conversation_id`,
            `reply_to`, `timestamp` and `text`.
        output_file_path: Destination CSV path.
        mode: File mode passed to `open` ("a" appends, "w" overwrites).
        group_pattern: Compiled regex for group mentions. Defaults to the
            module-level `othering_group_pattern`.
        word_pattern: Compiled regex for othering words. Defaults to the
            module-level `othering_pattern`.

    The header row is written when `mode == "w"` or when the target file does
    not exist yet / is empty, so a file created in append mode still gets
    exactly one header.
    """
    if group_pattern is None:
        group_pattern = othering_group_pattern
    if word_pattern is None:
        word_pattern = othering_pattern

    header = ["id", "root", "reply_to", "timestamp", "text"]

    # Collect matches before touching the output file, so a failure while
    # iterating the corpus cannot leave a truncated or half-written file.
    matches = []
    for utt in corpus.iter_utterances():
        text = getattr(utt, "text", None)
        text = "" if text is None else str(text)
        if group_pattern.search(text) and word_pattern.search(text):
            matches.append((utt, text))

    # Must be evaluated before open(), which creates the file in append mode.
    is_new_or_empty = (
        not os.path.exists(output_file_path)
        or os.path.getsize(output_file_path) == 0
    )
    write_header = mode == "w" or is_new_or_empty

    with open(output_file_path, mode, encoding="utf-8", newline="") as f:
        writer = csv.writer(f)

        if write_header:
            writer.writerow(header)

        for utt, text in matches:
            # Prefer `root` when the utterance has it; otherwise fall back to
            # `conversation_id` instead of silently writing an empty column.
            root = getattr(utt, "root", None)
            if root is None:
                root = getattr(utt, "conversation_id", "")

            writer.writerow([
                getattr(utt, "id", ""),
                root,
                getattr(utt, "reply_to", ""),
                getattr(utt, "timestamp", ""),
                text.replace("\n", " "),  # keep every record on one physical line
            ])

In [5]:
def filter_groups_to_csv(corpus, output_file_path, mode="a",
                         group_pattern=None, max_tokens=100, tokenize=None):
    """Write short utterances that mention a group to a CSV file.

    An utterance is kept when its *text* matches `group_pattern` and the
    tokenized text has at most `max_tokens` tokens. Matching is done on the
    text only, not on the string representation of the whole utterance, so
    speaker ids and metadata cannot produce matches.

    Args:
        corpus: Object exposing `iter_utterances()`; each yielded utterance is
            read through the attributes `id`, `root`/`conversation_id`,
            `reply_to`, `timestamp` and `text`.
        output_file_path: Destination CSV path.
        mode: File mode passed to `open` ("a" appends, "w" overwrites).
        group_pattern: Compiled regex for group mentions. Defaults to the
            module-level `othering_group_pattern`.
        max_tokens: Inclusive upper bound on the token count (default 100).
        tokenize: Callable mapping a string to a sequence of tokens. Defaults
            to `nltk.tokenize.word_tokenize`.

    The header row is written when `mode == "w"` or when the target file does
    not exist yet / is empty, so a file created in append mode still gets
    exactly one header.
    """
    if group_pattern is None:
        group_pattern = othering_group_pattern
    if tokenize is None:
        tokenize = nltk.tokenize.word_tokenize

    header = ["id", "root", "reply_to", "timestamp", "text"]

    # Collect matches before touching the output file, so a failure while
    # iterating the corpus cannot leave a truncated or half-written file.
    matches = []
    for utt in corpus.iter_utterances():
        text = getattr(utt, "text", None)
        text = "" if text is None else str(text)
        # The regex runs first so the tokenizer is only invoked on candidates.
        if group_pattern.search(text) and len(tokenize(text)) <= max_tokens:
            matches.append((utt, text))

    # Must be evaluated before open(), which creates the file in append mode.
    is_new_or_empty = (
        not os.path.exists(output_file_path)
        or os.path.getsize(output_file_path) == 0
    )
    write_header = mode == "w" or is_new_or_empty

    with open(output_file_path, mode, encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=",")

        if write_header:
            writer.writerow(header)

        for utt, text in matches:
            # Prefer `root` when the utterance has it; otherwise fall back to
            # `conversation_id` instead of silently writing an empty column.
            root = getattr(utt, "root", None)
            if root is None:
                root = getattr(utt, "conversation_id", "")

            writer.writerow(
                [
                    getattr(utt, "id", ""),
                    root,
                    getattr(utt, "reply_to", ""),
                    getattr(utt, "timestamp", ""),
                    text.replace("\n", " "),  # keep every record on one physical line
                ]
            )

In [3]:
def load_dataset_dynamic(corpus, start_index, end_index):
    """Load a slice of a ConvoKit corpus into memory.

    Args:
        corpus: Name of the corpus, passed to `download`.
        start_index: Forwarded as `utterance_start_index`.
        end_index: Forwarded as `utterance_end_index`.
            NOTE(review): the recorded output for the range 0..10000 reports
            10001 utterances, so the end index appears to be inclusive.

    Returns:
        The `Corpus` built from the downloaded location with the "mem" backend.
    """
    corpus_location = download(corpus)
    load_options = {
        "backend": "mem",
        "utterance_start_index": start_index,
        "utterance_end_index": end_index,
    }
    return Corpus(filename=corpus_location, **load_options)


In [None]:
# Filter the "subreddit-4chan" corpus in consecutive utterance windows and
# append the matches of every window to one output file.
output_file = "C:/Users/hofin/OneDrive - Fachhochschule St. Pölten/__BachelorThesis/4chan_final_filtered.txt"
i, j = 50000, 60000  # first and last utterance index of the current window

while True:
    if j > 100000:
        break

    corpus1 = load_dataset_dynamic("subreddit-4chan", i, j)
    filter_and_write_to_csv(corpus1, output_file, mode="a")

    # The next window starts right after the current one and ends 10000 further on.
    i, j = j + 1, j + 10000

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan


In [None]:
# `defaultdict` is already imported in the notebook's first (imports) cell.
corpus1 = load_dataset_dynamic("subreddit-changemyview", 8000, 9000)

# Bucket "<utterance id> <text>" lines under the id of their conversation.
convos = defaultdict(list)
for utt in corpus1.iter_utterances():
    entry = " ".join((utt.id, utt.text))
    convos[utt.conversation_id].append(entry)

# Print every conversation as: header line, its utterance lines, blank separator.
for convo_id in convos:
    print(f"=== Conversation {convo_id} ===")
    print("\n".join(convos[convo_id]))
    print()


Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview


AttributeError: 'ConvoKitMeta' object has no attribute 'permalink'

In [23]:
# Load the utterance window that is searched for the conversation below.
corpus1 = load_dataset_dynamic("subreddit-changemyview", 200000, 500000)

# Id of the conversation to display.
target_id = "1edyfk"

print(f"=== Conversation {target_id} ===")
for utt in corpus1.iter_utterances():
    # Skip everything that belongs to a different conversation.
    if utt.conversation_id != target_id:
        continue
    print(utt)


Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
=== Conversation 1edyfk ===
Utterance(id: 'c9z9s3n', conversation_id: 1edyfk, reply-to: 1edyfk, speaker: Speaker(id: 'SilkyTheCat', vectors: [], meta: ConvoKitMeta({})), timestamp: 1368632408, text: "It seems as if your contention amounts to the claim that there is a real need for men's help centers in some places and of some sort.  Just to be clear, is this what your view amounts to?  It seems as if the rest of your view hinges on this premise, and that recognizing that this premise is true will validate the rest of your view.", vectors: [], meta: ConvoKitMeta({'score': 25, 'top_level_comment': 'c9z9s3n', 'retrieved_on': 1431149501, 'gilded': 0, 'gildings': None, 'subreddit': 'changemyview', 'stickied': False, 'permalink': '', 'author_flair_text': '5∆'}))
Utterance(id: 'c9z9yen', conversation_id: 1edyfk, reply-to: 1edyfk, speaker: Speaker(id: 'Froolow', vectors: [], meta: ConvoKitMeta({})), timesta

In [None]:
# Load the complete "reddit-corpus-small" corpus and print its summary statistics.
corpus_small = Corpus(filename=download("reddit-corpus-small"))
corpus_small.print_summary_stats()

In [41]:
# Load the complete "subreddit-Incels4Life" corpus and print its summary statistics.
corpus_incels = Corpus(filename=download("subreddit-Incels4Life"))
corpus_incels.print_summary_stats()

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-Incels4Life
Number of Speakers: 495
Number of Utterances: 2736
Number of Conversations: 389


In [3]:
# Load the complete "subreddit-4chan4trump" corpus and print its summary statistics.
corpus_4chan4trump = Corpus(filename=download("subreddit-4chan4trump"))
corpus_4chan4trump.print_summary_stats()

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan4trump
Number of Speakers: 174
Number of Utterances: 914595
Number of Conversations: 17703


In [None]:
# Load utterance indices 0..10000 of ChangeMyView through the shared loader
# (same Corpus call: "mem" backend plus start/end index) and print the stats.
corpus1 = load_dataset_dynamic("subreddit-changemyview", 0, 10000)
corpus1.print_summary_stats()

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Number of Speakers: 5523
Number of Utterances: 10001
Number of Conversations: 10001


In [None]:
# Executed already. The output file is opened in append mode, so running this
# cell again appends the same rows a second time.
i = 1900001
j = 1910000
output_file = "C:/Users/hofin/OneDrive - Fachhochschule St. Pölten/__BachelorThesis/changemyview.txt"

# Walk the "subreddit-changemyview" corpus in consecutive utterance windows
# [i, j] and append the matches of each window to `output_file`.
while j <= 2000000:

    corpus1 = load_dataset_dynamic("subreddit-changemyview", i, j)
    # The writer defined in this notebook is `filter_and_write_to_csv`; the
    # name `filter_and_write_to_file` used here before is not defined in any
    # cell and raises NameError on a fresh kernel.
    filter_and_write_to_csv(corpus1, output_file, mode="a")

    # The next window starts right after the current one.
    i = j + 1
    j += 10000

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-changemyview


In [None]:
# Executed already. The output file is opened in append mode, so running this
# cell again appends the same rows a second time.
# NOTE(review): the first window spans indices 0..100000 while every later
# window spans 10000 indices — confirm the larger first window is intended.
i = 0
j = 100000
output_file = "C:/Users/hofin/OneDrive - Fachhochschule St. Pölten/__BachelorThesis/4chan_big.txt"

# Walk the "subreddit-4chan" corpus in consecutive utterance windows [i, j]
# and append the matches of each window to `output_file`.
while j <= 5000000:

    corpus1 = load_dataset_dynamic("subreddit-4chan", i, j)
    # The writer defined in this notebook is `filter_and_write_to_csv`; the
    # name `filter_and_write_to_file` used here before is not defined in any
    # cell and raises NameError on a fresh kernel.
    filter_and_write_to_csv(corpus1, output_file, mode="a")

    # The next window starts right after the current one.
    i = j + 1
    j += 10000

Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C:\Users\hofin\.convokit\saved-corpora\subreddit-4chan
Dataset already exists at C: