**Data Retrieval, Sampling and Incivility scores (PerspectiveAPI)**


**Content**

For each Subreddit:
1. Functions for generating .csv from .zst (Reddit Dump format)

2. Sampling 19 Comments for each month (from 2012 until 2024)

3. Calling PerspectiveAPI to label each comment and saving to {subreddit}_with_scores.csv for next step

In [None]:

import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers
import pandas as pd
from googleapiclient import discovery # need to pip install too
import time
from tqdm import tqdm


In [None]:
# generating csv out of Reddit dump / comments zst to csv
# EXAMPLE file_path = "liberal_comments.zst"
# EXAMPLE output_file = "liberal_comments.csv"
def generate_csv_from_dump(file_path, output_file):


    comment_fields = [
        "id", "parent_id", "link_id", "author", "author_flair_text", "body",
        "created_utc", "score", "controversiality", "distinguished", "edited",
        "gilded", "is_submitter", "stickied", "subreddit", "subreddit_id"
    ]


    lines_processed = 0
    bad_lines = 0
    buffer = ""

    with open(file_path, "rb") as f_in, open(output_file, "w", newline="", encoding="utf-8") as f_out:
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(f_in)
        writer = csv.DictWriter(f_out, fieldnames=comment_fields)
        writer.writeheader()

        while True:
            chunk = reader.read(2**25)
            if not chunk:
                break
            chunk = chunk.decode(errors='replace')
            lines = (buffer + chunk).split("\n")

            for line in lines[:-1]:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                    row = {k: obj.get(k, "") for k in comment_fields}  # fehlende Keys als ""
                    writer.writerow(row)
                    lines_processed += 1
                except json.JSONDecodeError:
                    bad_lines += 1

            buffer = lines[-1]

    print(f"Fertig! {lines_processed} Zeilen verarbeitet, {bad_lines} fehlerhafte JSON-Zeilen.")
    print(f"Daten gespeichert in: {output_file}")



In [None]:
# First preprocessing step: take 100 comments per year.
# oldest comment: 2009-05-03, newest comment: 2024-12-31
# we can start from 2012 - complete until 2024



def Extract_Monthly_samples(df, Subreddit):
    data = df.copy()
    data["year"] = pd.to_datetime(data["created_utc"], unit="s").dt.year
    data["month"] = pd.to_datetime(data["created_utc"], unit="s").dt.month

    # no empty or deleted samples
    mask = (
        data["body"].notna()
        & ~data["body"].isin(["[deleted]", "[removed]", ""])
    )
    data = data[mask]

    # Filter years
    sampled = data[(data["year"] > 2011) & (data["year"] < 2025)]# Sample 19 per (year, month)
    
    sampled_csv = (
        sampled
        .groupby(["year", "month"], group_keys=False)
        .apply(lambda g: g.sample(n=min(len(g), 19), random_state=42))
    )
    sampled_csv.to_csv(f"{Subreddit}_sampled.csv", index=False)
    
    return sampled_csv



In [None]:


API_KEY = ''

client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

def Generate_Incivility_Ratings(data, Subreddit):
# initialise new column
    data['incivility_score'] = 0.0 

    for idx, row in tqdm(data.iterrows(), total=len(data)):
        comment = row['body']

       # checking for empty comments
        if not comment or comment.strip() == "":
            continue
        if row['incivility_score'] != 0: 
            continue

        analyze_request = {
            'comment': {'text': comment},
            'requestedAttributes': {'TOXICITY': {}},
            'languages': ['en']   
        }

        try:
            response = client.comments().analyze(body=analyze_request).execute()
            data.at[idx, 'incivility_score'] = response["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
        except Exception as e:
            print(f"Error at index {idx}: {e}")

        if idx % 250 == 0 and idx > 0:
            data.to_csv(f"{Subreddit}_with_scores.csv", index=False)
            print(f"Zwischenspeicherung bei Index {idx}")

        time.sleep(.8) # rate is 1 request per second

    data.to_csv(f"{Subreddit}_with_scores.csv", index=False)

In [None]:
# Starting with Liberal
file_path = "Liberal_comments.zst"
output_file = "Liberal_comments.csv"

generate_csv_from_dump(file_path, output_file)



Fertig! 497079 Zeilen verarbeitet, 0 fehlerhafte JSON-Zeilen.
Daten gespeichert in: C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\Liberal_comments.csv


In [None]:
output_file = "Liberal_comments.csv"
df = pd.read_csv(output_file)
Subreddit = "Liberal"
samples_Liberal = Extract_Monthly_samples(df, Subreddit)


  df = pd.read_csv(output_file)
  .apply(lambda g: g.sample(n=min(len(g), 19), random_state=42))


In [None]:
# testing with smaller sample size 
test = samples_Liberal.sample(n=40, random_state=42) 
Generate_Incivility_Ratings(test, Subreddit)

100%|██████████| 40/40 [00:47<00:00,  1.19s/it]


In [3]:
samples_Liberal = pd.read_csv(r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\Liberal_with_scores.csv")
Subreddit = "Liberal"
Generate_Incivility_Ratings(samples_Liberal, Subreddit)

100%|██████████| 2964/2964 [00:00<00:00, 24093.10it/s]


2. Import And Rate Conservative Subreddit Data


In [None]:
# file_path = "Conservative_comments.zst"
# output_file = "Conservative_comments.csv"

# generate_csv_from_dump(file_path, output_file)
# # took 8 minutes 

Fertig! 18984143 Zeilen verarbeitet, 0 fehlerhafte JSON-Zeilen.
Daten gespeichert in: C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\Conservative_comments.csv


In [None]:
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\Conservative_comments.csv"
df = pd.read_csv(output_file)
Subreddit = "Conservative"
samples_Conservative = Extract_Monthly_samples(df, Subreddit)
# took 8 minutes


In [None]:
# # testing with smaller sample size 
# test = samples_Conservative.sample(n=40, random_state=42) 
# Generate_Incivility_Ratings(test, Subreddit)

100%|██████████| 40/40 [00:48<00:00,  1.20s/it]


In [None]:
# # test worked
# test_csv = pd.read_csv(r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\Conservative_with_scores.csv")

In [4]:
# test worked
samples_Conservative = pd.read_csv(r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\Conservative_with_scores.csv")
Subreddit = "Conservative"

In [5]:
Generate_Incivility_Ratings(samples_Conservative, Subreddit)
# done after 5h 43

100%|██████████| 2964/2964 [08:44<00:00,  5.65it/s]


3. Import and Rating of r/funny posts n = 3420 (originally, before settling on 2012 -2024 )

In [12]:
# r /funny
file_path = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\funny_comments.zst"
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\funny_comments.csv"

generate_csv_from_dump(file_path, output_file)

# took 35 min

Fertig! 117525270 Zeilen verarbeitet, 0 fehlerhafte JSON-Zeilen.
Daten gespeichert in: C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\funny_comments.csv


In [13]:
import pandas as pd
import random
from collections import defaultdict
from datetime import datetime

# Parameter
input_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\funny_comments.csv"
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\funny_sampled.csv"
target_subreddit = "funny"
samples_per_month = 19
chunksize = 10**6

cols = ["id", "parent_id", "link_id", "author", "body", "created_utc", "score", "subreddit"]
dtypes = {
    "id": "string", "parent_id": "string", "link_id": "string",
    "author": "string", "body": "string", "created_utc": "int64",
    "score": "int32", "subreddit": "string"
}

reservoirs = defaultdict(list)
counts = defaultdict(int)
start_year, end_year = 2012, 2024

chunk_num = 0
for chunk in pd.read_csv(input_file, usecols=cols, dtype=dtypes, chunksize=chunksize):
    chunk_num += 1
    subset = chunk[chunk["subreddit"] == target_subreddit]
    created_dt = pd.to_datetime(subset["created_utc"], unit='s')
    months = created_dt.dt.strftime("%Y-%m")
    
    for row, month in zip(subset.itertuples(index=False), months):
        if pd.isna(row.body) or row.body in ("[deleted]", "[removed]", ""):
            continue
        year = int(month[:4])
        if start_year <= year <= end_year:
            counts[month] += 1
            idx = counts[month]
            if len(reservoirs[month]) < samples_per_month:
                reservoirs[month].append(row)
            else:
                j = random.randint(0, idx - 1)
                if j < samples_per_month:
                    reservoirs[month][j] = row

    if chunk_num % 1 == 0:
        total_samples = sum(len(v) for v in reservoirs.values())
        print(f"Chunk {chunk_num}: total samples so far = {total_samples}")

# Flatten reservoirs
final_samples = []
for month, rows in sorted(reservoirs.items()):
    for row in rows:
        r = row._asdict()
        r["month"] = month
        final_samples.append(r)

df_samples = pd.DataFrame(final_samples)
df_samples.to_csv(output_file, index=False)
print("Fertig! Samples gespeichert in:", output_file)


Chunk 1: total samples so far = 0
Chunk 2: total samples so far = 0
Chunk 3: total samples so far = 0
Chunk 4: total samples so far = 0
Chunk 5: total samples so far = 0
Chunk 6: total samples so far = 0
Chunk 7: total samples so far = 0
Chunk 8: total samples so far = 38
Chunk 9: total samples so far = 57
Chunk 10: total samples so far = 76
Chunk 11: total samples so far = 95
Chunk 12: total samples so far = 114
Chunk 13: total samples so far = 133
Chunk 14: total samples so far = 133
Chunk 15: total samples so far = 152
Chunk 16: total samples so far = 171
Chunk 17: total samples so far = 190
Chunk 18: total samples so far = 209
Chunk 19: total samples so far = 228
Chunk 20: total samples so far = 228
Chunk 21: total samples so far = 247
Chunk 22: total samples so far = 266
Chunk 23: total samples so far = 285
Chunk 24: total samples so far = 285
Chunk 25: total samples so far = 304
Chunk 26: total samples so far = 323
Chunk 27: total samples so far = 342
Chunk 28: total samples so f

In [6]:
# # testing with smaller sample size 
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\funny_with_scores.csv"
Subreddit = "funny"
samples_funny = pd.read_csv(output_file)
# test = samples_funny.sample(n=40, random_state=42) 
# Generate_Incivility_Ratings(test, Subreddit)

In [7]:

Generate_Incivility_Ratings(samples_funny, Subreddit)

 64%|██████▍   | 1897/2964 [00:08<00:11, 95.98it/s]  

Zwischenspeicherung bei Index 2000


100%|██████████| 2964/2964 [00:16<00:00, 175.18it/s]


4. Import and rating of r/ politics n = 3420


In [None]:
# # r/ politics
# file_path = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\politics_comments.zst"
# output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\politics_comments.csv"

# generate_csv_from_dump(file_path, output_file)

# # took 94 min

Fertig! 201460965 Zeilen verarbeitet, 0 fehlerhafte JSON-Zeilen.
Daten gespeichert in: C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\politics_comments.csv


In [20]:
import pandas as pd
import random
from collections import defaultdict
from datetime import datetime

# Parameter
input_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\reddit\subreddits24\politics_comments.csv"
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\politics_sampled.csv"
target_subreddit = "politics"
samples_per_month = 19
chunksize = 10**6

cols = ["id", "parent_id", "link_id", "author", "body", "created_utc", "score", "subreddit"]
dtypes = {
    "id": "string", "parent_id": "string", "link_id": "string",
    "author": "string", "body": "string", "created_utc": "int64",
    "score": "int32", "subreddit": "string"
}

reservoirs = defaultdict(list)
counts = defaultdict(int)
start_year, end_year = 2012, 2024

chunk_num = 0
for chunk in pd.read_csv(input_file, usecols=cols, dtype=dtypes, chunksize=chunksize):
    chunk_num += 1
    subset = chunk[chunk["subreddit"] == target_subreddit]
    created_dt = pd.to_datetime(subset["created_utc"], unit='s')
    months = created_dt.dt.strftime("%Y-%m")
    
    for row, month in zip(subset.itertuples(index=False), months):
        if pd.isna(row.body) or row.body in ("[deleted]", "[removed]", ""):
            continue
        year = int(month[:4])
        if start_year <= year <= end_year:
            counts[month] += 1
            idx = counts[month]
            if len(reservoirs[month]) < samples_per_month:
                reservoirs[month].append(row)
            else:
                j = random.randint(0, idx - 1)
                if j < samples_per_month:
                    reservoirs[month][j] = row

    if chunk_num % 1 == 0:
        total_samples = sum(len(v) for v in reservoirs.values())
        print(f"Chunk {chunk_num}: total samples so far = {total_samples}")

# Flatten reservoirs
final_samples = []
for month, rows in sorted(reservoirs.items()):
    for row in rows:
        r = row._asdict()
        r["month"] = month
        final_samples.append(r)

df_samples = pd.DataFrame(final_samples)
df_samples.to_csv(output_file, index=False)
print("Fertig! Samples gespeichert in:", output_file)


Chunk 1: total samples so far = 0
Chunk 2: total samples so far = 0
Chunk 3: total samples so far = 0
Chunk 4: total samples so far = 0
Chunk 5: total samples so far = 0
Chunk 6: total samples so far = 0
Chunk 7: total samples so far = 0
Chunk 8: total samples so far = 0
Chunk 9: total samples so far = 19
Chunk 10: total samples so far = 57
Chunk 11: total samples so far = 114
Chunk 12: total samples so far = 152
Chunk 13: total samples so far = 190
Chunk 14: total samples so far = 209
Chunk 15: total samples so far = 247
Chunk 16: total samples so far = 304
Chunk 17: total samples so far = 342
Chunk 18: total samples so far = 399
Chunk 19: total samples so far = 456
Chunk 20: total samples so far = 532
Chunk 21: total samples so far = 627
Chunk 22: total samples so far = 703
Chunk 23: total samples so far = 798
Chunk 24: total samples so far = 855
Chunk 25: total samples so far = 893
Chunk 26: total samples so far = 912
Chunk 27: total samples so far = 950
Chunk 28: total samples so f

In [8]:
# testing with smaller sample size 
Subreddit = "politics"
output_file = r"C:\Users\TG2023\OneDrive\Desktop\Uni Konstanz\2. Semester\Social Media Data Analysis\Final Project\politics_with_scores.csv"
samples_politics = pd.read_csv(output_file)
# test = samples_politics.sample(n=40, random_state=42) 
# Generate_Incivility_Ratings(test, Subreddit)

In [None]:
Subreddit = "politics"
Generate_Incivility_Ratings(samples_politics, Subreddit)