In [None]:
import json
import os
import re
import tempfile
from typing import Optional

import polars as pl
import requests
import zstandard
from PIL import Image

In [None]:
def download_process_zst(url:str) -> pl.DataFrame:
    """Download a zstd-compressed JSONL file and load it into a DataFrame.

    The archive is streamed into a temporary directory, decompressed there and
    parsed eagerly, so the temporary files are removed before returning.

    Args:
        url: Address of the ``.zst`` archive to fetch.

    Returns:
        The parsed DataFrame. The schema is inferred from up to 10000 lines and
        the file is read with ``ignore_errors=True``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Everything below lives in a temporary directory that is cleaned up on exit
    with tempfile.TemporaryDirectory() as temp_dir:
        zst_path = os.path.join(temp_dir, "data.zst")
        json_path = os.path.join(temp_dir, "data.jsonl")

        # Download in chunks so the whole archive is never held in memory
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly rather than writing an error page to disk and
            # trying to decompress it afterwards
            response.raise_for_status()
            with open(zst_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

        # Decompress
        dctx = zstandard.ZstdDecompressor()
        with open(zst_path, 'rb') as compressed_file:
            with open(json_path, 'wb') as decompressed_file:
                dctx.copy_stream(compressed_file, decompressed_file)

        # File is in JSONL format; it must be read before the directory vanishes
        df = pl.read_ndjson(
            json_path,
            infer_schema_length=10000,
            ignore_errors=True
        )

    return df

# Fetch the comment archive, then list the columns it provides
print("Downloading and processing data...")
df = download_process_zst(
    url="https://the-eye.eu/redarcs/files/truerateme_comments.zst",
)
print(df.columns)

In [None]:
# Display the loaded comments DataFrame
df

# Define extraction strategy
While many different ways to guess the rating are possible, we choose the mean of all floats between 0 and 10 found in the comment text.

Alternative methods:
- the first float between 0 and 10 found in the comment text
- using an LLM or another NLP technique to extract the rating

In [None]:
# Inclusive bounds of the rating scale; numbers outside it are not ratings
RATING_MIN = 0
RATING_MAX = 10

# Unsigned decimal number such as "7" or "6.5"
_NUMBER_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def extract_all_floats(cmt_text:str) -> list:
    """Return every number found in ``cmt_text`` as a float, in order.

    Signs are ignored, so "-5" yields 5.0 and a range such as "6-7" yields
    [6.0, 7.0]. Text without digits gives an empty list.

    NOTE(review): the denominator of "7/10" is returned as well (10.0) and will
    pull a mean towards 10 -- consider skipping numbers that follow a slash.
    """
    return [float(token) for token in _NUMBER_PATTERN.findall(cmt_text)]


def _floats_in_rating_range(cmt_text:str) -> list:
    """Return the numbers in ``cmt_text`` that lie on the rating scale."""
    return [
        value
        for value in extract_all_floats(cmt_text)
        if RATING_MIN <= value <= RATING_MAX
    ]


def first_float_extraction(cmt_text:str) -> Optional[float]:
    """Return the first number within the rating scale, or None if there is none."""
    return next(iter(_floats_in_rating_range(cmt_text)), None)


def mean_of_floats_extraction(cmt_text:str) -> Optional[float]:
    """Return the mean of all numbers within the rating scale, or None if there are none.

    Out-of-scale numbers (ages, years, ...) are dropped before averaging, so
    they can neither skew the result nor push it off the scale.
    """
    candidates = _floats_in_rating_range(cmt_text)
    return sum(candidates) / len(candidates) if candidates else None


extraction_method = mean_of_floats_extraction

# Now, join
We join only top-level comments to their respective posts, matching on the link ID.
This is because we want ratings that respond directly to the post itself rather than to another comment.

## Weighting techniques
We could take a simple average over all comments. Instead, we weight each comment's rating by that comment's score, giving zero weight to comments with a negative score.

In [None]:
# Keep only top-level comments, i.e. those whose parent is the post itself
top_level_comments = df.filter(
    pl.col("parent_id") == pl.col("link_id")
)


def extract_id(x):
    """Remove "t<digit>_" type prefixes from an ID string.

    No longer used by this cell; kept in case other cells still call it.
    """
    return re.sub(r"t\d_", "", x)


top_level_comments = top_level_comments.with_columns(
    # Post ID: link_id without its leading "t<digit>_" prefix. A native string
    # expression replaces the former per-row Python callback.
    pl.col("link_id").str.replace(r"^t\d_", "").alias("thread"),
    # Rating parsed from the comment text; null when no rating could be extracted
    pl.col("body").map_elements(extraction_method, return_dtype=pl.Float32).alias("rating"),
)

# Discard comments without a rating or with one outside the 0-10 scale
rated_comments = top_level_comments.filter(
    pl.col("rating").is_not_null() &
    (pl.col("rating") >= 0) &
    (pl.col("rating") <= 10)
)

In [None]:
# Join comments onto posts and aggregate the ratings per post
reddit_posts = pl.read_parquet("reddit_posts.parquet")

rated_posts = reddit_posts.select("id").join(
    # Only the columns needed for the aggregation take part in the join. The
    # comment score gets its own name so that, if the posts frame also has a
    # "score" column, the weights cannot silently become the post's score.
    rated_comments.select(
        "thread",
        "rating",
        pl.col("score").alias("comment_score"),
    ),
    left_on  = "id",
    right_on = "thread",
    how      = "inner"
).with_columns(
    # Comments with a negative score get zero weight
    zeroed_score = pl.when(pl.col("comment_score") < 0).then(0).otherwise(pl.col("comment_score"))
).group_by('id').agg(
    mean_rating     = pl.col("rating").mean(),
    median_rating   = pl.col("rating").median(),
    # std() is null for a single rating; report that as no spread
    rating_stdev    = pl.col("rating").std().fill_null(0),
    # Score-weighted mean; 0/0 (every comment has zero weight) becomes null
    # instead of NaN
    weighted_rating = (
        (pl.col("rating") * pl.col("zeroed_score")).sum() / pl.col("zeroed_score").sum()
    ).fill_nan(None),
    rating_count    = pl.col("rating").count(),
)

# Join ratings back onto posts, keeping only rated posts that have a thumbnail
reddit_posts = reddit_posts.join(
    rated_posts,
    on  = "id",
    how = "left"
).filter(
    (pl.col("rating_count") > 0) &
    (pl.col("local_thumbnail_path") != "")
)

reddit_posts.write_parquet("reddit_posts_rated.parquet")
reddit_posts

In [None]:
# Now, filter out all imgur image not found pics
# TODO: this should be done before everything in the other notebook

# Signature of the placeholder file: size in bytes and (width, height) in pixels
IMGUR_NOTFOUND_FILE_SIZE = 503
IMGUR_NOTFOUND_DIMENSIONS = (161, 81)


def is_imgur_imagenotfound(path:str) -> bool:
    """Tell whether the file at ``path`` matches the imgur "image not found" placeholder.

    A file matches when both its byte size and its pixel dimensions equal the
    placeholder's signature.

    Args:
        path: Location of the image file on disk.

    Returns:
        True if the file matches the placeholder signature, False otherwise.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    # Cheap check first: files of any other size are never opened or decoded
    if os.path.getsize(path) != IMGUR_NOTFOUND_FILE_SIZE:
        return False

    with Image.open(path) as img:
        return img.size == IMGUR_NOTFOUND_DIMENSIONS

In [None]:
# Spot-check the detector on a single local thumbnail file
is_imgur_imagenotfound("thumbnails/6uohb4.jpg")

In [None]:
# Keep only posts whose thumbnail is not the imgur "image not found" placeholder
reddit_posts = reddit_posts.filter(
    ~(
        pl.col("local_thumbnail_path")
        .map_elements(is_imgur_imagenotfound, pl.Boolean)
    )
)

# Persist the cleaned result, then show it as the cell output
reddit_posts.write_parquet("reddit_posts_rated.parquet")
reddit_posts