In [1]:
import os
import zstandard
import polars as pl
import requests
import json
import tempfile
import re

In [2]:
def download_process_zst(url:str) -> pl.DataFrame:    
    # Create a temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        # Download
        response = requests.get(url)
        zst_path = os.path.join(temp_dir, "data.zst")
        with open(zst_path, 'wb') as f:
            f.write(response.content)

        # Decompress
        json_path = os.path.join(temp_dir, "data.jsonl")
        with open(zst_path, 'rb') as compressed_file:
            with open(json_path, 'wb') as decompressed_file:
                dctx = zstandard.ZstdDecompressor()
                dctx.copy_stream(compressed_file, decompressed_file)

        # File is in JSONL format
        df = pl.read_ndjson(
            json_path,
            infer_schema_length=10000,
            ignore_errors=True
        )
        
    return df    

print("Downloading and processing data...")
df = download_process_zst("https://the-eye.eu/redarcs/files/truerateme_comments.zst")
print(df.columns)

Downloading and processing data...
['author', 'author_cakeday', 'author_flair_css_class', 'author_flair_text', 'body', 'can_gild', 'collapsed', 'collapsed_reason', 'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded', 'id', 'is_submitter', 'link_id', 'parent_id', 'retrieved_on', 'score', 'stickied', 'subreddit', 'subreddit_id']


In [3]:
df

author,author_cakeday,author_flair_css_class,author_flair_text,body,can_gild,collapsed,collapsed_reason,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,retrieved_on,score,stickied,subreddit,subreddit_id
str,bool,null,str,str,bool,bool,str,i64,i64,str,i64,i64,str,bool,str,str,i64,i64,bool,str,str
"""kalkush""",,,"""moderator""","""That's fine 👌🏽""",true,false,,0,1502931307,,0,0,"""dlqb6cr""",false,"""t3_6twkns""","""t3_6twkns""",1504127569,3,false,"""truerateme""","""t5_3noa4"""
"""RodgerDodger509""",,,,"""I know I was givin you a bit o…",true,false,,0,1502961232,,0,0,"""dlqsm1o""",false,"""t3_6twkns""","""t3_6twkns""",1504136099,3,false,"""truerateme""","""t5_3noa4"""
"""Hereforthekek""",,,"""moderator""","""I don't remember you giving me…",true,false,,0,1502961408,,0,0,"""dlqsodo""",true,"""t3_6twkns""","""t1_dlqsm1o""",1504136131,2,false,"""truerateme""","""t5_3noa4"""
"""kalkush""",,,"""moderator""","""Happy to make /u/RodgerDodger5…",true,false,,0,1502967551,,0,0,"""dlqv2p8""",false,"""t3_6twkns""","""t1_dlqsodo""",1504137300,2,false,"""truerateme""","""t5_3noa4"""
"""throwitawaythekey""",,,,"""[Here's another pic.](https://…",true,false,,0,1502970080,,0,0,"""dlqwa3t""",true,"""t3_6u9dkw""","""t3_6u9dkw""",1504137892,22,false,"""truerateme""","""t5_3noa4"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""jerry00193""",,,"""𝕄𝕠𝕕𝕖𝕣𝕒𝕥𝕠𝕣""","""Warning for overrating. Rule 1…",true,false,,0,1672530955,"""moderator""",0,0,"""j2fx4cp""",false,"""t3_zziubr""","""t1_j2efuki""",1673021674,3,false,"""truerateme""","""t5_3noa4"""
"""Bazookajojo69""",,,"""Newbie""","""Ikr he may be wasian""",true,false,,0,1672530988,,0,0,"""j2fx709""",false,"""t3_1004zim""","""t1_j2fwljs""",1673021671,2,false,"""truerateme""","""t5_3noa4"""
"""hufflepuffylol""",,,"""Trusted Rater""","""Compared to other 18 year olds…",true,false,,0,1672531083,,1672531814,0,"""j2fxekp""",false,"""t3_zwo9zq""","""t1_j2fvime""",1673021666,1,false,"""truerateme""","""t5_3noa4"""
"""Bazookajojo69""",,,"""Newbie""","""6 you look like a cool soldier""",true,false,,0,1672531101,,0,0,"""j2fxg1c""",false,"""t3_1002urw""","""t3_1002urw""",1673021664,2,false,"""truerateme""","""t5_3noa4"""


# Define extraction strategy
while many different ways to guess the rating are possible, we choose the mean of all floats ranging from 0 to 10 within the comment text

Alternate methods:
- first float from 0 to 10 within the comment text
- using an LLM or other NLP technique to extract the rating

In [11]:
def extract_all_floats(cmt_text:str) -> list:
    return [float(match[0]) for match in re.findall(r"[+-]?(\d+(\.\d+)?)", cmt_text)]
    
def first_float_extraction(cmt_text:str) -> float:
    return next(filter(lambda x: 0 <= x <= 10, extract_all_floats(cmt_text)), None)

def mean_of_floats_extraction(cmt_text:str):
    floats = extract_all_floats(cmt_text)
    return sum(floats) / len(floats) if floats else None

extraction_method = mean_of_floats_extraction

# Now, join
we choose to join only top-level comments to their respective posts (do this by link id).
this is because we want to 

## weighing techniques
we could do a simple average of all comments. However, instead, we choose to weight the average by the total rating of all comments, ignoring those with negative ratings.

In [14]:
top_level_comments = df.filter(
    pl.col("parent_id") == pl.col("link_id")
)

extract_id = lambda x: re.sub(r"t\d_", "", x)

top_level_comments = top_level_comments.with_columns(
    pl.col("link_id").map_elements(extract_id, pl.String).alias("thread"),             # Extract post ID
    pl.col("body").map_elements(extraction_method, pl.Float32).alias("rating")         # Extract rating
)

rated_comments = top_level_comments.filter(
    pl.col("rating").is_not_null() & 
    (pl.col("rating") >= 0) & 
    (pl.col("rating") <= 10)
)

In [25]:
# Join comments onto posts
reddit_posts = pl.read_parquet("reddit_posts.parquet")

rated_posts = reddit_posts.join(
    rated_comments,
    left_on  = "id",
    right_on = "thread",
    how      = "inner"
).with_columns(
    zeroed_score = pl.when(pl.col("score") < 0).then(0).otherwise(pl.col("score"))
).group_by('id').agg(
    mean_rating     = pl.col("rating").mean().alias("mean_rating"),
    median_rating   = pl.col("rating").median().alias("median_rating"),
    rating_stdev    = pl.col("rating").std().fill_null(0).alias("rating_stdev"),
    weighted_rating = (pl.col("rating") * pl.col("zeroed_score")).sum() / pl.col("zeroed_score").sum(),
    rating_count    = pl.col("rating").count().alias("rating_count"),
)

# Join ratings back onto posts
reddit_posts = reddit_posts.join(
    rated_posts,
    left_on  = "id",
    right_on = "id",
    how      = "left"
).filter(
    (pl.col("rating_count") > 0) &
    (pl.col("local_thumbnail_path") != "")
)

reddit_posts.write_parquet("reddit_posts_rated.parquet")
reddit_posts

id,author,created_utc,subreddit,title,selftext,media_embed,media,url,local_thumbnail_path,mean_rating,median_rating,rating_stdev,weighted_rating,rating_count
str,str,i64,str,str,str,struct[5],struct[3],str,str,f32,f32,f32,f64,u32
"""6um2bk""","""Josh_norman05""",1503103734,"""truerateme""","""[21M] Rateme, I'm 6'1"" and 200…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2Fv2xU7%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2Fv2xU7&amp;image=http%3A%2F%2Fi.imgur.com%2FpCLxBv8.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2Fv2xU7%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2Fv2xU7&amp;image=http%3A%2F%2Fi.imgur.com%2FpCLxBv8.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/pCLxBv8.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""http://imgur.com/a/v2xU7""","""thumbnails/6um2bk.jpg""",6.777778,7.0,1.141849,6.838235,9
"""6umpik""","""PettingXu""",1503111646,"""truerateme""","""M23 tear me a new one guys""","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F2loQ6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F2loQ6&amp;image=http%3A%2F%2Fi.imgur.com%2F7HP3ftQ.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F2loQ6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F2loQ6&amp;image=http%3A%2F%2Fi.imgur.com%2F7HP3ftQ.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/7HP3ftQ.jpg?fb"",600,""Reddit Rateme"",""rich"",null,""1.0"",550,null,null},""m.imgur.com"",null}","""https://m.imgur.com/a/2loQ6""","""thumbnails/6umpik.jpg""",7.104167,7.125,0.532197,7.235632,8
"""6un283""","""Mr-Rushifa""",1503116342,"""truerateme""","""[22M] 6'6"" Was told to post he…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FD0rJP%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FD0rJP&amp;image=http%3A%2F%2Fi.imgur.com%2FHkIQCm5.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FD0rJP%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FD0rJP&amp;image=http%3A%2F%2Fi.imgur.com%2FHkIQCm5.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/HkIQCm5.jpg?fb"",600,""Gallery"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""https://imgur.com/a/D0rJP""","""thumbnails/6un283.jpg""",8.814661,9.0,0.68085,8.543662,22
"""6un2v8""","""mindmonkey00""",1503116572,"""truerateme""","""[M 23] Rate my ginger ass and …","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FRa0M6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FRa0M6&amp;image=http%3A%2F%2Fi.imgur.com%2FESsgdIW.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FRa0M6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FRa0M6&amp;image=http%3A%2F%2Fi.imgur.com%2FESsgdIW.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/ESsgdIW.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""https://imgur.com/a/Ra0M6""","""thumbnails/6un2v8.jpg""",7.358334,8.0,2.37952,7.166667,10
"""6upkv5""","""Mest666""",1503155600,"""truerateme""","""M25, seems like place for brav…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F60OLb%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F60OLb&amp;image=http%3A%2F%2Fi.imgur.com%2FoYkY1Nb.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F60OLb%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F60OLb&amp;image=http%3A%2F%2Fi.imgur.com%2FoYkY1Nb.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/oYkY1Nb.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""https://imgur.com/a/60OLb""","""thumbnails/6upkv5.jpg""",5.95,6.625,1.690661,5.784722,10
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""yhuzth""","""ElsaOutlaw""",1667175103,"""truerateme""","""F33""","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm&amp;image=https%3A%2F%2Fi.imgur.com%2FEsjpvnq.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""490"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",490,null,false,600}","{{null,""Post with 0 views."",490,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm&amp;image=https%3A%2F%2Fi.imgur.com%2FEsjpvnq.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""490"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/Esjpvnq.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/S3trDWm"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/S3trDWm""","""thumbnails/yhuzth.jpg""",5.416667,5.0,1.076904,5.416667,5
"""yski07""","""JustADumbThrowAway2""",1668195140,"""truerateme""","""26M Going on a date for the fi…","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2&amp;image=https%3A%2F%2Fi.imgur.com%2F49cxFH7.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""1122"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",1122,null,false,600}","{{null,""Post with 10 votes and 10 views. Shared by LovesDick35. ."",1122,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2&amp;image=https%3A%2F%2Fi.imgur.com%2F49cxFH7.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""1122"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/49cxFH7.jpg?fb"",600,""."",""rich"",""https://imgur.com/a/8zViwt2"",""1.0"",600,"""",null},""imgur.com"",null}","""https://imgur.com/gallery/8zVi…","""thumbnails/yski07.jpg""",4.2,4.2,0.0,4.2,1
"""z42v1x""","""oirawsh""",1669347539,"""truerateme""","""M20 Rating and suggestions ple…","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FhtWNojp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FhtWNojp&amp;image=https%3A%2F%2Fi.imgur.com%2FsKh0tbr.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""840"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",840,null,false,600}","{{null,""Post with 1 views."",840,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FhtWNojp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FhtWNojp&amp;image=https%3A%2F%2Fi.imgur.com%2FsKh0tbr.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""840"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/sKh0tbr.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/htWNojp"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/htWNojp/""","""thumbnails/z42v1x.jpg""",5.5,5.5,0.707107,5.0,2
"""z4czi2""","""hudeyu""",1669381675,"""truerateme""","""[F 31] Be as honest as possibl…","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp&amp;image=https%3A%2F%2Fi.imgur.com%2F1WvOvVT.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""640"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",640,null,false,600}","{{null,""Post with 43 views."",640,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp&amp;image=https%3A%2F%2Fi.imgur.com%2F1WvOvVT.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""640"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/1WvOvVT.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/COPvEQp"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/COPvEQp""","""thumbnails/z4czi2.jpg""",4.840625,4.9,0.491706,4.965625,8


In [28]:
# Now, filter out all imgur image not found pics
# TODO: this should be done before everything in the other notebook
import os
from PIL import Image
def is_imgur_imagenotfound(path:str):
    file_size = os.path.getsize(path)
    
    with Image.open(path) as img:
        width, height = img.size
    
    return file_size == 503 and width == 161 and height == 81

In [29]:
is_imgur_imagenotfound("thumbnails/6uohb4.jpg")

True

In [30]:
# Filter out imgur image not found pics
reddit_posts = reddit_posts.filter(
    (pl.col("local_thumbnail_path").map_elements(is_imgur_imagenotfound, pl.Boolean) == False)
)

reddit_posts.write_parquet("reddit_posts_rated.parquet")
reddit_posts

id,author,created_utc,subreddit,title,selftext,media_embed,media,url,local_thumbnail_path,mean_rating,median_rating,rating_stdev,weighted_rating,rating_count
str,str,i64,str,str,str,struct[5],struct[3],str,str,f32,f32,f32,f64,u32
"""6um2bk""","""Josh_norman05""",1503103734,"""truerateme""","""[21M] Rateme, I'm 6'1"" and 200…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2Fv2xU7%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2Fv2xU7&amp;image=http%3A%2F%2Fi.imgur.com%2FpCLxBv8.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2Fv2xU7%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2Fv2xU7&amp;image=http%3A%2F%2Fi.imgur.com%2FpCLxBv8.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/pCLxBv8.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""http://imgur.com/a/v2xU7""","""thumbnails/6um2bk.jpg""",6.777778,7.0,1.141849,6.838235,9
"""6umpik""","""PettingXu""",1503111646,"""truerateme""","""M23 tear me a new one guys""","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F2loQ6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F2loQ6&amp;image=http%3A%2F%2Fi.imgur.com%2F7HP3ftQ.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2F2loQ6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F2loQ6&amp;image=http%3A%2F%2Fi.imgur.com%2F7HP3ftQ.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/7HP3ftQ.jpg?fb"",600,""Reddit Rateme"",""rich"",null,""1.0"",550,null,null},""m.imgur.com"",null}","""https://m.imgur.com/a/2loQ6""","""thumbnails/6umpik.jpg""",7.104167,7.125,0.532197,7.235632,8
"""6un283""","""Mr-Rushifa""",1503116342,"""truerateme""","""[22M] 6'6"" Was told to post he…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FD0rJP%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FD0rJP&amp;image=http%3A%2F%2Fi.imgur.com%2FHkIQCm5.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FD0rJP%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FD0rJP&amp;image=http%3A%2F%2Fi.imgur.com%2FHkIQCm5.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/HkIQCm5.jpg?fb"",600,""Gallery"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""https://imgur.com/a/D0rJP""","""thumbnails/6un283.jpg""",8.814661,9.0,0.68085,8.543662,22
"""6un2v8""","""mindmonkey00""",1503116572,"""truerateme""","""[M 23] Rate my ginger ass and …","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FRa0M6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FRa0M6&amp;image=http%3A%2F%2Fi.imgur.com%2FESsgdIW.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FRa0M6%2Fembed&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FRa0M6&amp;image=http%3A%2F%2Fi.imgur.com%2FESsgdIW.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/ESsgdIW.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""https://imgur.com/a/Ra0M6""","""thumbnails/6un2v8.jpg""",7.358334,8.0,2.37952,7.166667,10
"""6uprsr""","""Zyros_""",1503157729,"""truerateme""","""[29M] How is it right now? Did…","""""","{""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FHCOJO%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2FHCOJO&amp;image=http%3A%2F%2Fi.imgur.com%2Ffq9ABPF.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",550,null,false,550}","{{null,""Imgur: The most awesome images on the Internet."",550,""&lt;iframe class=""embedly-embed"" src=""//cdn.embedly.com/widgets/media.html?src=%2F%2Fimgur.com%2Fa%2FHCOJO%2Fembed&amp;url=http%3A%2F%2Fimgur.com%2Fa%2FHCOJO&amp;image=http%3A%2F%2Fi.imgur.com%2Ffq9ABPF.jpg%3Ffb&amp;key=522baf40bd3911e08d854040d3dc5c07&amp;type=text%2Fhtml&amp;schema=imgur"" width=""550"" height=""550"" scrolling=""no"" frameborder=""0"" allowfullscreen&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""http://i.imgur.com/fq9ABPF.jpg?fb"",600,""Imgur: The most awesome images on the Internet"",""rich"",null,""1.0"",550,null,null},""imgur.com"",null}","""http://imgur.com/a/HCOJO""","""thumbnails/6uprsr.jpg""",7.639057,8.0,0.98668,7.995351,19
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ybb4pn""","""Billnben__5""",1666509829,"""truerateme""","""22M First 2 photos with phone …","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FcMuctUz%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FcMuctUz&amp;image=https%3A%2F%2Fi.imgur.com%2FEw3uZ77.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""377"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",377,null,false,600}","{{null,""Post with 0 views."",377,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FcMuctUz%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FcMuctUz&amp;image=https%3A%2F%2Fi.imgur.com%2FEw3uZ77.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""377"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/Ew3uZ77.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/cMuctUz"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/cMuctUz""","""thumbnails/ybb4pn.jpg""",6.1,6.1,0.0,6.1,1
"""yhuzth""","""ElsaOutlaw""",1667175103,"""truerateme""","""F33""","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm&amp;image=https%3A%2F%2Fi.imgur.com%2FEsjpvnq.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""490"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",490,null,false,600}","{{null,""Post with 0 views."",490,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FS3trDWm&amp;image=https%3A%2F%2Fi.imgur.com%2FEsjpvnq.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""490"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/Esjpvnq.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/S3trDWm"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/S3trDWm""","""thumbnails/yhuzth.jpg""",5.416667,5.0,1.076904,5.416667,5
"""yski07""","""JustADumbThrowAway2""",1668195140,"""truerateme""","""26M Going on a date for the fi…","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2&amp;image=https%3A%2F%2Fi.imgur.com%2F49cxFH7.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""1122"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",1122,null,false,600}","{{null,""Post with 10 votes and 10 views. Shared by LovesDick35. ."",1122,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2F8zViwt2&amp;image=https%3A%2F%2Fi.imgur.com%2F49cxFH7.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""1122"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/49cxFH7.jpg?fb"",600,""."",""rich"",""https://imgur.com/a/8zViwt2"",""1.0"",600,"""",null},""imgur.com"",null}","""https://imgur.com/gallery/8zVi…","""thumbnails/yski07.jpg""",4.2,4.2,0.0,4.2,1
"""z4czi2""","""hudeyu""",1669381675,"""truerateme""","""[F 31] Be as honest as possibl…","""""","{""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp&amp;image=https%3A%2F%2Fi.imgur.com%2F1WvOvVT.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""640"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",640,null,false,600}","{{null,""Post with 43 views."",640,""&lt;iframe class=""embedly-embed"" src=""https://cdn.embedly.com/widgets/media.html?src=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp%2Fembed%3Fpub%3Dtrue%26ref%3Dhttps%253A%252F%252Fembed.ly%26w%3D900&amp;display_name=Imgur&amp;url=https%3A%2F%2Fimgur.com%2Fa%2FCOPvEQp&amp;image=https%3A%2F%2Fi.imgur.com%2F1WvOvVT.jpg%3Ffb&amp;key=2aa3c4d5f3de4f5b9120b660ad850dc9&amp;type=text%2Fhtml&amp;schema=imgur"" width=""600"" height=""640"" scrolling=""no"" title=""Imgur embed"" frameborder=""0"" allow=""autoplay; fullscreen"" allowfullscreen=""true""&gt;&lt;/iframe&gt;"",""Imgur"",""http://imgur.com"",315,""https://i.imgur.com/1WvOvVT.jpg?fb"",600,""Imgur"",""rich"",""https://imgur.com/a/COPvEQp"",""1.0"",600,null,null},""imgur.com"",null}","""https://imgur.com/a/COPvEQp""","""thumbnails/z4czi2.jpg""",4.840625,4.9,0.491706,4.965625,8
