In [8]:
import polars as pl
from utils import augmented_comments, build_all_prompts, run_inference_sglang

# Load the frame produced by the project helper `augmented_comments`; the
# following cell filters it on the `type` column to keep only comments.
df = augmented_comments()

In [6]:
# Draw a reproducible (seeded) random sample of 100,000 rows whose `type`
# is "comment" from the full frame.
comments_df = df.filter(pl.col("type") == "comment").sample(n=100000, seed=42)

# Build one prompt per sampled comment id and attach it as the `prompt` column.
comments_df = comments_df.with_columns(
    pl.Series("prompt", build_all_prompts(comments_df["id"])),
)

# Display the sampled frame (which carries the new `prompt` column) rather
# than the unsampled source frame `df`, which has no prompts.
comments_df

Building prompts: 100%|██████████| 100000/100000 [00:58<00:00, 1698.95it/s]


id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead,prompt
i64,str,str,datetime[μs],str,str,str,i64,i64,i64,i64,list[i64],bool,bool,str
7209407,"""comment""","""ricardobeat""",2014-02-10 06:00:25,,"""The article presents some (non…",,,7207821,7207506,,"[7210958, 7213827]",,,"""<instructions>Your goal is to …"
13520503,"""comment""","""throwawayish""",2017-01-30 12:11:14,,"""Reaaally not that simple.""",,,13519742,13517389,,[13528758],,,"""<instructions>Your goal is to …"
13152625,"""comment""","""spikels""",2016-12-11 19:19:47,,"""That&#x27;s an exaggeration. A…",,,13151772,13147495,,[13165894],,,"""<instructions>Your goal is to …"
30957389,"""comment""","""ss108""",2022-04-08 14:38:05,,"""Ah, you&#x27;re probably right…",,,30956591,30955290,,,,,"""<instructions>Your goal is to …"
23335760,"""comment""","""en3r0""",2020-05-28 10:41:15,,"""I have been using Trilium late…",,,23335759,23335759,,[23335860],,,"""<instructions>Your goal is to …"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
8110666,"""comment""","""erickookoo""",2014-07-30 20:17:48,,"""Part of the reason we built th…",,,8109374,8109114,,,,,"""<instructions>Your goal is to …"
8039481,"""comment""","""josteink""",2014-07-15 22:22:52,,"""If you&#x27;re worried Google …",,,8039322,8038990,,,,,"""<instructions>Your goal is to …"
37965302,"""comment""","""williamdclt""",2023-10-21 09:21:21,,"""Do you have an example?""",,,37965257,37962370,,"[37965954, 37965936]",,,"""<instructions>Your goal is to …"
10112234,"""comment""","""afshin""",2015-08-24 18:58:41,,"""Just a guess, but it might be …",,,10111922,10108472,,[10112273],,,"""<instructions>Your goal is to …"


In [7]:
# Run the project inference helper over every prompt; the result is stored
# below as the `reward` column, one value per row.
rewards = run_inference_sglang(comments_df["prompt"])

# Attach the rewards and persist the frame so that it can be reloaded from
# disk instead of re-running inference.
comments_df = comments_df.with_columns(
    pl.Series(name="reward", values=rewards),
)
comments_df.write_parquet("./data/random_comments_with_reward.parquet")

comments_df

Running inference: 100%|██████████| 1000/1000 [23:29<00:00,  1.41s/it]


id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead,prompt,reward
i64,str,str,datetime[μs],str,str,str,i64,i64,i64,i64,list[i64],bool,bool,str,f64
7209407,"""comment""","""ricardobeat""",2014-02-10 06:00:25,,"""The article presents some (non…",,,7207821,7207506,,"[7210958, 7213827]",,,"""<instructions>Your goal is to …",-4.71875
13520503,"""comment""","""throwawayish""",2017-01-30 12:11:14,,"""Reaaally not that simple.""",,,13519742,13517389,,[13528758],,,"""<instructions>Your goal is to …",-15.125
13152625,"""comment""","""spikels""",2016-12-11 19:19:47,,"""That&#x27;s an exaggeration. A…",,,13151772,13147495,,[13165894],,,"""<instructions>Your goal is to …",5.4375
30957389,"""comment""","""ss108""",2022-04-08 14:38:05,,"""Ah, you&#x27;re probably right…",,,30956591,30955290,,,,,"""<instructions>Your goal is to …",-21.0
23335760,"""comment""","""en3r0""",2020-05-28 10:41:15,,"""I have been using Trilium late…",,,23335759,23335759,,[23335860],,,"""<instructions>Your goal is to …",-20.125
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
8110666,"""comment""","""erickookoo""",2014-07-30 20:17:48,,"""Part of the reason we built th…",,,8109374,8109114,,,,,"""<instructions>Your goal is to …",0.484375
8039481,"""comment""","""josteink""",2014-07-15 22:22:52,,"""If you&#x27;re worried Google …",,,8039322,8038990,,,,,"""<instructions>Your goal is to …",-12.125
37965302,"""comment""","""williamdclt""",2023-10-21 09:21:21,,"""Do you have an example?""",,,37965257,37962370,,"[37965954, 37965936]",,,"""<instructions>Your goal is to …",-22.25
10112234,"""comment""","""afshin""",2015-08-24 18:58:41,,"""Just a guess, but it might be …",,,10111922,10108472,,[10112273],,,"""<instructions>Your goal is to …",-7.6875


In [6]:
import polars as pl

# Reload the scored comments from the parquet file written by the inference
# cell. NOTE(review): presumably this lets the export cells below run on a
# fresh kernel without repeating inference — confirm.
comments_df = pl.read_parquet("./data/random_comments_with_reward.parquet")

In [7]:
import html
import re
from utils import with_story_info


def unescape_html(text):
    """Convert a comment's HTML markup into plain text.

    ``<p>`` tags become blank-line paragraph breaks, ``<a href="...">``
    anchors are replaced by their target URL, and HTML entities are decoded.

    Markup is rewritten *before* entities are decoded. Decoding first would
    turn literal text such as ``&lt;p&gt;`` into ``<p>`` and then wrongly
    treat it as a paragraph break; an escaped ``"`` or ``<`` inside a link
    could likewise stop the anchor pattern from matching.

    Args:
        text: The raw, entity-escaped HTML string.

    Returns:
        The plain-text rendering of ``text``.
    """
    # Step 1: structural tags, while user-typed "<" / ">" are still escaped.
    with_breaks = text.replace("<p>", "\n\n")
    links_inlined = re.sub(
        r'<a href="([^"]+)"[^>]*>[^<]+</a>', r"\1", with_breaks
    )
    # Step 2: decode entities in the remaining text and in the inlined URLs.
    return html.unescape(links_inlined)


# Pass the frame through the project helper `with_story_info`.
# NOTE(review): presumably this adds story-level columns — confirm in utils.
comments_df = with_story_info(comments_df)

# Derived columns: a link built from the item id, a human-readable date,
# and the comment text converted from HTML to plain text (replaces `text`).
item_link = pl.concat_str(
    pl.lit("https://news.ycombinator.com/item?id="),
    pl.col("id"),
).alias("link")
posted_on = pl.col("time").dt.strftime("%B %d, %Y").alias("date")
plain_text = pl.col("text").map_elements(unescape_html, return_dtype=pl.String)

comments_df = comments_df.with_columns(item_link, posted_on, plain_text)

# Only these columns are exported.
export_view = comments_df.select(pl.col("date", "by", "link", "text", "reward"))

# Write the 100 highest-reward and the 100 lowest-reward comments to CSV.
for descending, csv_path in (
    (True, "./data/top_random_comments_with_links.csv"),
    (False, "./data/bottom_random_comments_with_links.csv"),
):
    export_view.sort("reward", descending=descending).head(100).write_csv(csv_path)
