In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "trl-lib/tldr" dataset; the cells below iterate over it by split
# name (`data_set.keys()`), e.g. "train".
data_set = load_dataset("trl-lib/tldr")

In [3]:
# Regexes that pull the structured fields out of the raw prompt text, keyed by
# the column each one fills. `(?s)` makes `.` match newlines as well, so a
# single greedy capture group replaces the nested `((.|\n)*)` construct: that
# avoids the per-character alternation (slow on long posts) and makes
# `str.extract(..., expand=False)` return a Series directly.
FIELD_PATTERNS = {
    "prompt_post": r'(?s)POST: (.*)\nTL;DR:',
    "prompt_title": r'(?s)TITLE: (.*)\n\nPOST:',
    "subreddit": r'(?s)SUBREDDIT: (.*)\n\nTITLE:',
}

# One DataFrame per dataset split, each extended with the extracted columns.
# Rows whose prompt does not match a pattern get NaN in that column.
ds = {}
for set_type in data_set.keys():
    split_df = pd.DataFrame(data_set[set_type])
    for column, pattern in FIELD_PATTERNS.items():
        split_df[column] = split_df["prompt"].str.extract(pattern, expand=False)
    ds[set_type] = split_df

In [4]:
import re
# Work on the training split (this is the same object as ds["train"], so the
# column updates below are visible through both names).
train_set = ds["train"]

# Strip any "TL;DR:" marker (with surrounding whitespace) left in the post body.
post_text = train_set["prompt_post"]
train_set["prompt_post"] = post_text.str.replace(r'\s*TL;DR:\s*', '', regex=True)

# Normalise the reference summaries: drop spaces/tabs at the very start of the
# text and directly after every newline.
_leading_blank = re.compile(r'^[ \t]+')
_indent_after_newline = re.compile(r'\n[ \t]+')


def _strip_indentation(text):
    """Remove leading spaces/tabs from the text start and after each newline."""
    return _indent_after_newline.sub('\n', _leading_blank.sub('', text))


train_set["completion"] = list(map(_strip_indentation, train_set["completion"]))

print(train_set.columns)
print(train_set["prompt_post"].iloc[1234])

Index(['prompt', 'completion', 'prompt_post', 'prompt_title', 'subreddit'], dtype='object')
My 25 year old son has a job which means he does 12 hour nightshifts 3-4 times a week, including weekends. After his nightshifts, he comes to my house an sleeps in the spare bedroom. He sleeps in my house because he says at his own house, it's too noisy. 

In his own house, he has his partner, who is mostly home during weekdays as she only works weekends. They then have a one year old, then his partner has a 14 year old girl who - if it's the weekends or after school, always has at least 2 friends with her. It's the group of girls who are the noisiest. So the house is very noisy. 

I don't have a problem with my son sleeping in my spare room, but I think as his mother I should try and encourage him to have a better arrangement set up in his own home so he can sleep there, for his sake. For example, if the 14 year old does have friends over, she could only have them downstairs, and they'd not be 

In [5]:
import pandas as pd

# Separate AskReddit from other subreddits
askreddit_df = train_set[train_set['subreddit'] == 'r/AskReddit']
other_df = train_set[train_set['subreddit'] != 'r/AskReddit']

# Downsample other subreddits to max 2000 each.
# The groups are sampled explicitly instead of via `groupby(...).apply(...)`:
# apply() passing the grouping column to the callback is deprecated in pandas
# and emitted a warning here. Iterating the groupby visits the groups in the
# same (sorted key) order and samples each with the same seed, so the same
# rows are selected.
MAX_PER_SUBREDDIT = 2000
SAMPLE_SEED = 42
sampled_groups = [
    group.sample(n=min(len(group), MAX_PER_SUBREDDIT), random_state=SAMPLE_SEED)
    for _, group in other_df.groupby('subreddit')
]
# pd.concat() rejects an empty list, so fall back to an empty frame that still
# has the right columns when there is nothing besides AskReddit.
downsampled_other = pd.concat(sampled_groups) if sampled_groups else other_df.iloc[0:0]

# Combine back together, then shuffle so the subreddits are interleaved
train_set = pd.concat([askreddit_df, downsampled_other], ignore_index=True)
train_set = train_set.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)
# print(train_set['subreddit'].value_counts())

  downsampled_other = other_df.groupby('subreddit', group_keys=False).apply(


In [6]:
# Drop rows whose extracted post body is an exact duplicate, reporting the row
# count before and after.
print(len(train_set))
# Reset the index after dropping rows: the TF-IDF matrix and the cluster ids
# built from it below refer to rows by position, so keeping the index
# contiguous makes label-based and positional lookups agree.
train_set = train_set.drop_duplicates(["prompt_post"]).reset_index(drop=True)
print(len(train_set))

41773
41601


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Vectorise the post bodies with TF-IDF using the vectoriser's default
# settings; row i of `tfidf_mat` corresponds to row position i of `train_set`.
tfidf = TfidfVectorizer()
# NOTE(review): TruncatedSVD is not used in any of the cells shown here —
# confirm whether this import is still needed.
from sklearn.decomposition import TruncatedSVD
tfidf_mat = tfidf.fit_transform(train_set["prompt_post"])
# (number of documents, vocabulary size)
print(tfidf_mat.shape)

(41601, 62384)


In [9]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def knn_greedy_clustering(tfidf_matrix, min_size=2, max_size=4,
                         threshold=0.7, n_neighbors=20, max_similarity=0.95):
    """
    Greedily group the rows of a matrix into small clusters of similar rows.

    A cosine k-NN query is run once for every row. Rows are then visited in
    descending order of the similarity found in the second neighbour column
    (the first column is assumed to be the row itself). Each unassigned row
    seeds a cluster and pulls in its still-unassigned neighbours whose
    similarity lies in ``[threshold, max_similarity)``, until the cluster
    holds ``max_size`` rows. Clusters with fewer than ``min_size`` members are
    discarded and their seed stays available for later clusters.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix with a ``.shape`` attribute
        One row per sample; passed unchanged to ``NearestNeighbors``.
    min_size : int
        Smallest cluster that is kept.
    max_size : int
        Stop adding neighbours once a cluster reaches this many members.
    threshold : float
        Minimum cosine similarity (inclusive) for a neighbour to join.
    n_neighbors : int
        Number of neighbours, excluding the row itself, to consider per row.
    max_similarity : float
        Upper similarity bound (exclusive); neighbours at or above it are
        skipped. Defaults to 0.95, the value that used to be hard-coded.

    Returns
    -------
    clusters : list[list[int]]
        Row positions of each kept cluster, seed first.
    assigned : numpy.ndarray of bool
        ``assigned[i]`` is True when row ``i`` belongs to a returned cluster.
    """
    n = tfidf_matrix.shape[0]
    assigned = np.zeros(n, dtype=bool)
    clusters = []

    # +1 because every row's neighbour list also contains the row itself.
    k = min(n_neighbors + 1, n)
    if k < 2:
        # Fewer than two neighbour columns means there is nothing to pair a
        # row with (empty/single-row input or n_neighbors < 1); the column-1
        # lookup below would raise IndexError, so return "no clusters".
        return clusters, assigned

    # Fit k-NN model (cosine distance) and query all rows at once
    nbrs = NearestNeighbors(n_neighbors=k,
                            metric='cosine',
                            algorithm='auto')
    nbrs.fit(tfidf_matrix)
    distances, indices = nbrs.kneighbors(tfidf_matrix)
    similarities = 1 - distances  # Convert distance to similarity

    # Process rows with the strongest nearest match first (column 0 is
    # assumed to be the row itself, so column 1 is its best other match).
    order = np.argsort(similarities[:, 1])[::-1]

    for idx in order:
        if assigned[idx]:
            continue

        cluster = [int(idx)]  # plain Python ints keep the result JSON-friendly

        # Column 0 is skipped as "self"; the explicit self-check below still
        # guards against the row showing up in a later column.
        for neighbor_idx, sim in zip(indices[idx][1:], similarities[idx][1:]):
            neighbor_idx = int(neighbor_idx)

            if neighbor_idx == idx or assigned[neighbor_idx]:
                continue
            if threshold <= sim < max_similarity:
                cluster.append(neighbor_idx)
                if len(cluster) >= max_size:
                    break

        # Only keep clusters that reach the minimum size; otherwise the seed
        # stays unassigned and may still join a cluster seeded later.
        if len(cluster) >= min_size:
            clusters.append(cluster)
            assigned[cluster] = True

    return clusters, assigned

# Cluster the posts. The threshold used here (0.3) is looser than the
# function's default, and 50 neighbours are considered per post.
clustering_params = dict(min_size=2, max_size=4, threshold=0.3, n_neighbors=50)
clusters, assigned = knn_greedy_clustering(tfidf_mat, **clustering_params)

# Sanity report: there should be no singleton clusters; also show how many
# posts ended up inside a cluster versus left over.
has_singletons = any(len(members) == 1 for members in clusters)
n_clustered = assigned.sum()
n_unassigned = (~assigned).sum()

print(f"Any size-1 clusters? {has_singletons}")
print(f"Clustered: {n_clustered} / {len(assigned)}")
print(f"Unassigned: {n_unassigned}")

Any size-1 clusters? False
Clustered: 20487 / 41601
Unassigned: 21114


In [10]:
import numpy as np

# Tally how many clusters exist for each cluster size.
cluster_sizes = list(map(len, clusters))
unique, counts = np.unique(cluster_sizes, return_counts=True)

print("Cluster size distribution:")
for position in range(len(unique)):
    print(f"  Size {unique[position]}: {counts[position]} clusters")

Cluster size distribution:
  Size 2: 3121 clusters
  Size 3: 1047 clusters
  Size 4: 2776 clusters


In [11]:
# Row positions (into train_set / tfidf_mat) that make up the first cluster.
print(clusters[0])

[38596, 21353, 29682, 16046]


In [12]:
# Spot-check the reference summaries of a few rows by position.
# NOTE(review): these hard-coded positions are not the members of clusters[0]
# printed in the previous cell — presumably left over from an earlier run;
# confirm or derive them from `clusters`.
print(train_set['completion'].iloc[36376])
#print(train_set['completion'].iloc[34728])
print(train_set['completion'].iloc[18913])
print(train_set['completion'].iloc[6146])

hit a careless jaywalker at around 20mph going thru an intersection on my green. I drove abiding all traffic laws. What can be the outcome?
2 jobs, the first $65k train driver or $?? IT position with no formal job title yet but he did mention working under him as an admin assistant.
I need a new laptop and I might be fed up with Windows. Show me what's best all around without any specific criteria.


In [13]:
# Spot-check the full post text of two rows by position.
# NOTE(review): hard-coded positions; it is not visible here which cluster
# they were taken from — confirm.
print(train_set["prompt_post"].iloc[11304])
print(train_set["prompt_post"].iloc[7008])

Wow, there really is a subreddit for everything. Ok well if someone who knows about this could help me out i would appreciate it.

I few month ago I heard about how Ireland adopted A.C.T.A ( or was it sopa?) anyway just wile skimming and article I heard that this gave them the right to search mp3 players, ipods, and laptops for pirated music in the same way they can search your bag for drugs. now i didn't pay attention to the credibility of the source at the time because i hadn't seen going to Ireland in the foreseeable future but something has come up and I am headed there this December. If this is true it will mean the difference between be bringing my ipod and laptop or not.

I have been doing some research but it is the internet and its difficult to find a credible answer to any obscure question like this.

So this guy was my friend in college. I am really good friends with his girlfriend and to make a long story short we betrayed him and we had sex. So she feels really bad about t

## Create the new dataset based on the clusters and the prepends

In [27]:
import sys
from pathlib import Path

# Make the project root (one directory above the notebook's working
# directory) importable so the local `src` package can be found.
notebook_dir = Path().resolve()
project_root = notebook_dir.parent

# Only add the path once, so re-running this cell does not keep growing
# sys.path with duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.tldr_prepend import Prepends

# One entry per cluster: the concatenated source texts ("transscript") and
# the concatenated reference summaries ("TLDR"). The key spelling is kept
# as-is because later cells read these columns by name.
processed = {"transscript": [], "TLDR": []}

# Presumably returns an introductory phrase for the n-th video (see the
# sample output of the next cell) — defined in src/tldr_prepend.py.
prep = Prepends()

# Look the columns up once instead of on every loop iteration.
completions = train_set['completion']
titles = train_set['prompt_title']
posts = train_set['prompt_post']

for group in clusters:
    tldr_parts = []
    transcript_parts = []
    # Cluster members are row positions, hence .iloc; videos are numbered from 1.
    for i, text_idx in enumerate(group):
        video_no = i + 1
        tldr_parts.append(
            f"{prep.get_random_prepend(video_no)}\n{completions.iloc[text_idx]}\n\n"
        )
        # format transscripts
        # (single quotes are not nested inside the f-strings here: reusing the
        # enclosing quote character is a SyntaxError before Python 3.12)
        transcript_parts.append(f"TITLE_OF_VIDEO_{video_no}: {titles.iloc[text_idx]}\n")
        transcript_parts.append(
            f"TRANSCRIPT_OF_VIDEO_RESULT_{video_no}: {posts.iloc[text_idx]}\n"
        )
    # Join once per cluster instead of repeatedly extending a string in place.
    processed["TLDR"].append("".join(tldr_parts))
    processed["transscript"].append("".join(transcript_parts))

processed_df = pd.DataFrame(processed)


KeyError: 0

In [28]:
# Show the combined summary and the combined source text of the cluster with
# index label 1 in processed_df.
print(processed_df["TLDR"][1])
print(processed_df["transscript"][1])

Transcript 1 was about the following:
As a first timer I cant work up the courage to go out with an amazing girl because she has had some bad relationships and is moving 2 hours away in the fall.

Additionally, video 2 covered:
There's a girl who I think is perfect for me, and almost certain she'd say yes if I asked her out, but I leave for school in week and it might be too far to have a good relationship.

Video 3 delved into:
My girlfriend is threatening to break up with me over our college decisions, and I don't know if I want her to or not.

Transcript 4 discussed:
Girlfriends mother coerced girlfriend to go on a 3 month break. GF has agreed but hopes to resume our relationship after she comes back from the 3 months of working to do things for herself and not be so dependent.


TITLE_OF_VIDEO_1: I (M17) need some advice on asking a girl (F18) out as a first timer, but there are a few problems.
TRANSCRIPT_OF_VIDEO_RESULT_1: So I (a 17 yo. male) have never asked a girl out before ev

In [16]:
# Number of training examples (one row per cluster).
len(processed_df)

6944

In [20]:
# Print the combined source text of the cluster with index label 1.
# NOTE(review): the saved output below has no TITLE_OF_VIDEO_/TRANSCRIPT_
# prefixes and this cell's execution count is lower than the cells above, so
# it looks stale — re-run to confirm.
print(processed_df.transscript[1])

So I (a 17 yo. male) have never asked a girl out before ever. I have no problems talking with them, but getting any deeper is no good. So I have just been that single guy not asking anyone out or anything of that sort.
So on to the task before me: I'm pretty sure I found "the" one, or at least a one, that I am crazy over. She's 18 and we're good friends. We have a lot of similar interests with the biggest one being music.
The biggest issues I see getting in a relationship with her are that she's graduating this year and moving to college 2 hours away in the fall.

The other problem is that her previous relationships have been terrible with terrible guys. She swore off dating for the rest of high school and I don't want to come off as an ass for starting with essentially "Hey you're not in high school anymore, want to go out?" as that is what I feel like asking her out right now would be saying.
All that said, she is an amazing person and is really everything I'm looking for in a girl.


In [34]:
import json

SYSTEM_PROMPT = "You summarize multiple video transcripts into concise, factual summaries."


def _to_chat_example(transcript, summary):
    """Wrap one transcript/summary pair in a system/user/assistant `messages` record."""
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": transcript},
            {"role": "assistant", "content": summary},
        ]
    }


# Write one JSON object per line (JSONL); ensure_ascii=False keeps non-ASCII
# characters readable in the UTF-8 output file.
with open("../data/proc_tldr.jsonl", "w", encoding="utf-8") as f:
    for transcript, summary in zip(processed_df["transscript"], processed_df["TLDR"]):
        record = _to_chat_example(transcript, summary)
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
