In [2]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "trl-lib/tldr" dataset. The returned object is indexed by split
# name: later cells iterate over `data_set.keys()` and read the "train" split.
data_set = load_dataset("trl-lib/tldr")

In [4]:
# Turn every split into a DataFrame and pull the labelled sections of each
# prompt into their own columns. Prompts are laid out as
#
#     SUBREDDIT: <name>\n\nTITLE: <title>\n\nPOST: <body>\nTL;DR:
#
# Pattern notes:
#   * `(?s)` makes `.` match newlines, replacing the slow `(.|\n)*` alternation.
#   * Each pattern has exactly ONE capture group, so `str.extract(...,
#     expand=False)` returns a Series directly (no `.iloc[:, 0]` needed).
#   * Subreddit and title use a lazy `.*?` so they stop at the FIRST following
#     section header; a greedy match would swallow part of the post whenever
#     the body itself contains "\n\nTITLE:" or "\n\nPOST:".
#   * The post stays greedy on purpose: it must run up to the LAST "\nTL;DR:",
#     because the body may mention "TL;DR:" itself.
PROMPT_FIELD_PATTERNS = {
    "prompt_post": r"(?s)POST: (.*)\nTL;DR:",
    "prompt_title": r"(?s)TITLE: (.*?)\n\nPOST:",
    "subreddit": r"(?s)SUBREDDIT: (.*?)\n\nTITLE:",
}

ds = {}
for set_type in data_set.keys():
    split_df = pd.DataFrame(data_set[set_type])
    for column_name, pattern in PROMPT_FIELD_PATTERNS.items():
        # Rows whose prompt does not match the pattern get NaN in this column.
        split_df[column_name] = split_df["prompt"].str.extract(pattern, expand=False)
    ds[set_type] = split_df

In [5]:
# Continue with the training split only.
train_set = ds["train"]

# Quick look at the available columns and at one raw prompt
# (the row at positional index 1235).
print(train_set.columns)
example_row = train_set.iloc[1235]
print(example_row["prompt"])

Index(['prompt', 'completion', 'prompt_post', 'prompt_title', 'subreddit'], dtype='object')
SUBREDDIT: r/AskReddit

TITLE: My gf's boss sexually harassed me and now im not sure what to do?

POST: My Gf and i were exchanging sexual texts as well as pictures and other deeply personal things. 

I received a very sexual message that seemed "off" and wrote "?"
this was the response i got:

"hey fobygrassman, this is {gf's coworker}, {gf's boss} got a hold of {gfs} phone and decided to fuck with you lol she knows nothing about it just so you know...he made it all up lol"

i wrote back: "np tell him i'll forward this to human resources and to expect a call from them explaining why he is going through his employee's personal belongings. Also i hope he has friends in the labor dpt. because i know i do"

TL;DR:


In [6]:
import pandas as pd

# Rebalance the training set: r/AskReddit is kept in full, every other
# subreddit is capped at MAX_ROWS_PER_SUBREDDIT randomly chosen rows.
MAX_ROWS_PER_SUBREDDIT = 2000
SAMPLING_SEED = 42  # fixed seed so the downsample and shuffle are reproducible

# Separate AskReddit from other subreddits
askreddit_df = train_set[train_set['subreddit'] == 'r/AskReddit']
other_df = train_set[train_set['subreddit'] != 'r/AskReddit']

# Downsample other subreddits to at most MAX_ROWS_PER_SUBREDDIT each.
# The columns are selected explicitly after `groupby` so that `apply` keeps
# receiving the 'subreddit' column without relying on the deprecated implicit
# "operate on the grouping columns" behaviour (which emitted a warning).
# NOTE: rows whose subreddit is missing (NaN) compare unequal to 'r/AskReddit',
# so they end up in `other_df`; `groupby` drops NaN keys by default, which
# means such rows are silently discarded here.
downsampled_other = (
    other_df
    .groupby('subreddit', group_keys=False)[list(other_df.columns)]
    .apply(
        lambda group: group.sample(
            n=min(len(group), MAX_ROWS_PER_SUBREDDIT),
            random_state=SAMPLING_SEED,
        )
    )
)

# Combine back together, then shuffle so subreddits are interleaved
train_set = pd.concat([askreddit_df, downsampled_other], ignore_index=True)
train_set = train_set.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)
# print(train_set['subreddit'].value_counts())

  downsampled_other = other_df.groupby('subreddit', group_keys=False).apply(


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
from sklearn.decomposition import TruncatedSVD  # not used in the cells shown here

# Fit a TF-IDF vectorizer (default settings) on the full prompt text and keep
# the resulting document-term matrix for the clustering step.
tfidf = TfidfVectorizer()
prompt_texts = train_set["prompt"]
tfidf_mat = tfidf.fit_transform(prompt_texts)

# (n_documents, vocabulary_size)
print(tfidf_mat.shape)

(41773, 63679)


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def knn_greedy_clustering(tfidf_matrix, min_size=2, max_size=4, 
                         threshold=0.7, n_neighbors=20):
    """
    Greedily group samples into small clusters using k-NN cosine similarity.

    A k-NN model is fitted once on ``tfidf_matrix`` and queried for every
    sample. Samples are then visited from the one with the strongest
    non-self match to the weakest; each unassigned sample seeds a cluster and
    pulls in its still-unassigned neighbours whose similarity is at least
    ``threshold``, until ``max_size`` members are reached. Clusters smaller
    than ``min_size`` are discarded and their seed stays unassigned.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix with a ``.shape`` attribute
        One row per sample; passed unchanged to ``NearestNeighbors``.
    min_size : int
        Minimum number of members a cluster needs to be kept.
    max_size : int
        Maximum number of members per cluster (seed included).
    threshold : float
        Minimum cosine similarity (``1 - cosine distance``) to the seed for a
        neighbour to join its cluster.
    n_neighbors : int
        Number of neighbours to consider per sample, not counting the sample
        itself.

    Returns
    -------
    clusters : list[list[int]]
        Row indices of each kept cluster, seed first.
    assigned : numpy.ndarray of bool, shape (n_samples,)
        ``True`` for rows that belong to one of the returned clusters.
    """
    n = tfidf_matrix.shape[0]
    assigned = np.zeros(n, dtype=bool)
    clusters = []

    # Nothing to fit or query on an empty input.
    if n == 0:
        return clusters, assigned

    # Fit k-NN model (cosine distance); +1 because each sample is normally
    # returned as its own nearest neighbour.
    nbrs = NearestNeighbors(n_neighbors=min(n_neighbors + 1, n),
                           metric='cosine',
                           algorithm='auto')
    nbrs.fit(tfidf_matrix)

    # Get neighbors for all samples at once
    distances, indices = nbrs.kneighbors(tfidf_matrix)
    similarities = 1 - distances  # Convert distance to similarity

    # Do NOT assume the sample itself sits in column 0: with duplicate rows
    # (distance ties at 0) the self entry can appear in any column, or be
    # missing altogether. Locate it explicitly instead.
    is_self = indices == np.arange(n)[:, None]

    # Process samples with the strongest non-self match first. Rows with no
    # non-self neighbour at all get -inf and are therefore visited last.
    best_sims = np.where(is_self, -np.inf, similarities).max(axis=1)
    order = np.argsort(best_sims)[::-1]

    for idx in order:
        if assigned[idx]:
            continue

        cluster = [int(idx)]  # plain Python ints in the returned clusters

        # Walk ALL returned neighbours (closest first) and skip the self entry
        # by index rather than by position.
        for neighbor_idx, sim in zip(indices[idx], similarities[idx]):
            # Check the cap before appending so max_size is never exceeded.
            if len(cluster) >= max_size:
                break

            neighbor_idx = int(neighbor_idx)
            if neighbor_idx != idx and not assigned[neighbor_idx] and sim >= threshold:
                cluster.append(neighbor_idx)

        # Only keep the cluster if it meets the minimum size; otherwise the
        # seed stays unassigned (it may still join a later seed's cluster).
        if len(cluster) >= min_size:
            clusters.append(cluster)
            assigned[cluster] = True

    return clusters, assigned

# Run the clustering on the TF-IDF matrix. The settings are gathered in one
# place so they are easy to tweak between runs.
clustering_params = dict(min_size=2, max_size=4, threshold=0.3, n_neighbors=50)
clusters, assigned = knn_greedy_clustering(tfidf_mat, **clustering_params)

# Report: confirm there are no singleton clusters, then show how many samples
# were placed in a cluster and how many were left out.
print(f"Any size-1 clusters? {any(len(c) == 1 for c in clusters)}")
print(f"Clustered: {assigned.sum()} / {len(assigned)}")
print(f"Unassigned: {(~assigned).sum()}")

In [None]:
import numpy as np

# Tally how many clusters exist for each distinct cluster size.
cluster_sizes = list(map(len, clusters))
unique, counts = np.unique(cluster_sizes, return_counts=True)

print("Cluster size distribution:")
for size, count in zip(unique, counts):
    print(f"  Size {size}: {count} clusters")

Cluster size distribution:
  Size 2: 3101 clusters
  Size 3: 1263 clusters
  Size 4: 3239 clusters
