In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Length threshold: a post is kept only when it contains MORE than this many
# whitespace-separated tokens, i.e. posts with 5 or fewer words are removed.
MIN_WORDS_EXCLUSIVE = 5

# Load and clean the posts in one non-mutating chain:
#   1. drop rows whose text is NaN,
#   2. coerce the remaining text values to str,
#   3. keep only posts longer than the threshold (blank / whitespace-only
#      posts split into zero tokens, so this step removes them as well),
#   4. renumber rows 0..n-1 so positional and label indexing agree.
# Using .assign instead of `data["text"] = ...` avoids writing into an
# intermediate frame, which can raise pandas' SettingWithCopyWarning.
data = (
    pd.read_csv("reddit.csv")
      .dropna(subset=["text"])
      .assign(text=lambda frame: frame["text"].astype(str))
      .loc[lambda frame: frame["text"].str.split().str.len() > MIN_WORDS_EXCLUSIVE]
      .reset_index(drop=True)
)

# TF-IDF vectorizer: lowercase the text and drop English stop words
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english'
)

# Fit on the cleaned posts; row i of X corresponds to row i of `data`
X = vectorizer.fit_transform(data["text"])

# Vocabulary terms, aligned with the columns of X
terms = vectorizer.get_feature_names_out()
print("Number of features:", len(terms))
print("Remaining rows:", len(data))

  from pandas.core import (


Number of features: 35311
Remaining rows: 20322


In [2]:
# Show complete post text in the result table instead of truncating it
pd.set_option("display.max_colwidth", None)

# Number of highest-scoring terms to report
top = 5

# CSR layout exposes indptr / indices / data for the row bookkeeping below
X_csr = X.tocsr()

# Number of stored TF-IDF entries in each document (nnz = number of non-zero values)
nnz_per_doc = np.diff(X_csr.indptr)

# Document (row) index of every stored entry: row i is repeated nnz_per_doc[i] times
row_idx = np.repeat(np.arange(X_csr.shape[0]), nnz_per_doc)

# Long-format table with one row per stored entry: document index, term, score
df = pd.DataFrame({
    "doc_idx": row_idx,
    "term": terms[X_csr.indices],
    "tfidf": X_csr.data,
})

# Documents that contain more than one stored term
valid_docs = np.flatnonzero(nnz_per_doc > 1)

# Keep entries scoring below 0.990 that belong to one of those documents
df = df[(df["tfidf"] < 0.990) & df["doc_idx"].isin(valid_docs)]

# Rank all entries by score, keep the best-scoring occurrence of each term,
# then take the `top` terms overall
ranked = df.sort_values("tfidf", ascending=False)
best_per_term = ranked.drop_duplicates("term").head(top).copy()

# Attach the source post; doc_idx is used as a positional index into `data`
best_per_term["text"] = data["text"].iloc[best_per_term["doc_idx"]].values

best_per_term

Unnamed: 0,doc_idx,term,tfidf,text
314062,6247,fuck,0.988987,Fuck me fuck me fuck me fuck me fuck me fuck me fuck me fuck me fuck me i cannot breathe fuck me fuck me fuck me I am useless fuck me fuck me fuck me everyone would he better off with me dead fuck me fuck me fuck me. I want to kill myself
405725,8107,closer,0.988723,the day is closer and closer. closer
363767,7240,pervert,0.98653,I am a pervert Should you kill yourself if you are a pervert?
747098,15076,clap,0.981106,If you feel you have no friends clap your hands*clap clap*If you want to disappear clap your hands*clap clap*If your life has gone to shit and you really cannot handle itIf you want to kill yourself clap your hands*clap clap* *clap clap*
272811,5422,paranoid,0.980542,I am paranoid can someone help Paranoid
