In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Tunable inputs for the cleaning step, named so they are easy to find.
DATA_PATH = "twitter_data_full.csv"
MIN_WORDS = 5  # a post is kept only if it has MORE than this many words

# Keep the raw frame untouched; everything below derives from it.
raw_data = pd.read_csv(DATA_PATH)

# Build the cleaned frame in one chain instead of repeatedly overwriting
# `data`. Using .assign() avoids setting a column on a filtered frame,
# which is the pattern pandas flags with SettingWithCopyWarning.
data = (
    raw_data
    # Remove all posts where value = NaN
    .dropna(subset=["clean_post"])
    # Make sure all posts are strings
    .assign(clean_post=lambda frame: frame["clean_post"].astype(str))
    # Remove posts that are empty or whitespace-only
    .loc[lambda frame: frame["clean_post"].str.strip() != ""]
    # Remove posts with MIN_WORDS or fewer whitespace-separated words
    # (i.e. a post needs at least MIN_WORDS + 1 words to be kept)
    .loc[lambda frame: frame["clean_post"].str.split().str.len() > MIN_WORDS]
    # Re-number rows 0..n-1 so positional row i of `data` lines up with
    # row i of the TF-IDF matrix built below
    .reset_index(drop=True)
)

# Create TF-IDF vectorizer that lowercases text and drops English stop words
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english'
)

# X: document-term TF-IDF matrix (one row per post in `data`)
# terms: vocabulary entry corresponding to each column of X
X = vectorizer.fit_transform(data["clean_post"])
terms = vectorizer.get_feature_names_out()
print("Number of features:", len(terms))
print("Remaining rows:", len(data))

  from pandas.core import (


Number of features: 22582
Remaining rows: 11986


In [2]:
# Show the highest-scoring distinct terms together with the post each
# score came from.

# How many distinct terms to display
top = 5

# Scores at or above this ceiling are discarded -- presumably to skip
# posts that consist almost entirely of a single term (TODO confirm intent).
max_tfidf = 0.990

# Work on the Compressed Sparse Row (CSR) view of the TF-IDF matrix so that
# .indptr / .indices / .data describe the stored entries row by row
X_csr = X.tocsr()

# indptr[i + 1] - indptr[i] is the number of stored (non-zero) entries in
# row i, i.e. how many distinct vocabulary terms document i contains
nnz_per_doc = np.diff(X_csr.indptr)

# Expand to one document (row) index per stored entry, aligned with
# X_csr.indices and X_csr.data
row_idx = np.repeat(np.arange(X_csr.shape[0]), nnz_per_doc)

# Documents that contain more than one term
valid_docs = np.flatnonzero(nnz_per_doc > 1)

# Long-format table: one row per (document, term) pair with its score
all_scores = pd.DataFrame({
    "doc_idx": row_idx,
    "term": terms[X_csr.indices],
    "tfidf": X_csr.data
})

# Keep an entry only if its score is under the ceiling AND its document
# has more than one term
under_ceiling = all_scores["tfidf"] < max_tfidf
from_valid_doc = all_scores["doc_idx"].isin(valid_docs)
df = all_scores[under_ceiling & from_valid_doc]

# Rank entries by score, keep each term's single best entry, then take
# the first `top` of those
ranked = df.sort_values("tfidf", ascending=False)
one_row_per_term = ranked.drop_duplicates("term")
best_per_term = one_row_per_term.head(top).copy()

# Do not truncate long posts when the table is rendered
pd.set_option("display.max_colwidth", None)

# doc_idx is a positional row number into `data`, so look the text up by
# position; .values drops the index so the assignment is purely positional
best_docs = best_per_term["doc_idx"].to_numpy()
best_per_term["clean_post"] = data["clean_post"].iloc[best_docs].values

best_per_term

Unnamed: 0,doc_idx,term,tfidf,clean_post
22450,2646,glad,0.989051,im glad youre glad im glad glad glad glad glad glad
78049,9224,hardly,0.968832,rt hardly hardly hardly forget anything
38169,4560,wop,0.960664,play dead like wop wop wop strike pose like wop wop wop
60958,7281,taco,0.933337,real taco much better taco bell taco
44808,5352,bed,0.925244,sad thing bed bed side bed everything
