In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from collections import Counter

# Load the cleaned transcript table and collect every non-missing entry of the
# "cleaned_text" column into a plain Python list.
# Rows with missing text are skipped, so positions in `documents` do not
# necessarily match row positions in `df`.
df = pd.read_csv("cleaned_transcripts.csv")
has_text = df["cleaned_text"].notna()
documents = df.loc[has_text, "cleaned_text"].tolist()


In [32]:
# Build the TF-IDF representation of the documents.
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.9,            # ignore terms found in more than 90% of documents
    max_features=5000,     # cap the vocabulary size
)
X = vectorizer.fit_transform(documents)

# Rescale every row with normalize()'s defaults (row-wise L2 norm).
# NOTE(review): TfidfVectorizer's default norm is already 'l2', so this step is
# probably redundant -- confirm before removing it.
X_norm = normalize(X)

In [34]:
# Pick the number of clusters: 10, or the corpus size when it is smaller.
# NOTE(review): with 10 or fewer documents this requests as many clusters as
# there are documents, so each "topic" may end up being a single transcript.
# Consider choosing a smaller k for small corpora.
num_docs = len(documents)
num_topics = num_docs if num_docs < 10 else 10

kmeans = KMeans(
    n_clusters=num_topics,
    n_init='auto',
    random_state=42,  # fixed seed so repeated runs are deterministic
)

# labels[k] is the cluster index assigned to row k of X_norm.
labels = kmeans.fit_predict(X_norm)


In [35]:
# Describe each cluster by the five vocabulary terms with the largest weight in
# its centroid; topic_keywords[i] holds the terms for cluster i.
feature_names = vectorizer.get_feature_names_out()
topic_keywords = []

print(f"\n🔹 Total topics found: {num_topics}\n")
for i, center in enumerate(kmeans.cluster_centers_[:num_topics]):
    # Term indices ordered by descending centroid weight, truncated to five.
    top_indices = center.argsort()[::-1][:5]
    top_words = list(feature_names[top_indices])
    topic_keywords.append(top_words)
    print(f"🔸 Topic {i}: {top_words}")


🔹 Total topics found: 5

🔸 Topic 0: ['said', 'therapy', 'therapist', 'men', 'doctor']
🔸 Topic 1: ['shit', 'beautiful', 'world', 'indian', 'india']
🔸 Topic 2: ['jokes', 'bank', 'government', 'scared', 'issues']
🔸 Topic 3: ['laughing', 'gonna', 'man', 'woman', 'shit']
🔸 Topic 4: ['fuck', 'pretty', 'guy', 'seriously', 'tell']


In [37]:
# Tally how many documents fell into each cluster and keep the five largest.
# Counter.most_common orders equal counts by first appearance in `labels`, so
# ties are not ranked by any meaningful criterion.
topic_counts = Counter(labels)
top_topics = topic_counts.most_common(5)

print("\n🔹 Top 5 most common topics across documents:\n")
for rank, entry in enumerate(top_topics, start=1):
    # entry is a (topic id, document count) pair; only the id is reported.
    topic_num = entry[0]
    print(f"🔸 Rank {rank} — Topic {topic_num} : {topic_keywords[topic_num]}")


🔹 Top 5 most common topics across documents:

🔸 Rank 1 — Topic 2 : ['jokes', 'bank', 'government', 'scared', 'issues']
🔸 Rank 2 — Topic 0 : ['said', 'therapy', 'therapist', 'men', 'doctor']
🔸 Rank 3 — Topic 4 : ['fuck', 'pretty', 'guy', 'seriously', 'tell']
🔸 Rank 4 — Topic 1 : ['shit', 'beautiful', 'world', 'indian', 'india']
🔸 Rank 5 — Topic 3 : ['laughing', 'gonna', 'man', 'woman', 'shit']


In [38]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to compare topic keywords with label names.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fixed pool of human-readable labels; every topic receives exactly one.
label_candidates = [
    "Politics", "Mental Health", "Relationships", "Comedy", "Culture", "Gender",
    "Religion", "Sexuality", "Education", "Work", "Finance", "Society",
    "Science", "Parenting", "Travel", "Technology"
]
label_embeddings = model.encode(label_candidates)

# One space-joined keyword phrase per topic, embedded with the same model.
topic_phrases = [' '.join(keywords) for keywords in topic_keywords]
topic_embeddings = model.encode(topic_phrases)

# Assign each topic the candidate whose embedding is most cosine-similar to
# the topic phrase. There is no similarity threshold, so a label is always
# chosen even when the best match is weak.
topic_labels = [
    label_candidates[cosine_similarity([emb], label_embeddings)[0].argmax()]
    for emb in topic_embeddings
]

# Report every topic together with its assigned label.
print("\n🏷️ Auto-labeled Topics:\n")
for i, label in enumerate(topic_labels):
    keywords = topic_keywords[i]
    print(f"🧩 Topic {i} — {label}: {keywords}")



🏷️ Auto-labeled Topics:

🧩 Topic 0 — Mental Health: ['said', 'therapy', 'therapist', 'men', 'doctor']
🧩 Topic 1 — Culture: ['shit', 'beautiful', 'world', 'indian', 'india']
🧩 Topic 2 — Comedy: ['jokes', 'bank', 'government', 'scared', 'issues']
🧩 Topic 3 — Gender: ['laughing', 'gonna', 'man', 'woman', 'shit']
🧩 Topic 4 — Sexuality: ['fuck', 'pretty', 'guy', 'seriously', 'tell']
