In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from collections import Counter

# Read the cleaned transcripts and collect the non-missing "cleaned_text"
# entries into a plain Python list (one entry per document).
df = pd.read_csv("cleaned_transcripts.csv")
documents = (
    df["cleaned_text"]
    .dropna()   # skip rows with no cleaned text
    .tolist()
)


In [32]:
# Turn the documents into TF-IDF vectors. English stop words are removed,
# terms found in more than 90% of the documents or in fewer than 2 documents
# are ignored, and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.9,
    max_features=5000,
)
X = vectorizer.fit_transform(documents)

# Rescale the rows of the TF-IDF matrix before clustering.
# NOTE(review): TfidfVectorizer already applies L2 normalization by default,
# so this call is presumably a no-op -- confirm before removing it.
X_norm = normalize(X)

In [34]:
# Cluster the document vectors with k-means. The cluster count is 10, or the
# number of documents when there are fewer than 10 of them.
# NOTE(review): for corpora of 10 or fewer documents this asks for one cluster
# per document, so every cluster can end up holding a single document --
# consider a smaller cap for small corpora.
num_docs = len(documents)
num_topics = num_docs if num_docs < 10 else 10
kmeans = KMeans(
    n_clusters=num_topics,
    n_init="auto",
    random_state=42,   # fixed seed so cluster assignments are reproducible
)
labels = kmeans.fit_predict(X_norm)


In [35]:
# Describe each cluster ("topic") by the five vocabulary terms that carry the
# largest weights in its k-means centroid. topic_keywords[i] holds the terms
# for cluster i, ordered from highest to lowest weight.
feature_names = vectorizer.get_feature_names_out()
topic_keywords = []

print(f"\n🔹 Total topics found: {num_topics}\n")
for i in range(num_topics):
    center = kmeans.cluster_centers_[i]
    # argsort is ascending: take the last five indices, then reverse them so
    # the highest-weighted term comes first.
    top_indices = center.argsort()[-5:][::-1]
    top_words = [feature_names[j] for j in top_indices]
    topic_keywords.append(top_words)
    print(f"🔸 Topic {i}: {top_words}")


🔹 Total topics found: 5

🔸 Topic 0: ['said', 'therapy', 'therapist', 'men', 'doctor']
🔸 Topic 1: ['shit', 'beautiful', 'world', 'indian', 'india']
🔸 Topic 2: ['jokes', 'bank', 'government', 'scared', 'issues']
🔸 Topic 3: ['laughing', 'gonna', 'man', 'woman', 'shit']
🔸 Topic 4: ['fuck', 'pretty', 'guy', 'seriously', 'tell']


In [36]:
# Count how many documents were assigned to each cluster and report the five
# clusters with the most documents, together with their keywords.
topic_counts = Counter(labels)
top_topics = topic_counts.most_common(5)

print("\n🔹 Top 5 most common topics across documents:\n")
for rank, (topic_num, count) in enumerate(top_topics, start=1):
    # topic_num is a cluster label, which doubles as the index into topic_keywords.
    print(f"🔸 Rank {rank} — Topic {topic_num} ({count} docs): {topic_keywords[topic_num]}")


🔹 Top 5 most common topics across documents:

🔸 Rank 1 — Topic 2 (1 docs): ['jokes', 'bank', 'government', 'scared', 'issues']
🔸 Rank 2 — Topic 0 (1 docs): ['said', 'therapy', 'therapist', 'men', 'doctor']
🔸 Rank 3 — Topic 4 (1 docs): ['fuck', 'pretty', 'guy', 'seriously', 'tell']
🔸 Rank 4 — Topic 1 (1 docs): ['shit', 'beautiful', 'world', 'indian', 'india']
🔸 Rank 5 — Topic 3 (1 docs): ['laughing', 'gonna', 'man', 'woman', 'shit']
