<a href="https://colab.research.google.com/github/ShannonBonilla/COMM557_Project/blob/main/BER%2BTiktok_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# TikTok Song Topic & Network Analysis (FIXED VERSION)
# ==========================================================

# --- Install dependencies (if running in Colab) ---
# !pip install pandas bertopic sentence-transformers hdbscan langdetect networkx matplotlib
!pip install langdetect --quiet
!pip install bertopic

# --- Imports ---
import pandas as pd
import re
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from itertools import combinations
from langdetect import detect, DetectorFactory
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import hdbscan

# Pin langdetect's seed before any detect() call so language labels do not
# change between runs of this notebook.
DetectorFactory.seed = 0  # make results reproducible

# ==========================================================
# STEP 1. Load data
# ==========================================================
# Read the merged song table (one row per song, lyrics included when available).
df = pd.read_csv("final_songs_with_lyrics.csv")

# Quick size check of the catalogue.
print(f"Number of unique songs: {df['track_name'].nunique()}")
print(f"Number of unique artists: {df['artist_name'].nunique()}")

# Summary statistics for the numeric audio descriptors.
audio_features = ['danceability', 'energy', 'loudness', 'tempo', 'duration_ms']
print(df.loc[:, audio_features].describe())

# Flag rows without lyrics, then keep an independent copy of the rows that have them.
df['lyrics_missing'] = df['lyrics'].isna()
df_with_lyrics = df[~df['lyrics_missing']].copy()
print(f"Original df size: {len(df)}")
print(f"df_with_lyrics size: {len(df_with_lyrics)}")

# ==========================================================
# STEP 2. Clean and filter lyrics
# ==========================================================
def detect_language_safe(text):
    """Return the language code detected for *text*, or "unknown" on failure.

    Detection is best-effort: any ordinary error raised by ``detect`` (for
    example on text with no usable characters, or on a non-string value)
    yields the sentinel ``"unknown"`` instead of aborting the whole
    ``Series.apply`` pass.

    Args:
        text: The lyric text to classify.

    Returns:
        The code returned by ``detect(text)``, or ``"unknown"`` if it raised.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running apply() can be interrupted.
        return "unknown"

# Label every lyric with its detected language and show the most frequent labels.
df_with_lyrics['language'] = df_with_lyrics['lyrics'].map(detect_language_safe)
print(df_with_lyrics['language'].value_counts().head())

# Remove filler words and noise
# FILLERS is a single capturing group so FILLER_SEQ can back-reference it (\1)
# and match a filler word repeated two or more times in a row ("la la la").
FILLERS = r"(oh|yeah|ya|yea|na|la|uh|woo|ooh|ah|ha|hey|baby|girl|boy)"
FILLER_SEQ = re.compile(rf"\b(?:{FILLERS})(?:\s+\1){{1,}}\b", re.IGNORECASE)

# Typographic apostrophes are folded to the ASCII one so contractions are
# handled the same way regardless of how the lyric source encoded them.
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'"})


def clean_lyric(s: str) -> str:
    """Normalize one lyric string for topic modeling.

    Steps, in order:
      1. Drop bracketed annotations such as ``[Chorus]`` or ``(x2)``.
      2. Lower-case and fold curly apostrophes to ``'``.
      3. Replace every remaining punctuation character with a space.
      4. Delete apostrophes so contractions stay one token (``don't`` -> ``dont``).
      5. Collapse runs of a repeated filler word to a single occurrence.
      6. Collapse whitespace and strip.

    Args:
        s: Raw lyric text. Any non-string value is treated as missing.

    Returns:
        The cleaned, lower-cased text; ``""`` when *s* is not a string.
    """
    if not isinstance(s, str):
        return ""
    s = re.sub(r"[\[\(\{].*?[\]\)\}]", " ", s)
    # Without this fold, a curly apostrophe would be turned into a space by the
    # punctuation pass below and split the contraction into two tokens.
    s = s.lower().translate(_APOSTROPHE_TABLE)
    s = re.sub(r"[^\w\s']", " ", s)
    s = s.replace("'", "")
    s = FILLER_SEQ.sub(lambda m: " " + " ".join(sorted(set(m.group(0).split()))) + " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

docs = df_with_lyrics['lyrics'].tolist()

# Clean every lyric exactly once and store the result next to its source row.
# Deriving clean_docs from the filtered frame (instead of cleaning the raw list
# a second time) halves the cleaning work and keeps documents and rows aligned.
df_with_lyrics['cleaned_text'] = df_with_lyrics['lyrics'].apply(clean_lyric)

# Keep only songs whose cleaned lyrics have at least 5 whitespace-separated tokens.
mask = df_with_lyrics['cleaned_text'].apply(lambda x: len(x.split()) >= 5)
df_cleaned = df_with_lyrics[mask].copy()

clean_docs = df_cleaned['cleaned_text'].tolist()
print(f"Kept {len(clean_docs)} cleaned lyrics")
print(f"df_cleaned size: {len(df_cleaned)} (matches {len(clean_docs)} docs)")

# Explicit raise rather than `assert`, so the check is not stripped under -O.
if len(df_cleaned) != len(clean_docs):
    raise AssertionError("Mismatch between df_cleaned and clean_docs!")

# ==========================================================
# STEP 3. Topic modeling with BERTopic
# ==========================================================
print("\n🔄 Starting topic modeling...")

# Sentence encoder handed to BERTopic for embedding the cleaned lyrics.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# ✅ Comprehensive multilingual stopwords
# One space-separated string per language group, joined and split back into a
# flat list in the original order (repeated words are kept as listed).
comprehensive_stopwords = " ".join((
    # English
    "you your the and to me it my im on in of for we i a be is are was were",
    "oh yeah na la uh mm mmm ooh woah hey just like got get let ill youre ive dont cant aint",
    # Spanish
    "que de no me te en el lo yo tu mi y un una con por si se las los del al es son sta estoy",
    # Portuguese
    "eu um uma não do da em no na com pra tá tô pra por para",
    # French
    "je tu il elle nous vous ils elles un une des le la les et ou mais que qui",
    # German
    "ich du er sie wir ihr sie es der die das den dem ein eine",
    # Common short words to skip
    "way get got make see know",
)).split()

# Bag-of-words settings: unigrams and bigrams seen in at least 5 documents;
# the token pattern skips tokens shorter than 4 characters.
vectorizer_model = CountVectorizer(
    token_pattern=r"(?u)\b[a-zA-ZáéíóúñüçğşıöÆØÅæøåÄÖÜß]{4,}\b",
    stop_words=comprehensive_stopwords,
    ngram_range=(1, 2),
    min_df=5,
)

# Clustering settings passed through to BERTopic.
hdbscan_model = hdbscan.HDBSCAN(
    metric='euclidean',
    min_cluster_size=4,
    min_samples=1,
    cluster_selection_method='eom',
    prediction_data=True,
)

topic_model = BERTopic(
    nr_topics=20,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=False,
    verbose=True,
)

# Fit on the cleaned lyrics; `topics` holds one topic id per document.
topics, probs = topic_model.fit_transform(clean_docs)
topic_info = topic_model.get_topic_info()
print(topic_info.head(15))

# Attach one topic id per row of df_cleaned (the frame the documents came from,
# not df_with_lyrics); the lengths must agree for the assignment to be valid.
assert len(df_cleaned) == len(topics), f"Mismatch: {len(topics)} topics vs {len(df_cleaned)} rows"
df_cleaned['topic'] = list(topics)

# Persist the topic-annotated table once; the network step reloads this file.
df_cleaned.to_csv("dataset_with_topics.csv", index=False)
print(f"✅ Saved dataset_with_topics.csv with {len(df_cleaned)} rows\n")

# ==========================================================
# STEP 4. Build TikTok song network
# ==========================================================
print("🔄 Building network...")

# Reload the topic-annotated table written in the previous step.
df = pd.read_csv("dataset_with_topics.csv")
print(f"Columns in dataset: {list(df.columns)}")

# Prefer rows tagged as TikTok when a 'source' column is present; otherwise
# fall back to every row that has lyrics.
if 'source' not in df.columns:
    print("⚠️  'source' column not found. Using all data instead.")
    tiktok = df[df['lyrics_missing'].eq(False)].copy()
else:
    tiktok = df[df['source'].eq('tiktok') & df['lyrics_missing'].eq(False)].copy()

# Last resort: an empty selection falls back to the full table.
if tiktok.empty:
    print("⚠️  No TikTok songs found. Using all available data.")
    tiktok = df.copy()

print(f"TikTok songs: {len(tiktok)}")

# Positional string ids for songs, and 'T'-prefixed string ids for topics.
tiktok['song_id'] = tiktok.reset_index().index.astype(str)
tiktok['topic_id'] = tiktok['topic'].astype(str).radd('T')

# --- Bipartite Network ---
# Song nodes on one side, topic nodes on the other, one edge per table row.
songs = tiktok['song_id'].unique()
topics_ = tiktok['topic_id'].unique()

B = nx.Graph()
B.add_nodes_from(songs, bipartite=0, node_type='song')
B.add_nodes_from(topics_, bipartite=1, node_type='topic')
B.add_edges_from(zip(tiktok['song_id'], tiktok['topic_id']))

print(f"Bipartite network: {B.number_of_nodes()} nodes, {B.number_of_edges()} edges")

# --- Song-Song Weighted Network ---
# Map each song id to the set of topic ids found on its rows.
song2topics = {
    song: set(topic_ids)
    for song, topic_ids in tiktok.groupby('song_id')['topic_id']
}

# Link every pair of songs that share a topic; weight = number of shared topics.
edges = []
for song_a, song_b in combinations(song2topics, 2):
    common = song2topics[song_a] & song2topics[song_b]
    if not common:
        continue
    edges.append((song_a, song_b, {'weight': len(common)}))

G = nx.Graph()
G.add_edges_from(edges)
G.remove_nodes_from(list(nx.isolates(G)))
print(f"Weighted song network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# --- Community Detection ---
# Partition the song-song graph into communities, name each community after the
# top word of its most frequent topics, export a per-community summary, and
# (when numeric streaming columns exist) report average success per community.
from networkx.algorithms import community
song_to_community = {}  # Initialize early for visualization

if G.number_of_nodes() > 0:
    communities = community.greedy_modularity_communities(G, weight='weight')
    print(f"Detected {len(communities)} communities")

    # Print community sizes and record each song's community index
    # (needed BEFORE visualization).
    for i, comm in enumerate(communities):
        print(f"  Community {i}: {len(comm)} songs")
        for song_id in comm:
            song_to_community[song_id] = i

    tiktok['community'] = tiktok['song_id'].map(song_to_community)
    has_popularity = 'popularity' in tiktok.columns

    # ✅ Generate community names based on top topics
    community_names = {}
    for i in range(len(communities)):
        comm_songs = tiktok[tiktok['community'] == i]
        top_topics = comm_songs['topic_id'].value_counts().head(3)

        topic_labels = []
        for topic_id in top_topics.index:
            # Extract number from 'T0', 'T1', etc.
            topic_num = int(topic_id.replace('T', ''))
            # Use a dedicated name here: rebinding `topic_info` would overwrite
            # the topic-info DataFrame created in STEP 3.
            topic_words = topic_model.get_topic(topic_num)
            if isinstance(topic_words, list) and topic_words:
                # Entries are (word, score) tuples; keep the first word.
                topic_labels.append(f"{topic_words[0][0]}")

        if topic_labels:
            community_names[i] = f"Community {i}: {', '.join(topic_labels[:3])}"
        else:
            community_names[i] = f"Community {i}"

    # Save community analysis to CSV for reference
    community_summary = []
    for i in range(len(communities)):
        comm_songs = tiktok[tiktok['community'] == i]
        top_topics = comm_songs['topic_id'].value_counts().head(5)
        community_summary.append({
            'community_id': i,
            'community_name': community_names[i],
            'num_songs': len(comm_songs),
            'avg_popularity': comm_songs['popularity'].mean() if has_popularity else None,
            'top_topics': ', '.join(top_topics.index.tolist()),
            'topic_distribution': ', '.join([f"{t}({c})" for t, c in top_topics.items()]),
        })

    community_df = pd.DataFrame(community_summary)
    community_df.to_csv("community_analysis.csv", index=False)
    print("\n✅ Community analysis saved to community_analysis.csv")
    print(community_df.to_string())

    # ✅ RQ1 ANALYSIS: Link communities to streaming success
    print("\n" + "="*70)
    print("RQ1 ANALYSIS: Community Characteristics & Streaming Success")
    print("="*70)

    # Streaming metrics are found by name, but only numeric columns qualify:
    # a text column whose name happens to contain e.g. "play" cannot be averaged.
    streaming_cols = [
        col for col in tiktok.columns
        if any(key in col.lower() for key in ('stream', 'play', 'view'))
        and pd.api.types.is_numeric_dtype(tiktok[col])
    ]
    print(f"\n📊 Available streaming metrics: {streaming_cols}")

    if len(streaming_cols) > 0:
        # Analyze streaming success by community
        print(f"\n📈 Streaming Success by Community:")
        community_stats = []

        for i, comm in enumerate(communities):
            comm_songs = tiktok[tiktok['community'] == i]

            # Collect stats
            stats = {
                'community': i,
                'num_songs': len(comm),
                'avg_popularity': comm_songs['popularity'].mean() if has_popularity else None,
            }
            for col in streaming_cols:
                stats[f'avg_{col}'] = comm_songs[col].mean()
            community_stats.append(stats)

            print(f"\n  Community {i}:")
            print(f"    - Songs: {len(comm)}")
            if has_popularity:
                print(f"    - Avg Popularity: {stats['avg_popularity']:.2f}")
            for col in streaming_cols:
                print(f"    - Avg {col}: {stats[f'avg_{col}']:.2e}")

            # Show top topics in this community
            top_topics = comm_songs['topic_id'].value_counts().head(3)
            print(f"    - Top Topics: {', '.join([f'{t}({c})' for t,c in top_topics.items()])}")

        # Rank communities by success
        if has_popularity:
            # NaN averages compare as neither smaller nor larger, which leaves
            # the sort order undefined; the leading flag pushes them last.
            ranked = sorted(
                community_stats,
                key=lambda x: (pd.notna(x['avg_popularity']), x['avg_popularity']),
                reverse=True,
            )
            print(f"\n🏆 Communities ranked by average popularity:")
            for rank, comm_stat in enumerate(ranked, 1):
                print(f"  {rank}. Community {comm_stat['community']}: {comm_stat['avg_popularity']:.2f}")

    else:
        print("\n⚠️  No streaming success metrics found in dataset")
        print("   Available columns:", list(tiktok.columns)[:10])
else:
    print("⚠️  Network has no nodes. Skipping community detection.")

# --- Visualization ---
# Draw the song-song graph with a spring layout, colour nodes by community, and
# save the figure to tiktok_song_network.png.
if G.number_of_nodes() > 0:
    plt.figure(figsize=(14, 12))
    pos = nx.spring_layout(G, seed=42, k=0.5, iterations=50)

    # ✅ Color nodes by community
    # Set3 only has 12 entries, and indexing a colormap with an out-of-range
    # integer clips to its last colour, so every community from the 13th on
    # would look identical. Switch to the 20-colour palette when needed and
    # cycle explicitly if there are more communities than colours.
    n_communities = len(communities)
    palette = plt.cm.Set3 if n_communities <= plt.cm.Set3.N else plt.cm.tab20
    color_map = [palette(i % palette.N) for i in range(n_communities)]

    # Nodes without a recorded community fall back to light gray.
    node_colors = [
        color_map[song_to_community[node]] if node in song_to_community else 'lightgray'
        for node in G.nodes()
    ]

    # Draw with community colors
    nx.draw_networkx_nodes(G, pos, node_size=50, alpha=0.8, node_color=node_colors)
    nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.5, edge_color='gray')

    plt.title("TikTok Song Network by Shared Topics\n(Colored by Community)",
              fontsize=16, fontweight='bold', pad=20)

    # Add legend with community names
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color_map[i], alpha=0.8,
                            label=f'{community_names[i]} (n={len(communities[i])})')
                       for i in range(n_communities)]
    plt.legend(handles=legend_elements, loc='upper left', fontsize=9, framealpha=0.9)

    plt.axis("off")
    plt.tight_layout()
    plt.savefig("tiktok_song_network.png", dpi=300, bbox_inches='tight')
    plt.show()
    print("✅ Network visualization saved as tiktok_song_network.png")
else:
    print("⚠️  Cannot visualize: network is empty")

# --- Save Network Outputs ---
# Export the weighted song graph as GEXF, but only when it has at least one node.
if G.number_of_nodes():
    nx.write_gexf(G, "tiktok_weighted_song_network.gexf")
    print("✅ Network GEXF file saved as tiktok_weighted_song_network.gexf")

print("\n✅ All steps completed successfully! 🎉")

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.0/153.0 kB 4.0 MB/s eta 0:00:00
Installing collected packages: bertopic
Successfully installed bertopic-0.17.3


  $\max \{ \mathrm{core}_k(a),\ \mathrm{core}_k(b),\ \tfrac{1}{\alpha}\, d(a,b) \}$.


FileNotFoundError: [Errno 2] No such file or directory: 'final_songs_with_lyrics.csv'