<a href="https://colab.research.google.com/github/ShannonBonilla/COMM557_Project/blob/main/BER%2BTiktok_network_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================================
# TikTok Song Topic & Network Analysis (Final Fixed Version)
# ==========================================================

import pandas as pd
import re
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from networkx.algorithms import community
from matplotlib.patches import Patch
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os

# ----------------------------------------------------------
# Step 0. Config: input CSV locations (relative paths)
# ----------------------------------------------------------
# Preferred input: the labeled file with topic_number/topic_label columns.
LABELED_PATH: str = "dataset_with_topics_FINAL.csv"
# Fallback input, read only when the labeled file is not found.
FALLBACK_PATH: str = "dataset_with_topics.csv"

# ----------------------------------------------------------
# Step 1. Load data (prefer labeled CSV)
# ----------------------------------------------------------
# Probe the labeled file first; the fallback path is only checked when the
# labeled file is absent. With neither file present there is nothing to load.
use_labeled = os.path.exists(LABELED_PATH)
if not use_labeled and not os.path.exists(FALLBACK_PATH):
    raise FileNotFoundError("No dataset found. Provide dataset_with_topics_FINAL.csv")

df = pd.read_csv(LABELED_PATH if use_labeled else FALLBACK_PATH)
if use_labeled:
    print(f"Loaded labeled dataset: {LABELED_PATH} ({len(df)} rows)")
else:
    print(f"WARNING: Using fallback dataset without labels: {FALLBACK_PATH} ({len(df)} rows)")

# Basic sanity: derive the lyrics_missing flag when only the raw lyrics column exists
if "lyrics_missing" not in df.columns and "lyrics" in df.columns:
    df["lyrics_missing"] = df["lyrics"].isnull()

# Standardize topic column name
if "topic_number" in df.columns and "topic" not in df.columns:
    df.rename(columns={"topic_number": "topic"}, inplace=True)

# Every later step indexes df["topic"]; fail here with an explicit message
# instead of an opaque KeyError further down the script.
if "topic" not in df.columns:
    raise KeyError("No 'topic' or 'topic_number' column found in the dataset")

# Convert topic to integer (handles cases like 0.0).
# Unparseable entries become NaN via errors="coerce". Non-integral numbers
# (e.g. 1.5 or inf) are rejected as well: casting them to an integer dtype
# would raise rather than drop the row.
topic_numeric = pd.to_numeric(df["topic"], errors="coerce")
is_integral = topic_numeric.notna() & (topic_numeric % 1 == 0)

# Drop rows where topic could not be parsed as a whole number
bad_before = len(df)
df = df[is_integral].copy()
df["topic"] = topic_numeric[is_integral].astype(int)
if bad_before - len(df) > 0:
    print(f"Dropped {bad_before - len(df)} rows with invalid topic values")

# Standardize label column (auto-detect): the first column whose lowercased
# name is a known alias is renamed to "topic_name", unless a column with that
# exact name already exists.
label_col_candidates = [
    col
    for col in df.columns
    if col.lower() in ("topic_label", "topicname", "topic_name", "label")
]
if "topic_name" not in df.columns and label_col_candidates:
    df.rename(columns={label_col_candidates[0]: "topic_name"}, inplace=True)

# Drop BERTopic outliers if present (rows whose topic is -1)
if "topic" in df.columns:
    before = len(df)
    keep_rows = df["topic"].ne(-1)
    df = df.loc[keep_rows].copy()
    print(f"Dropped {before - len(df)} rows with topic = -1")

# If still no human-readable labels, create placeholders of the form "Topic <id>"
if "topic_name" not in df.columns:
    print("No topic label column detected — creating placeholder labels.")
    topic_as_text = df["topic"].astype(str)
    df["topic_name"] = "Topic " + topic_as_text


def _most_frequent(values):
    """Return the entry that occurs most often in *values*."""
    return values.value_counts().idxmax()


# Build topic id -> label mapping (most frequent label per topic), using only
# rows where both the topic and its label are present.
labeled_rows = df[["topic", "topic_name"]].dropna()
topic_label_map = labeled_rows.groupby("topic")["topic_name"].agg(_most_frequent).to_dict()
print(f"Topic label map created for {len(topic_label_map)} topics")

# ----------------------------------------------------------
# Step 2. Subset to TikTok
# ----------------------------------------------------------
# A row "has lyrics" when its lyrics_missing flag is exactly False. When the
# flag column does not exist, no row is known to be missing lyrics, so all
# rows qualify. Building an explicit boolean mask avoids indexing the frame
# with a bare scalar, which raises KeyError when the column is absent.
if "lyrics_missing" in df.columns:
    has_lyrics = df["lyrics_missing"].eq(False)
else:
    has_lyrics = pd.Series(True, index=df.index)

if "source" in df.columns:
    tiktok = df[df["source"].eq("tiktok") & has_lyrics].copy()
else:
    print("No 'source' column found. Using all rows with lyrics.")
    tiktok = df[has_lyrics].copy()

# Fall back to the full dataset rather than continuing with an empty subset
if tiktok.empty:
    print("No TikTok subset found. Using all data.")
    tiktok = df.copy()

print(f"TikTok subset size: {len(tiktok)}")

# Create ids: song_id is the row's position within the subset ("0", "1", ...),
# topic_id is the integer topic prefixed with "T" (e.g. "T3").
tiktok["song_id"] = [str(position) for position in range(len(tiktok))]
tiktok["topic_id"] = "T" + tiktok["topic"].astype(int).astype(str)

# ----------------------------------------------------------
# Step 3. Build song-song network by shared topic
# ----------------------------------------------------------
# Map every song id to the set of topic ids attached to it.
topics_by_song = tiktok.groupby("song_id")["topic_id"].apply(set)
song2topics = topics_by_song.to_dict()

# Lazily walk all unordered song pairs together with the topics they share.
pair_overlaps = (
    (left, right, song2topics[left] & song2topics[right])
    for left, right in combinations(song2topics, 2)
)
# Keep only pairs with at least one topic in common; the edge weight is the
# number of shared topics.
edges = [
    (left, right, {"weight": len(common)})
    for left, right, common in pair_overlaps
    if common
]

G = nx.Graph()
G.add_edges_from(edges)
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)
print(f"Weighted song network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# ----------------------------------------------------------
# Step 4. Community detection and naming
# ----------------------------------------------------------
def _label_for_topic_id(topic_id):
    """Translate an id like "T3" into its label, or "Topic 3" when unmapped."""
    topic_number = int(str(topic_id).replace("T", ""))
    return topic_label_map.get(topic_number, f"Topic {topic_number}")


if G.number_of_nodes() == 0:
    comms = []
    community_names = {}
    print("No communities detected; skipping naming.")
else:
    comms = community.greedy_modularity_communities(G, weight="weight")
    print(f"Detected {len(comms)} communities")

    # Map song -> community index (0-based internal); songs that are not in
    # the graph receive NaN in the community0 column.
    song_to_comm0 = {
        song: idx
        for idx, members in enumerate(comms)
        for song in members
    }
    tiktok["community0"] = tiktok["song_id"].map(song_to_comm0)

    # Build display names with 1-based ids and human-readable topic labels,
    # using the (up to) three most common topics among each community's songs.
    community_names = {}  # 1-based id -> display name
    for idx in range(len(comms)):
        display_id = idx + 1
        member_rows = tiktok.loc[tiktok["community0"] == idx]
        dominant_topics = member_rows["topic_id"].value_counts().head(3).index
        topic_labels = [_label_for_topic_id(tid) for tid in dominant_topics]
        summary = ", ".join(topic_labels) if topic_labels else "Mixed Themes"
        community_names[display_id] = f"Community {display_id}: {summary}"

    print("\nCommunity names:")
    for _, display_name in sorted(community_names.items()):
        print(f"- {display_name}")

# ----------------------------------------------------------
# Step 5. Visualization (legend shows top-8 largest communities)
# ----------------------------------------------------------
if G.number_of_nodes() > 0:
    plt.figure(figsize=(14, 12))
    # Fixed seed keeps the layout reproducible between runs
    pos = nx.spring_layout(G, seed=42, k=0.5, iterations=50)

    # Color by internal 0-based community id.
    # Set3 is a listed colormap with a fixed number of entries; integer indices
    # past its end are all clipped to the same color. When there are more
    # communities than Set3 entries, sample a continuous colormap instead so
    # that each community still gets its own color.
    n_comms = len(comms)
    if n_comms == 0:
        color_map = None
    elif n_comms <= plt.cm.Set3.N:
        color_map = plt.cm.Set3(range(n_comms))
    else:
        color_map = plt.cm.gist_rainbow(np.linspace(0.0, 1.0, n_comms, endpoint=False))

    if color_map is not None:
        node_colors = [color_map[song_to_comm0.get(n, 0)] for n in G.nodes()]
    else:
        node_colors = "lightgray"

    nx.draw_networkx_nodes(G, pos, node_size=50, node_color=node_colors, alpha=0.8)
    nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.5, edge_color="gray")

    plt.title("TikTok Song Network by Shared Lyrical Themes", fontsize=16)
    plt.axis("off")

    # Legend for top-8 largest communities (1-based display ids)
    if n_comms > 0:
        sizes = {idx + 1: len(members) for idx, members in enumerate(comms)}
        largest = sorted(sizes, key=sizes.get, reverse=True)[:8]
        legend_elements = [
            # color_map is indexed 0-based, display ids are 1-based
            Patch(facecolor=color_map[i - 1], label=f"{community_names[i]} (n={sizes[i]})", alpha=0.8)
            for i in largest
        ]
        plt.legend(handles=legend_elements, loc="upper left", fontsize=9, framealpha=0.9, title="Lyrical Communities")

    plt.tight_layout()
    plt.savefig("tiktok_song_network.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Export GEXF for Gephi
    nx.write_gexf(G, "tiktok_weighted_song_network.gexf")
    print("Exported: tiktok_weighted_song_network.gexf and tiktok_song_network.png")
else:
    print("Network is empty. Visualization skipped.")

# ----------------------------------------------------------
# Step 6. RQ1 regression placeholders (run once chart/streaming columns are present)
# ----------------------------------------------------------
print("\nRegression preparation...")

df_reg = df.copy()
# Dependent variables are discovered by column name: exact matches for chart
# metrics, substring matches for streaming-style metrics.
chart_vars = [c for c in df_reg.columns if c.lower() in ("peak_rank", "weeks_on_chart")]
stream_vars = [c for c in df_reg.columns if ("stream" in c.lower()) or ("play" in c.lower()) or ("view" in c.lower())]

if not chart_vars and not stream_vars:
    print("No chart or streaming variables found yet. Add them and re-run this section.")
else:
    print(f"Chart vars: {chart_vars}")
    print(f"Streaming vars: {stream_vars}")

    # Predictors: topic one-hot + basic audio features
    needed = ["topic", "danceability", "energy", "loudness", "tempo"]
    for col in needed:
        if col not in df_reg.columns:
            raise ValueError(f"Missing required column for regression: {col}")

    # Ensure topic is integer for encoding
    df_reg["topic"] = pd.to_numeric(df_reg["topic"], errors="coerce").astype("Int64")
    df_reg = df_reg[df_reg["topic"].notna()].copy()
    df_reg["topic"] = df_reg["topic"].astype(int)

    # Only rows with every predictor present enter the design matrix
    X = df_reg[needed].dropna()

    # Drop the first (lowest) topic level so it acts as the reference category.
    # A full set of topic dummies sums to 1 in every row, which is perfectly
    # collinear with the intercept added below and leaves the coefficients
    # unidentified; each remaining topic coefficient is now the difference
    # from the reference topic.
    enc = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    topic_encoded = enc.fit_transform(X[["topic"]])
    # categories_ lists every level in sorted order; the first one was dropped
    topic_cols = [f"topic_{t}" for t in enc.categories_[0][1:]]
    topic_df = pd.DataFrame(topic_encoded, columns=topic_cols, index=X.index)
    X = pd.concat([X.drop(columns=["topic"]), topic_df], axis=1)

    # Run OLS for each available dependent variable
    for target in chart_vars + stream_vars:
        if target not in df_reg.columns:
            continue
        # Align the target with the predictor rows and coerce non-numeric
        # entries to NaN so they can be masked out.
        y = df_reg.loc[X.index, target]
        y = pd.to_numeric(y, errors="coerce")
        mask = ~y.isna()
        if mask.sum() < 30:
            print(f"Not enough rows for {target} (n={mask.sum()}). Skipping.")
            continue
        X_ = sm.add_constant(X.loc[mask])
        y_ = y.loc[mask]
        model = sm.OLS(y_, X_).fit()
        print(f"\nRegression on {target}:")
        # tables[1] is the coefficient table of the summary
        print(model.summary().tables[1])

print("\nDone.")

FileNotFoundError: No dataset found. Provide dataset_with_topics_FINAL.csv