In [1]:
import os
import subprocess
import sys
from pathlib import Path

In [2]:
# ------------------ CONFIG ------------------
# Locate the repo root as the parent of the folder this code runs from.
# NOTE(review): this assumes the notebook sits exactly one level below the
# repository root (e.g. <repo>/notebooks/) — confirm for your layout.
try:
    # Script export: resolve *before* walking up. A relative __file__ such as
    # "script.py" would otherwise collapse to "." and yield the wrong folder.
    REPO_ROOT = Path(__file__).resolve().parent.parent
except NameError:
    # Interactive kernel: __file__ is undefined, so use the working directory.
    REPO_ROOT = Path.cwd().resolve().parent

# Folder holding the topic-model CSV exports; created if it does not exist yet.
OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Input files consumed by the generated Streamlit app (BERTopic run).
COMMENTS_CSV = OUTPUTS_DIR / "comments_with_bertopic.csv"
TOPIC_SUMMARY_CSV = OUTPUTS_DIR / "bertopic_topic_summary.csv"

# Alternative inputs (Top2Vec run) — uncomment to explore those instead.
# TOPIC_SUMMARY_CSV = OUTPUTS_DIR / "top2vec_topic_summary.csv"
# COMMENTS_CSV = OUTPUTS_DIR / "comments_with_top2vec.csv"

In [3]:
# ------------------ CREATE TEMP STREAMLIT SCRIPT ------------------
# Destination of the generated Streamlit app, inside the repo's "app" folder.
APP_SCRIPT = REPO_ROOT.joinpath("app", "test_app.py")
# Make sure the "app" folder is there before anything is written into it.
APP_SCRIPT.parent.mkdir(exist_ok=True)

# app_code = f"""#-*- coding: utf-8 -*-
# from pathlib import Path
# import streamlit as st
# import pandas as pd

# DEFAULT_TOPIC_SUMMARY = Path(r"{TOPIC_SUMMARY_CSV}")
# DEFAULT_COMMENTS_DF = Path(r"{COMMENTS_CSV}")

# st.set_page_config(page_title="Top2Vec Explorer", layout="wide")
# st.title("Top2Vec Explorer — Notebook Test")

# @st.cache_data
# def load_csv(path: Path):
#     if not path.exists():
#         return None
#     # try a few common encodings if utf-8 fails
#     for enc in ("utf-8", "cp1252", "latin-1"):
#         try:
#             return pd.read_csv(path, encoding=enc)
#         except Exception:
#             pass
#     return None

# topic_summary = load_csv(DEFAULT_TOPIC_SUMMARY)
# comments_df = load_csv(DEFAULT_COMMENTS_DF)

# if topic_summary is None or comments_df is None:
#     st.warning("Topic summary or comments CSV not loaded.")
# else:
#     st.subheader("Topic summary")
#     st.dataframe(topic_summary.head(10))
    
#     st.subheader("Comments sample")
#     st.dataframe(comments_df.head(10))
# """

# APP_SCRIPT.write_text(app_code, encoding="utf-8")

In [4]:
# Source of the generated Streamlit app. It is a raw string so the app code is
# written out verbatim; only the two __PLACEHOLDER__ tokens are substituted
# (with fully escaped Python string literals) further below.
app_template = r'''
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Optional

import pandas as pd
import plotly.express as px
import streamlit as st

# --- CONFIG: CSV paths injected by the notebook ---
DEFAULT_TOPIC_SUMMARY = Path(__TOPIC_SUMMARY__)
DEFAULT_COMMENTS_DF = Path(__COMMENTS_CSV__)

st.set_page_config(page_title="Topic Explorer", layout="wide", initial_sidebar_state="expanded")
st.title("Topic Explorer — Interactive Dashboard")


@st.cache_data(show_spinner=False)
def _read_csv_cached(path_str: str, mtime: float) -> Optional[pd.DataFrame]:
    """Read a CSV trying several encodings; return None if none of them works.

    ``mtime`` is not used in the body: it is part of the cache key so that a
    regenerated file is re-read instead of served from the cache.
    """
    # latin-1 can decode any byte sequence, so it acts as the last resort.
    for enc in ("utf-8", "cp1252", "latin-1"):
        try:
            return pd.read_csv(path_str, encoding=enc)
        except Exception:
            continue
    return None


def load_csv_fallback(path: Path) -> Optional[pd.DataFrame]:
    """Load ``path`` as a DataFrame, or return None if it is missing/unreadable.

    The existence check is done outside the cached reader so that a "missing"
    result is never cached: dropping the file in place and reloading works.
    """
    if not path.exists():
        return None
    return _read_csv_cached(str(path), path.stat().st_mtime)


def _as_text(value) -> str:
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    return "" if pd.isna(value) else str(value)


topic_summary = load_csv_fallback(DEFAULT_TOPIC_SUMMARY)
comments_df = load_csv_fallback(DEFAULT_COMMENTS_DF)

if topic_summary is None or comments_df is None:
    st.warning("Topic summary or comments CSV not found. Put the files in the expected paths and reload.")
    st.info(f"Expected: topic_summary={DEFAULT_TOPIC_SUMMARY}, comments={DEFAULT_COMMENTS_DF}")
    st.stop()

# --- basic normalization and convenience columns ---
TOP_WORDS_COL = "top_words_display" if "top_words_display" in topic_summary.columns else "top_words"
SAMPLE_COMMENTS_COL = "sample_comments_display" if "sample_comments_display" in topic_summary.columns else "sample_comments"
TOPIC_NUM_COL = next((c for c in ("topic_num", "topic") if c in topic_summary.columns), None)
HAS_SIZE = "size" in topic_summary.columns

if TOPIC_NUM_COL is not None:
    # Rows without a usable topic id cannot be matched to comments: drop them
    # instead of crashing on astype(int).
    topic_summary[TOPIC_NUM_COL] = pd.to_numeric(topic_summary[TOPIC_NUM_COL], errors="coerce")
    topic_summary = topic_summary.dropna(subset=[TOPIC_NUM_COL]).copy()
    topic_summary[TOPIC_NUM_COL] = topic_summary[TOPIC_NUM_COL].astype(int)

if HAS_SIZE:
    # Normalise once so every later comparison works on plain ints.
    topic_summary["size"] = pd.to_numeric(topic_summary["size"], errors="coerce").fillna(0).astype(int)

DOM_COL_CANDIDATES = ["bertopic_dominant_topic", "top2vec_dominant_topic", "dominant_topic", "topic", "topic_num"]
dom_col = next((c for c in DOM_COL_CANDIDATES if c in comments_df.columns), None)
if dom_col is None:
    comments_df["bertopic_dominant_topic"] = -1
    dom_col = "bertopic_dominant_topic"
# Missing or non-numeric assignments are treated as noise (-1).
comments_df[dom_col] = pd.to_numeric(comments_df[dom_col], errors="coerce").fillna(-1).astype(int)

EMOTION_COL = "top_emotion" if "top_emotion" in comments_df.columns else None

if TOPIC_NUM_COL is not None and TOP_WORDS_COL in topic_summary.columns:
    # Topics without top words fall back to the generic "Topic N" label later.
    labelled = topic_summary.dropna(subset=[TOP_WORDS_COL])
    topic_label_map = dict(zip(labelled[TOPIC_NUM_COL], labelled[TOP_WORDS_COL].astype(str)))
else:
    topic_label_map = {}

# --- Sidebar controls ---
st.sidebar.header("Filters & display")
all_topics = sorted(comments_df[dom_col].unique().tolist())
topic_choices = ["All"] + [str(t) for t in all_topics if t != -1]
selected_topics = st.sidebar.multiselect("Topics (multi-select)", options=topic_choices, default=["All"])
max_topic_size = int(topic_summary["size"].max()) if HAS_SIZE and not topic_summary.empty else 100
min_topic_size = st.sidebar.slider("Min topic size (filter topic list)",
                                   min_value=0,
                                   max_value=max(max_topic_size, 1),  # keep a non-degenerate range
                                   value=0, step=1)
if EMOTION_COL:
    emotion_choices = ["All"] + sorted(comments_df[EMOTION_COL].dropna().unique().tolist())
    selected_emotion = st.sidebar.selectbox("Emotion", emotion_choices, index=0)
else:
    selected_emotion = "All"

keyword = st.sidebar.text_input("Search comments (keyword)", value="")
color_picker = st.sidebar.color_picker("Primary chart color", value="#2b8cff")
show_only_noise = st.sidebar.checkbox("Show only -1 (noise) docs", value=False)

st.sidebar.markdown("---")
st.sidebar.write("Display options")
show_sample_in_cards = st.sidebar.checkbox("Show sample text in topic cards", value=True)
cards_per_row = st.sidebar.selectbox("Cards per row", options=[1,2,3,4], index=2)

# --- Filtering logic ---
df = comments_df.copy()

if selected_topics and "All" not in selected_topics:
    # Every choice other than "All" is str(int), so int() cannot fail here.
    wanted_topics = [int(t) for t in selected_topics]
    df = df[df[dom_col].isin(wanted_topics)]

if EMOTION_COL and selected_emotion != "All":
    df = df[df[EMOTION_COL] == selected_emotion]

if keyword:
    # Vectorised, case-insensitive substring search; missing text never matches.
    needle = keyword.lower()
    keyword_mask = pd.Series(False, index=df.index)
    for text_col in ("comment_text", "sample_comments"):
        if text_col in df.columns:
            haystack = df[text_col].fillna("").astype(str).str.lower()
            keyword_mask |= haystack.str.contains(needle, regex=False)
    df = df[keyword_mask]

if show_only_noise:
    df = df[df[dom_col] == -1]

if HAS_SIZE and TOPIC_NUM_COL is not None and min_topic_size > 0:
    allowed = topic_summary.loc[topic_summary["size"] >= min_topic_size, TOPIC_NUM_COL]
    df = df[df[dom_col].isin(allowed)]

# --- KPIs row ---
k1, k2, k3 = st.columns([1,1,2])
k1.metric("Filtered comments", f"{len(df):,}")
k2.metric("Topics represented", f"{df[dom_col].nunique()}")
# value_counts() ignores NaN, so it can be empty even when df is not.
emotion_counts = df[EMOTION_COL].value_counts() if EMOTION_COL else pd.Series(dtype="int64")
top_emotion = str(emotion_counts.idxmax()) if not emotion_counts.empty else "N/A"
k3.metric("Top emotion (filtered)", top_emotion)

st.markdown("---")

# --- Main charts ---
c1, c2 = st.columns([2,1])

with c1:
    st.subheader("Topic sizes")
    if df.empty:
        st.info("No comments match the current filters.")
    else:
        topic_counts = df[dom_col].value_counts().reset_index()
        topic_counts.columns = ["topic", "count"]
        topic_counts["label"] = topic_counts["topic"].map(lambda t: topic_label_map.get(int(t), f"Topic {t}"))
        fig = px.bar(topic_counts.sort_values("count"), x="count", y="label", orientation="h",
                     color="count", color_continuous_scale=[color_picker, "#e6f2ff"], text="count", height=420)
        fig.update_layout(coloraxis_showscale=False, margin=dict(l=10,r=10,t=40,b=10))
        st.plotly_chart(fig, use_container_width=True)

with c2:
    st.subheader("Emotion distribution")
    if EMOTION_COL:
        ec = emotion_counts.reset_index()
        ec.columns = ["emotion", "count"]
        fig2 = px.pie(ec, names="emotion", values="count", color_discrete_sequence=px.colors.qualitative.Pastel)
        fig2.update_traces(textposition='inside', textinfo='percent+label')
        st.plotly_chart(fig2, use_container_width=True)
    else:
        st.info("No emotion column found. Run sentiment/emotion pipeline to enable this chart.")

st.markdown("---")

# --- Topic cards / grid ---
st.subheader("Topic previews")
display_topics = topic_summary.copy()
if HAS_SIZE:
    display_topics = display_topics[display_topics["size"] >= min_topic_size]
    display_topics = display_topics.sort_values("size", ascending=False)
elif TOPIC_NUM_COL:
    display_topics = display_topics.sort_values(TOPIC_NUM_COL)

cards = st.columns(cards_per_row)
# Only show the comment columns that actually exist in the data.
comment_cols = [c for c in ("comment_id", "comment_text") if c in df.columns]
# Place cards by display position, not by index label: after sorting, the
# labels no longer follow the on-screen order.
for pos, (idx, row) in enumerate(display_topics.head(30).iterrows()):
    with cards[pos % cards_per_row]:
        tnum = int(row[TOPIC_NUM_COL]) if TOPIC_NUM_COL else idx
        size = int(row["size"]) if HAS_SIZE else "?"
        top_words = _as_text(row.get(TOP_WORDS_COL, ""))
        st.markdown(f"### Topic {tnum} — {size} docs")
        st.markdown(f"**Top:** `{top_words}`")
        if show_sample_in_cards:
            sample = _as_text(row.get(SAMPLE_COMMENTS_COL, ""))
            if sample:
                st.caption(sample[:300] + ("..." if len(sample) > 300 else ""))
        # Include the position in the key so duplicate topic ids cannot collide.
        if st.button(f"View comments (topic {tnum})", key=f"view_{pos}_{tnum}"):
            sub = df[df[dom_col] == tnum]
            if sub.empty:
                st.info("No comments for this topic (in current filters).")
            else:
                st.write(f"Showing {len(sub)} comments for topic {tnum}")
                shown = sub[comment_cols] if comment_cols else sub
                st.dataframe(shown.head(200), use_container_width=True)

st.markdown("---")

# --- Comment browser and download ---
st.subheader("Browse filtered comments")
preview_cols = ["comment_id", "comment_text", dom_col]
if EMOTION_COL:
    preview_cols += [EMOTION_COL, "top_emotion_score"]
# Columns that are absent from the data are simply skipped.
available_cols = [c for c in preview_cols if c in df.columns]
st.dataframe(df[available_cols].head(500), height=400)

csv_bytes = df.to_csv(index=False).encode("utf-8")
st.download_button("Download filtered comments", csv_bytes, file_name="filtered_comments.csv", mime="text/csv")

st.markdown("---")
st.caption("Tip: use the controls on the left to filter topics, emotions, and keywords. Click topic cards to view related comments.")
'''

# Inject the CSV paths. repr() produces a valid, fully escaped Python string
# literal for any path (backslashes, quotes, trailing separators included).
app_code = (
    app_template
    .replace("__TOPIC_SUMMARY__", repr(str(TOPIC_SUMMARY_CSV)))
    .replace("__COMMENTS_CSV__", repr(str(COMMENTS_CSV)))
)

# write file
APP_SCRIPT.write_text(app_code, encoding="utf-8")
print("Wrote:", APP_SCRIPT)

Wrote: C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\app\test_app.py


In [None]:
# ------------------ RUN STREAMLIT ------------------
# This will open the app in your default browser.
# Streamlit is started through the kernel's own interpreter so the install from
# this environment is used even when no `streamlit` executable is on PATH.
# The call blocks until the Streamlit process exits.
subprocess.run([sys.executable, "-m", "streamlit", "run", str(APP_SCRIPT)])