In [1]:
import pandas as pd
import numpy as np

# Load the review table from the working directory. Later cells read the
# columns RC_ver, assigned_themes, final_weight, content and score from it.
df_exploded = pd.read_csv(
    filepath_or_buffer="spotify_reviews_multilabel.csv",
)


In [2]:
# Aggregate the review table to one row per (RC_ver, assigned_themes) pair:
#   Theme_Weight - sum of final_weight
#   Review_Count - number of non-null content values
#   Avg_Rating   - mean of score
theme_version_signal = df_exploded.groupby(["RC_ver", "assigned_themes"]).agg(
    Theme_Weight=("final_weight", "sum"),
    Review_Count=("content", "count"),
    Avg_Rating=("score", "mean"),
)

# Turn the group keys back into ordinary columns, then expose the theme key
# under the shorter name used by the rest of the notebook.
theme_version_signal = theme_version_signal.reset_index()
theme_version_signal = theme_version_signal.rename(
    columns={"assigned_themes": "theme"}
)


In [3]:
# Total Theme_Weight per release, kept as its own summary table
# (columns: RC_ver, Version_Total_Weight).
version_totals = (
    theme_version_signal
    .groupby("RC_ver")["Theme_Weight"]
    .sum()
    .reset_index(name="Version_Total_Weight")
)

# Attach each row's release total and the theme's share of that total.
# The total is looked up with map() and written with assign() instead of being
# merged in: merging the frame into itself made this cell fail on a second run
# (the total column came back as Version_Total_Weight_x / _y). assign()
# overwrites existing columns, so the cell can be re-run safely.
theme_version_signal = theme_version_signal.assign(
    Version_Total_Weight=lambda d: d["RC_ver"].map(
        version_totals.set_index("RC_ver")["Version_Total_Weight"]
    ),
    # Share of the release's total weight carried by this theme.
    Normalized_Signal=lambda d: d["Theme_Weight"] / d["Version_Total_Weight"],
)


In [4]:
# Order rows by release so the per-theme shift(1) in the next cell walks the
# releases in sequence.
# mergesort is a stable algorithm: rows that share an RC_ver keep the relative
# order they had before the sort, so the row order of the exported CSV is
# reproducible (the default quicksort leaves tied rows in unspecified order).
# NOTE(review): if RC_ver holds version strings this sort is lexicographic
# (e.g. "10.0" sorts before "9.0") - confirm the dtype/format of RC_ver.
theme_version_signal = theme_version_signal.sort_values(
    by="RC_ver",
    kind="mergesort",
)


In [5]:
# Prev_Signal: within each theme, the Normalized_Signal of the preceding row
# in the current row order (NaN for a theme's first row).
# NOTE(review): "preceding row" is the previous release in which the theme has
# a row, which is not necessarily the immediately preceding release - confirm
# this is the intended baseline.
theme_version_signal["Prev_Signal"] = theme_version_signal.groupby("theme")[
    "Normalized_Signal"
].shift(periods=1)

# Delta: change in the theme's share relative to Prev_Signal (NaN where there
# is no previous row).
theme_version_signal["Delta"] = theme_version_signal["Normalized_Signal"].sub(
    theme_version_signal["Prev_Signal"]
)


In [6]:
# Minimum increase in Normalized_Signal (an absolute change of 0.05 in the
# theme's share) for a row to be flagged as a regression.
REGRESSION_THRESHOLD = 0.05

# Rows whose Delta is NaN (no previous row for the theme) compare as False.
theme_version_signal["Is_Regression"] = theme_version_signal["Delta"].gt(
    REGRESSION_THRESHOLD
)


In [7]:
# Settings for the persistence check in the next cell.
# A theme/release row counts as "high signal" when its Normalized_Signal is
# strictly greater than this share (0.15 = 15% of the release's total weight).
PERSISTENCE_THRESHOLD = 0.15
# A theme is persistent once it has at least this many high-signal rows.
MIN_RELEASES = 3


In [8]:
# For every theme, count the rows whose Normalized_Signal is strictly above
# PERSISTENCE_THRESHOLD. The result has one row per theme with the columns
# theme and High_Signal_Release_Count.
theme_persistence = (
    theme_version_signal["Normalized_Signal"]
    .gt(PERSISTENCE_THRESHOLD)
    .groupby(theme_version_signal["theme"])
    .sum()
    .reset_index(name="High_Signal_Release_Count")
)

# A theme is persistent when that count reaches MIN_RELEASES. The count is a
# total over all rows for the theme; the rows need not be consecutive.
theme_persistence["Is_Persistent"] = theme_persistence[
    "High_Signal_Release_Count"
].ge(MIN_RELEASES)


In [9]:
# Combine the per-theme persistence summary with the per-release rows: each
# theme in theme_persistence is repeated once per matching row of
# theme_version_signal (left join on theme).
theme_health = pd.merge(
    theme_persistence,
    theme_version_signal,
    how="left",
    on="theme",
)


In [10]:
# Write the per-release signal table and the per-theme persistence table to
# CSV files in the working directory, without the index column.
theme_version_signal.to_csv(
    path_or_buf="theme_version_signal.csv",
    index=False,
)
theme_persistence.to_csv(
    path_or_buf="theme_persistence.csv",
    index=False,
)
