In [1]:
import pandas as pd
import re


In [2]:
# Load the raw feedback export, then keep an independent copy of the rows
# whose app_name is exactly "Spotify" so later column assignments do not
# touch the full frame.
df = pd.read_csv("UserFeedbackData.csv")
is_spotify = df["app_name"].eq("Spotify")
spotify_df = df.loc[is_spotify].copy()

spotify_df.shape


(1000, 7)

In [3]:
def clean_text(text):
    """Normalize one raw review value into a clean, lower-case string.

    Steps, in order: lower-case the text, remove URLs (any run of
    non-whitespace characters starting with ``http``), collapse every run of
    whitespace into a single space and trim both ends.

    Parameters
    ----------
    text : scalar
        Raw review content. Missing values (``None``, ``NaN``, ``pd.NA``)
        produce an empty string. Non-string scalars are converted with
        ``str()`` instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # Non-string scalars (e.g. a cell parsed as a number) have no .lower()
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"\s+", " ", text).strip() # normalize whitespace
    return text

# Store a cleaned version of each review's text next to the raw content
raw_content = spotify_df["content"]
spotify_df["clean_content"] = raw_content.map(clean_text)


In [4]:
# Severity weight derived from the rating: a score s becomes 6 - s,
# so lower ratings carry larger weights (e.g. score 1 -> weight 5).
spotify_df["severity_weight"] = spotify_df["score"].rsub(6)


In [5]:
# Create Impact Weight (from TU_count)
# Reviews with more likes have higher impact: the weight is 1 when
# TU_count <= 0, otherwise 1 + TU_count / max(TU_count), i.e. in (1, 2].
# The column maximum is computed once and the formula is vectorized, rather
# than re-scanning the whole column inside a per-row lambda (quadratic work).
tu_count = spotify_df["TU_count"]
max_tu_count = tu_count.max()
# mask() overwrites only rows with TU_count <= 0; missing counts stay NaN,
# and a maximum of 0 cannot leak a division-by-zero result into the column.
spotify_df["impact_weight"] = (1 + tu_count / max_tu_count).mask(tu_count <= 0, 1.0)


In [6]:
# Down-Weight Small Sample Releases
# Versions with more reviews get higher weight: each review's weight is its
# version's review count divided by the largest per-version review count,
# so the most-reviewed version gets 1.0.
version_counts = spotify_df["RC_ver"].value_counts()

max_count = version_counts.max()

# Look counts up with map() instead of indexing inside a lambda:
# value_counts() drops missing versions, so version_counts[NaN] would raise
# KeyError, whereas map() leaves those rows as NaN. It is also vectorized.
spotify_df["version_weight"] = spotify_df["RC_ver"].map(version_counts) / max_count



In [None]:
# Review-level weight: element-wise product of the severity, impact and
# version weights computed above.
spotify_df["final_weight"] = (
    spotify_df["severity_weight"]
    .mul(spotify_df["impact_weight"])
    .mul(spotify_df["version_weight"])
)


In [None]:
# Show the ten reviews with the largest final weight, together with the
# inputs that produced it.
report_columns = [
    "RC_ver",
    "score",
    "TU_count",
    "severity_weight",
    "version_weight",
    "final_weight",
]
spotify_df[report_columns].sort_values(by="final_weight", ascending=False).iloc[:10]


Unnamed: 0,RC_ver,score,TU_count,severity_weight,version_weight,final_weight
9391,8.8.36.522,1,704,5,1.0,5.554156
9025,8.8.36.522,1,648,5,1.0,5.510076
9503,8.8.36.522,1,413,5,1.0,5.325094
9043,8.8.36.522,1,227,5,1.0,5.178684
9034,8.8.36.522,1,222,5,1.0,5.174748
9398,8.8.36.522,1,216,5,1.0,5.170025
9049,8.8.36.522,1,147,5,1.0,5.115712
9489,8.8.36.522,1,145,5,1.0,5.114137
9084,8.8.36.522,1,117,5,1.0,5.092097
9895,8.8.36.522,1,90,5,1.0,5.070844


In [None]:
# Version-level regression score: total final_weight of all reviews for each
# RC_ver, ordered from the highest total to the lowest.
# NOTE(review): version_weight already grows with a version's review count,
# so summing per version scales with review volume twice - confirm intended.
weight_by_version = spotify_df.groupby("RC_ver")["final_weight"]
version_health = weight_by_version.sum().sort_values(ascending=False)

version_health


RC_ver
8.8.36.522    969.021883
8.8.40.470    754.022852
8.8.22.510    284.267623
8.8.20.544    147.910454
8.8.28.409    120.740486
8.8.26.408     60.183762
8.8.24.307     24.243685
8.8.32.508     10.182313
8.8.38.444      8.776855
8.8.34.429      1.275230
8.8.36.521      0.482683
8.8.26.56       0.018520
Name: final_weight, dtype: float64

In [14]:
# Persist the prepared frame (all derived columns included) without the
# DataFrame index column.
output_path = "spotify_prepared_for_agents.csv"
spotify_df.to_csv(output_path, index=False)
