# Spotify Track Popularity Prediction

## Initialization

In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from kaggle import KaggleApi
from sklearn.compose import ColumnTransformer
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

In [2]:
# Experiment tracking with MLflow
import mlflow
from mlflow.models import infer_signature

## Configuration

In [3]:
# Numeric columns used as model inputs. The list order also determines the
# column order of the feature frame assembled in the data-preparation section.
NUMERICAL_FEATURES = [
    "danceability", "loudness", "energy", "tempo", "valence",
    "speechiness", "liveness", "acousticness", "instrumentalness",
    "duration_ms", "year",
]

# Categorical columns used as model inputs (one-hot encoded when modeling).
CATEGORICAL_FEATURES = ["genre"]

# Name of the binary label column derived from the `popularity` column.
TARGET = "verdict"

# Single seed shared by the train/test split, the oversampler and the classifier.
RANDOM_STATE = 42

## Data Ingestion

In [4]:
# Authenticate with the Kaggle API, then download the dataset archive and
# unzip it into ./data.
api = KaggleApi()
api.authenticate()

api.dataset_download_files(
    dataset="amitanshjoshi/spotify-1million-tracks",
    path="./data",
    unzip=True,
)

Dataset URL: https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks


In [5]:
# Load the downloaded CSV into a DataFrame and preview the first rows.
spotify_tracks = pd.read_csv(filepath_or_buffer="./data/spotify_data.csv")
spotify_tracks.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


In [6]:
# Turn on MLflow autologging (with dataset logging disabled), then point the
# client at the local tracking server and select the "song_pop" experiment.
mlflow.autolog(log_datasets=False)
mlflow.set_tracking_uri("http://127.0.0.1:4000")
mlflow.set_experiment("song_pop")

# URI of the run's "model" artifact; reused later when evaluating the model.
# NOTE(review): no explicit mlflow.start_run() appears in the visible cells,
# yet the URI displayed further down already contains a run ID -- presumably
# this call starts the run implicitly. Confirm against the MLflow docs.
model_uri = mlflow.get_artifact_uri("model")

2024/12/04 17:04:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/04 17:04:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


## Data Preparation

In [7]:
# Add the popularity verdict: 1 when `popularity` is at least 50, otherwise 0.
# The comparison is vectorised over the whole column; a row-wise
# `DataFrame.apply(..., axis=1)` would invoke a Python lambda once per row,
# which is needlessly slow on a dataset of this size. The explicit "int64"
# cast keeps the same integer dtype the row-wise version produced.
spotify_tracks[TARGET] = (spotify_tracks["popularity"] >= 50).astype("int64")

In [8]:
# Restrict the frame to the model inputs (numeric first, then categorical)
# plus the label column, and preview the result.
feature_columns = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]
features = spotify_tracks[[*feature_columns, TARGET]]
features.head()

Unnamed: 0,danceability,loudness,energy,tempo,valence,speechiness,liveness,acousticness,instrumentalness,duration_ms,year,genre,verdict
0,0.483,-10.058,0.303,133.406,0.139,0.0429,0.115,0.694,0.0,240166,2012,acoustic,1
1,0.572,-10.286,0.454,140.182,0.515,0.0258,0.0974,0.477,1.4e-05,216387,2012,acoustic,1
2,0.409,-13.711,0.234,139.832,0.145,0.0323,0.0895,0.338,5e-05,158960,2012,acoustic,1
3,0.392,-9.845,0.251,204.961,0.508,0.0363,0.0797,0.807,0.0,304293,2012,acoustic,1
4,0.43,-5.419,0.791,171.864,0.217,0.0302,0.11,0.0726,0.0193,244320,2012,acoustic,1


In [9]:
# Split into train and test partitions using train_test_split's default
# proportions and the shared seed.
train_data, test_data = train_test_split(features, random_state=RANDOM_STATE)

# Separate model inputs from the label for the training partition.
train_input, train_output = train_data[feature_columns], train_data[TARGET]

# Randomly oversample the training partition only; the test partition is left
# untouched.
oversampler = RandomOverSampler(random_state=RANDOM_STATE)
train_input_ros, train_output_ros = oversampler.fit_resample(
    train_input, train_output
)

## Modeling

In [10]:
# Per-column-type preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones.
numerical_pipeline = Pipeline([("encoder", StandardScaler())])

# handle_unknown="ignore": a genre that was not present when the encoder was
# fitted is encoded as all zeros at predict time instead of raising an error,
# so the fitted (and logged) model does not crash on unseen categories.
# Encoding of categories seen during fit is unchanged.
categorical_pipeline = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns to its preprocessing pipeline.
preprocessing_pipeline = ColumnTransformer(
    [
        ("numerical_preprocessor", numerical_pipeline, NUMERICAL_FEATURES),
        ("categorical_pipeline", categorical_pipeline, CATEGORICAL_FEATURES),
    ]
)

# Full model: preprocessing followed by a seeded XGBoost classifier.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessing_pipeline),
        ("estimator", XGBClassifier(random_state=RANDOM_STATE)),
    ]
)

In [11]:
# Train the full pipeline on the oversampled training partition.
# NOTE(review): MLflow autologging is enabled in an earlier cell, so this fit
# is presumably what records the model artifact that `model_uri` refers to --
# TODO confirm in the tracking UI.
pipeline.fit(train_input_ros, train_output_ros)



In [12]:
# Display the model artifact URI that was captured in the MLflow setup cell.
model_uri

'mlflow-artifacts:/585997797702861412/764b5c26a8a347e39c084d5ce2abc573/artifacts/model'

In [13]:
# Separate the held-out partition into inputs and labels, then predict a
# label for every held-out row with the fitted pipeline.
test_input, test_output = test_data[feature_columns], test_data[TARGET]

predict_output = pipeline.predict(test_input)

In [14]:
# Score the held-out predictions: balanced accuracy, plus F1 averaged across
# the classes with average="weighted".
bas = balanced_accuracy_score(test_output, predict_output)
f1 = f1_score(test_output, predict_output, average="weighted")

In [15]:
# Record the pipeline and the held-out scores on the MLflow run.
# NOTE(review): the pipeline object is logged under the generic key "Param"
# (presumably stored as its string form) -- confirm this is intended, given
# that autologging is also enabled.
mlflow.log_param("Param", pipeline)

metrics = dict(test_balanced_accuracy=bas, test_f1=f1)
mlflow.log_metrics(metrics)

In [16]:
# Evaluate the logged model on the held-out partition with MLflow's default
# classifier evaluator. The label column is referenced through TARGET rather
# than a repeated string literal, so it cannot drift from the configuration.
result = mlflow.evaluate(
    model_uri,
    test_data,
    targets=TARGET,
    model_type="classifier",
    evaluators=["default"],
)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/12/04 17:11:30 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/12/04 17:11:33 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/12/04 17:11:37 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


In [17]:
#mlflow.evaluate(model_uri, test_data, targets='verdict',model_type="classifier",evaluators=["default"])

In [18]:
# Close the MLflow run, recording its status as finished.
mlflow.end_run(status="FINISHED")

2024/12/04 17:11:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run sincere-ant-901 at: http://127.0.0.1:4000/#/experiments/585997797702861412/runs/764b5c26a8a347e39c084d5ce2abc573.
2024/12/04 17:11:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:4000/#/experiments/585997797702861412.
