In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [14]:
# Load data
df = pd.read_csv('high_popularity_spotify_data.csv')

In [15]:
# Simulate target: Randomly generate repeat_play (1 if repeated in a month, else 0)
np.random.seed(42)
df['repeat_play'] = np.random.choice([0, 1], size=len(df), p=[0.7, 0.3])


In [16]:
# Select features (excluding text and non-numeric ones)
features = ['energy', 'tempo', 'danceability', 'loudness', 'liveness',
            'valence', 'speechiness', 'instrumentalness', 'duration_ms', 'acousticness']

In [17]:
# Encode categorical genre if needed
if 'playlist_genre' in df.columns:
    le = LabelEncoder()
    df['genre_encoded'] = le.fit_transform(df['playlist_genre'])
    features.append('genre_encoded')

In [18]:
# Split data
X = df[features]
y = df['repeat_play']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [20]:
# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.89      0.77       228
           1       0.36      0.13      0.19       110

    accuracy                           0.64       338
   macro avg       0.52      0.51      0.48       338
weighted avg       0.57      0.64      0.58       338



In [21]:
# Recommend top songs by predicted repeat_play probability
df['repeat_prob'] = clf.predict_proba(X)[:, 1]
recommended_songs = df.sort_values(by='repeat_prob', ascending=False)[['id', 'track_artist', 'repeat_prob']].head(10)
print("Top 10 Recommended Songs:\n", recommended_songs)


Top 10 Recommended Songs:
                           id             track_artist  repeat_prob
1065  48UPSzbZjgc449aqz8bxox    Red Hot Chili Peppers     0.947500
52    48UPSzbZjgc449aqz8bxox    Red Hot Chili Peppers     0.947500
445   3vv9phIu6Y1vX3jcqaGz5Z  The Chainsmokers, ROZES     0.940000
715   6Qyc6fS4DsZjB2mRW9DsQs        The Goo Goo Dolls     0.940000
810   6Qyc6fS4DsZjB2mRW9DsQs        The Goo Goo Dolls     0.940000
1311  3vv9phIu6Y1vX3jcqaGz5Z  The Chainsmokers, ROZES     0.940000
1060  60a0Rd6pjrkxjPbaKzXjfq              Linkin Park     0.936667
678   6dOtVTDdiauQNBQEDOtlAB            Billie Eilish     0.920000
1     6dOtVTDdiauQNBQEDOtlAB            Billie Eilish     0.920000
1669  2PYlRGF5Wi5sDobsLH5gLX   Tiakola, Ryflo, Oskoow     0.920000
