In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from bs4 import BeautifulSoup
import numpy as np
import requests
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the track entries of the vetting playlist export.
# The file is opened explicitly as UTF-8: without `encoding`, open() falls back
# to the locale's default, which can mis-decode or reject non-ASCII track and
# artist names on some platforms.
with open('vetting_playlist.json', 'r', encoding='utf-8') as file:
    vetting = json.load(file)['tracks']['items']

# Start of Data

In [3]:
# Playlist slice files to load: 'mpd.slice.15000-15999.json' through
# 'mpd.slice.30000-30999.json' (16 files, one per block of 1000 ids).
file_paths = [
    f'mpd.slice.{start}-{start + 999}.json'
    for start in range(15000, 31000, 1000)
]

# Row cap for quick experiments (e.g. 1000 for the DBSCAN runs).
# None loads every track of every listed file.
MAX_ROWS = None


def load_track_rows(paths, max_rows=None):
    """Flatten playlist slice files into one record per playlist track.

    Parameters
    ----------
    paths : iterable of str
        JSON files that each hold a top-level ``'playlists'`` list.
    max_rows : int or None
        Stop as soon as this many records have been collected. The check runs
        after each append, so a positive value is expected. ``None`` (default)
        reads everything.

    Returns
    -------
    list of dict
        Records with the keys ``playlist_name``, ``position_in_playlist``,
        ``track_id``, ``track_name``, ``artist_name``, ``duration_ms`` and
        ``album_name``.
    """
    rows = []
    for path in paths:
        # Explicit UTF-8 keeps non-ASCII names intact regardless of the
        # platform's default text encoding.
        with open(path, 'r', encoding='utf-8') as handle:
            playlists = json.load(handle)['playlists']
        for playlist in playlists:
            playlist_name = playlist['name']
            for track in playlist['tracks']:
                rows.append({
                    "playlist_name": playlist_name,
                    "position_in_playlist": track["pos"],
                    "track_id": track['track_uri'],
                    "track_name": track['track_name'],
                    "artist_name": track['artist_name'],
                    "duration_ms": track['duration_ms'],
                    "album_name": track['album_name']
                })
                if max_rows is not None and len(rows) >= max_rows:
                    return rows
    return rows


all_data = load_track_rows(file_paths, MAX_ROWS)
df = pd.DataFrame(all_data)


In [4]:
# Summarize the loaded track table: column dtypes, non-null counts, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1077598 entries, 0 to 1077597
Data columns (total 7 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   playlist_name         1077598 non-null  object
 1   position_in_playlist  1077598 non-null  int64 
 2   track_id              1077598 non-null  object
 3   track_name            1077598 non-null  object
 4   artist_name           1077598 non-null  object
 5   duration_ms           1077598 non-null  int64 
 6   album_name            1077598 non-null  object
dtypes: int64(2), object(5)
memory usage: 57.6+ MB


In [13]:
# Preview the first five rows of the track table.
df.head()

Unnamed: 0,playlist_name,position_in_playlist,track_id,track_name,artist_name,duration_ms,album_name
0,Wedding Music,0,spotify:track:2rb4cO7RczQFSvpjTJ4C2P,Always Forever,Phil Wickham,281880,Phil Wickham
1,Wedding Music,1,spotify:track:017nSBNU2XHwMV0NCWZCqg,Divine Romance,Phil Wickham,298026,Phil Wickham
2,Wedding Music,2,spotify:track:0W5TB5VNs0J16suh3r67P1,Messiah / You're Beautiful,Phil Wickham,293720,Cannons
3,Wedding Music,3,spotify:track:1gBnG1MiTNBBVzmuwP7Wii,Love Is Not A Fight,Warren Barfield,222986,Worth Fighting For
4,Wedding Music,4,spotify:track:0b99xsUKkETGwZGzpX987r,When I Say I Do,Matthew West,246000,Hold You Up EP


In [5]:
# Count how many playlist rows each (track_id, track_name, artist_name)
# combination accounts for; the result has one row per combination.
track_key = ['track_id', 'track_name', 'artist_name']
rows_per_track = df.groupby(track_key).size()
track_counts = rows_per_track.reset_index(name='count')

In [6]:
# Rescale each track's occurrence count linearly onto 1..10 and truncate to an
# integer, so only tracks at the maximum count receive a 10.
min_count = track_counts['count'].min()
max_count = track_counts['count'].max()
count_span = max_count - min_count

if count_span == 0:
    # Every track has the same count (plausible on a small sample). The
    # rescaling would compute 0/0 = NaN and astype(int) would raise, so give
    # all tracks the lowest score instead.
    track_counts['popularity_score'] = 1
else:
    track_counts['popularity_score'] = (
        (track_counts['count'] - min_count) / count_span * 9 + 1
    ).astype(int)

In [7]:
# Attach each track's popularity score to every playlist row. The left join
# keeps all rows of df; a row without a match would get NaN.
# NOTE(review): track_counts is grouped by (track_id, track_name, artist_name),
# so a track_id that occurs under more than one name/artist would match several
# lookup rows and duplicate playlist rows here - confirm track_id is unique.
popularity_lookup = track_counts[['track_id', 'popularity_score']]
df_with_popularity = pd.merge(df, popularity_lookup, how='left', on='track_id')

In [9]:
# Load the per-track feature table from CSV (attributes such as danceability,
# energy and loudness, keyed by a bare track `id`).
# NOTE(review): df's track_id values carry a 'spotify:track:' prefix while this
# table's `id` does not, so the prefix presumably has to be stripped before
# the two tables can be joined - confirm.
df1 = pd.read_csv('tracks_features.csv')

In [11]:
# Summarize the feature table: column dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204025 non-null  object 
 2   album             1204025 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode              1204025 non-null  int64  
 14  speechiness       1204025 non-null  float64
 15  acousticness      1204025 non-null  float64
 16  

In [12]:
# Preview the first five rows of the feature table.
df1.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


# Popularity Score

## What Does the Score Tell You?
### High Popularity Score (8-10):

Tracks with high scores are included in many playlists, indicating they are widely popular or well-liked.

### Medium Popularity Score (4-7):

Tracks with medium scores are moderately popular and appear in a reasonable number of playlists.

### Low Popularity Score (1-3):

Tracks with low scores are less popular and appear in only a few playlists.

In [None]:
# Summarize the merged table; it should show df's columns plus popularity_score.
df_with_popularity.info()

In [None]:
# Display the popularity_score column of the merged table.
df_with_popularity['popularity_score']

In [None]:
# Spot check: rows whose popularity score is exactly 6 (inside the 4-7
# "medium" band described in the markdown section).
df_with_popularity[df_with_popularity['popularity_score'] == 6]

In [None]:
# Re-inspect the raw track table (dtypes, non-null counts, memory use).
df.info()

In [None]:
# Preview the first five rows of the raw track table again.
df.head()

# This is where I started my work. Feel free to view it, but it is not accurate for the 1000-row sample used for DBSCAN.

In [None]:
# Keep the first row seen for each distinct track_name (all columns retained).
# NOTE(review): the result is a DataFrame despite its name, and different
# songs that share a title collapse into a single row.
is_repeat_title = df.duplicated(subset=['track_name'])
track_name = df[~is_repeat_title]

In [None]:
# Summarize the de-duplicated frame; its entry count is the number of
# distinct track_name values.
track_name.info()

In [None]:
# Keep the first row seen for each distinct artist_name (all columns retained),
# giving one representative playlist row per artist.
is_repeat_artist = df.duplicated(subset=['artist_name'])
artists = df[~is_repeat_artist]

In [None]:
# Summarize the de-duplicated frame; its entry count is the number of
# distinct artist_name values.
artists.info()

In [None]:
# Keep the first row seen for each distinct playlist_name (all columns retained).
# NOTE(review): playlists are identified by name only here, so separate
# playlists that share a name collapse into a single row.
is_repeat_playlist = df.duplicated(subset=['playlist_name'])
playlists = df[~is_repeat_playlist]

In [None]:
# Summarize the de-duplicated frame; its entry count is the number of
# distinct playlist_name values.
playlists.info()

In [None]:
# Disabled experiment: the whole cell is a triple-quoted string, so none of it
# executes. If enabled it would standard-scale duration_ms and popularity_score,
# fit DBSCAN with eps=0.8 and min_samples=50, write the labels into a new
# 'cluster' column on df_with_popularity (in place), and print the cluster
# count, the noise-point count (label -1) and the per-cluster sizes.
'''
X = df_with_popularity[['duration_ms', 'popularity_score']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

eps = 0.8
min_samples = 50

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X_scaled)

df_with_popularity['cluster'] = dbscan.labels_

print(df_with_popularity.head())

n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_noise = list(dbscan.labels_).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

print(df_with_popularity['cluster'].value_counts())
'''

In [None]:
# Disabled experiment: the whole cell is a triple-quoted string, so none of it
# executes. Same pipeline as the eps=0.8 cell (standard-scale duration_ms and
# popularity_score, fit DBSCAN with min_samples=50, store labels in 'cluster',
# print cluster and noise counts), but with eps=0.7.
'''
X = df_with_popularity[['duration_ms', 'popularity_score']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

eps = 0.7
min_samples = 50

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X_scaled)

df_with_popularity['cluster'] = dbscan.labels_

print(df_with_popularity.head())

n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_noise = list(dbscan.labels_).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

print(df_with_popularity['cluster'].value_counts())
'''

In [None]:
# Disabled experiment: the whole cell is a triple-quoted string, so none of it
# executes. Same pipeline as the eps=0.8 and eps=0.7 cells (standard-scale
# duration_ms and popularity_score, fit DBSCAN with min_samples=50, store
# labels in 'cluster', print cluster and noise counts), but with a much
# smaller neighbourhood radius, eps=0.09.
'''
X = df_with_popularity[['duration_ms', 'popularity_score']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

eps = 0.09
min_samples = 50

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X_scaled)

df_with_popularity['cluster'] = dbscan.labels_

print(df_with_popularity.head())

n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_noise = list(dbscan.labels_).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

print(df_with_popularity['cluster'].value_counts())
'''