In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import Isomap, TSNE
from sklearn.cluster import HDBSCAN, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import seaborn as sns
import colorcet as cc
import plotly.graph_objects as px
import umap
import plotly.graph_objects as go
from time import sleep
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
def visualize_dimensionality_reduction(df: pd.DataFrame, models: list, random_state: int = None):
    '''
    Visualize dimensionality reduction results for different models.

    Inputs:
    - df: pd.DataFrame
        The input DataFrame (assumed to be already scaled).
    - models: list
        List of dimensionality reduction models to visualize.
    - random_state: int or None, optional
        Random state for reproducibility.

    Returns:
    - None (displays scatterplots)
    '''

    models_folder = 'models'
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)

    num_models = len(models)
    fig, axes = plt.subplots(nrows=1, ncols=num_models, figsize=(5 * num_models, 5))

    for i, model in enumerate(models):
        model_instance = None

        if model == TSNE:
            model_instance = model(n_components=3)  # TSNE does not support random_state during initialization
        else:
            try:
                # Try to provide random_state if the model supports it
                model_instance = model(n_components=3, random_state=random_state)
            except TypeError:
                # If the model does not support random_state, initialize without it
                model_instance = model(n_components=3)

        # Fit and transform the data
        reduced_data = model_instance.fit_transform(df)

        # Create a scatterplot for the first 2 components using Seaborn
        sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], ax=axes[i])
        axes[i].set_title(f'{model.__name__} Scatterplot (2D)')
        axes[i].set_xlabel('Component 1')
        axes[i].set_ylabel('Component 2')

        # Save the model as a pickle file in the "models" folder
        model_filename = os.path.join(models_folder, f'{model.__name__}.pkl')
        with open(model_filename, 'wb') as model_file:
            pickle.dump(model_instance, model_file)

        print(f'Saved {model.__name__} model as {model_filename}')

    plt.tight_layout()
    plt.show()

    # Display 3D scatterplots using Plotly for the first 3 components
    for i, model in enumerate(models):
        model_instance = None

        if model == TSNE:
            model_instance = model(n_components=3)  # TSNE does not support random_state during initialization
        else:
            model_instance = model(n_components=3)

        reduced_data = model_instance.fit_transform(df)

        # Plot 3D scatterplot using Plotly
        fig_3d = px.scatter_3d(reduced_data, x=reduced_data[:, 0], y=reduced_data[:, 1], z=reduced_data[:, 2],
                               title=f'{model.__name__} 3D Scatterplot')
        fig_3d.show()

def GetAudioFeatures(client: SpotifyClientCredentials, df: pd.DataFrame, columnID='id', chunk_size=100) -> pd.DataFrame:
    
    completed = 0
    tracks = df[columnID].to_list()
    result_df = pd.DataFrame()

    for i in range(0, len(tracks), chunk_size):
        try:
            responses = client.audio_features(tracks=tracks[i:i+chunk_size])
            
            for response in responses:
                dftemp = pd.DataFrame([response])
                result_df = pd.concat([result_df, dftemp], ignore_index=True)

        except BaseException as err:
            print(f'Error processing tracks ({i} : {i+chunk_size}) -> {err}')
        
        completed += chunk_size
        print(f'({completed}/{len(tracks)}) {round((completed/len(tracks)*100),1)}%')
        sleep(1)

    return pd.merge(df, result_df, on=columnID)

def GetSong(client: SpotifyClientCredentials, limit=5) -> pd.DataFrame:
    
    Song = input("Please input your song Search:")

    try: 
        responses = client.search(Song, limit)
    except BaseException as err:
        print(f'Error processing tracks ({Song}) -> {err}')

    song_names = []
    artists = []
    song_id = []

    for i in range(0, limit):
        song_names.append(responses['tracks']['items'][i]['artists'][0]['name'])
        artists.append(responses['tracks']['items'][i]['name'])
        song_id.append(responses['tracks']['items'][i]['id'])

    print('The following Songs were found from Spotify, please indicate the track you would like to select')
    Selected_songs = pd.DataFrame({"Songs": song_names, "Artists": artists, "Spotify Song ID": song_id})
    display(Selected_songs)
    Selected_Song = input()

    print(f"Song {Selected_Song} Selected, downloading Spotify Data ...")

    Selected_Song_df = Selected_songs.loc[[int(Selected_Song)]]
    Selected_Song_df.columns = ['song', 'artist', 'id']
    return GetAudioFeatures(client=client, df=Selected_Song_df, chunk_size=1)

def drop_columns(df:pd.DataFrame, drop):
    df2 = df.copy()
    for col in drop:
        if col in df2.columns:
            df2 = df2.drop(col, axis=1)
    return df2

def SaveModel(model, name):

    # Check if folder exists
    models_folder = './models'
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)

    # Save the model as a pickle file in the "models" folder
    model_filename = os.path.join(models_folder, f'{name}.pkl')
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    

def StandardScaler_(df:pd.DataFrame, scaler = None):
    # Train Scaler if not provided
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(df)
        SaveModel(scaler, 'scaler')
    
    # Transform Data
    X_scaled = scaler.transform(df)
    return pd.DataFrame(X_scaled, columns = df.columns)

def UMAP_(df:pd.DataFrame, reducer = None):
    # Train UMAP if not provided
    if reducer is None:
        reducer = umap.UMAP(n_components=3, random_state=69)
        reducer.fit(df)
        SaveModel(reducer, 'reducer')

    X_umap_transformed = reducer.transform(df)
    return pd.DataFrame(X_umap_transformed, columns=["UMAP_1", "UMAP_2", "UMAP_3"])


def HDBSCAN_(df:pd.DataFrame):
    df2 = df.copy()

    model = HDBSCAN(min_samples=24)
    SaveModel(model, 'hdbscan')

    yhat = model.fit_predict(df2)
    df2["cluster"] = yhat
    return df2


def load_pickle_model(model_path):
    try:
        with open(model_path, 'rb') as file:
            loaded_model = pickle.load(file)
        return loaded_model
    except FileNotFoundError:
        print(f"Error: The file {model_path} was not found.")
        return None
    except Exception as e:
        print(f"Error loading the pickle model: {e}")
        return None
    
def SearchBulk(client: SpotifyClientCredentials, tracks:list, chunk_size=5) -> pd.DataFrame:
    song_names = []
    artists = []

    for i in range(0, len(tracks)):
        try:
            response = client.search(tracks[i], limit=1)
            song_names.append(response['tracks']['items'][0]['name'])
            artists.append(response['tracks']['items'][0]['artists'][0]['name'])

        except BaseException as err:
            print(f'Error processing tracks ({i}) -> {err}')
        
        if i % chunk_size == 0:
            sleep(1)
            print(f'({i}/{len(tracks)}) {round((i/len(tracks)*100),1)}%')
    
    return pd.DataFrame({"Song Name":song_names, "Artists": artists})

def determine_hotness(df1: pd.DataFrame, df2: pd.DataFrame):
    '''
    Takes a dataframe with 1 song (df1), checks if the id exists in another dataframe (df2):
    - If it exists:
        - If the existing id hotness is 1 -> set hot_or_not to 1 for the song in df1
        - If the existing id hotness is 0 -> set hot_or_not to 0 for the song in df1
    - If it does not exist:
        - Set hot_or_not to 0 for the song in df1
    - Returns df1
    '''
    if df1['id'].iloc[0] in df2['id'].values:
        if df2.loc[df2['id'] == df1['id'].iloc[0], 'hot_or_not'].any() == 1:
            df1['hot_or_not'] = 1
        else:
            df1['hot_or_not'] = 0
    else:
        df1['hot_or_not'] = 0

    return df1

def GetRecommened(pick, main, main_include_name, client: SpotifyClientCredentials):
    
    # Clean Pick columns
    pick = drop_columns(pick, ['Unnamed: 0', 
                               'analysis_url', 
                               'track_href', 
                               'uri', 
                               'type', 
                               'artists', 
                               'name', 
                               'Unnamed: 0.1', 
                               'genre',	
                               'song_name', 
                               'title', 
                               'Artists', 
                               'duration_ms', 
                               'time_signature', 
                               'song', 
                               'artist'])
    
    pick['hot_or_not'] = 0 # change this to function to determine if hot or not
    pick.set_index('id', inplace=True)
    

    # Standard Scaler
    scaler = load_pickle_model("./models/scaler.pkl")
    pick_scaled = StandardScaler_(df=pick, scaler=scaler)

    # UMAP
    reducer = load_pickle_model("./models/reducer.pkl")
    pick_UMAP = UMAP_(df=pick_scaled, reducer=reducer)

    # concat song list and picked song
    pick_merged = pd.concat([pick_UMAP, main])

    # HDBSCAN
    pick_scanned = HDBSCAN_(df=pick_merged)

    # Split dataframs into two
    pick_cluster = pick_scanned.iloc[[0]]['cluster'].iloc[0]

    clusters = pick_scanned.reset_index(drop=True)
    clusters = clusters.drop(clusters.index[0])

    # Get 5 random songs from same cluster
    random_picks = clusters[clusters['cluster'] == pick_cluster].sample(5)
    list_recommended = list(random_picks.index)

    print("Here are 5 recommended Songs for you to listen to:")
    display(SearchBulk(client, list(main_include_name.iloc[list_recommended]['song_name'])))


Input CSVs from ./Songs And combine to create Main DataFrame

In [3]:
# input Pandas code to load CSVs correctly to TOP100 & NOT_HOT pd dataFrames
try: 
    NOT_HOT = pd.read_csv("./songs/NOT_HOT.csv")
    TOP100 = pd.read_csv("./songs/TOP100.csv")
    display(NOT_HOT.head(), TOP100.head())
except:
    raise ValueError ("No Database for Hot or NotHot Songs found")

Unnamed: 0.2,Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name,Unnamed: 0,title,Artists
0,0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,...,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde,,,Ghostemane
1,1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,...,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology,,,Don Kenobi
2,2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,...,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote,,,gizmo
3,3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,...,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote),,,Kamiyada+
4,4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,...,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom,,,$uicideboy$


Unnamed: 0.1,Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,0,4xhsWYTOGcal8zt0J161CU,Lovin On Me,Jack Harlow,0.943,0.558,2,-4.911,1,0.0568,...,2e-06,0.0937,0.606,104.983,audio_features,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4
1,1,1BxfuPKGuaTgP7aM0Bbdwr,Cruel Summer,Taylor Swift,0.552,0.702,9,-5.707,1,0.157,...,2.1e-05,0.105,0.564,169.994,audio_features,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4
2,2,3rUGC1vUpkDG9CZFHMur1t,greedy,Tate McRae,0.75,0.733,6,-3.18,0,0.0319,...,0.0,0.114,0.844,111.018,audio_features,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1
3,3,2IGMVunIBsBLtEQyoI1Mu7,Paint The Town Red,Doja Cat,0.868,0.538,5,-8.603,1,0.174,...,3e-06,0.0901,0.732,99.968,audio_features,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4
4,4,4KULAymBBJcPRpk1yO4dOG,I Remember Everything (feat. Kacey Musgraves),Zach Bryan,0.429,0.453,0,-7.746,1,0.0459,...,2e-06,0.102,0.155,77.639,audio_features,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4


Clean Up Column Names of CSVs

In [4]:
TOP100 = drop_columns(TOP100, ['Unnamed: 0', 'analysis_url', 'track_href', 'uri', 'type', 'artists', 'duration_ms', 'time_signature', 'name'])
NOT_HOT = drop_columns(NOT_HOT, ['Unnamed: 0', 'analysis_url', 'track_href', 'uri', 'type', 'artists', 'Unnamed: 0.1', 'genre', 'title', 'Artists', 'duration_ms', 'time_signature'])

TOP100['hot_or_not'] = 1
NOT_HOT['hot_or_not'] = 0

TOP100.set_index('id', inplace=True)
NOT_HOT.set_index('id', inplace=True)

In [5]:
main_include_name = pd.concat([TOP100, NOT_HOT])
main = main_include_name.drop('song_name', axis=1)

In [6]:
main_include_name

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,hot_or_not,song_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4xhsWYTOGcal8zt0J161CU,0.943,0.558,2,-4.911,1,0.0568,0.00260,0.000002,0.0937,0.6060,104.983,1,
1BxfuPKGuaTgP7aM0Bbdwr,0.552,0.702,9,-5.707,1,0.1570,0.11700,0.000021,0.1050,0.5640,169.994,1,
3rUGC1vUpkDG9CZFHMur1t,0.750,0.733,6,-3.180,0,0.0319,0.25600,0.000000,0.1140,0.8440,111.018,1,
2IGMVunIBsBLtEQyoI1Mu7,0.868,0.538,5,-8.603,1,0.1740,0.26900,0.000003,0.0901,0.7320,99.968,1,
4KULAymBBJcPRpk1yO4dOG,0.429,0.453,0,-7.746,1,0.0459,0.55400,0.000002,0.1020,0.1550,77.639,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1YiFZjmohJD78nlsSjjHGT,0.644,0.301,10,-8.927,0,0.0308,0.03160,0.856000,0.0919,0.0768,119.989,0,Focus
7o5E34q2K5cpQfy2P1WEGW,0.603,0.555,8,-10.284,0,0.0277,0.01680,0.907000,0.1070,0.1290,220.036,0,The Cure
0Wdp9TyN3PqCoxSzH8vRwO,0.626,0.367,8,-11.263,1,0.0343,0.00349,0.956000,0.3730,0.1290,130.002,0,Blocks
2K7NWYBbqGzNLggH7LIhxy,0.616,0.572,11,-9.371,0,0.0755,0.01100,0.921000,0.1440,0.0389,113.338,0,Liu Kang


In [7]:
main

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,hot_or_not
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4xhsWYTOGcal8zt0J161CU,0.943,0.558,2,-4.911,1,0.0568,0.00260,0.000002,0.0937,0.6060,104.983,1
1BxfuPKGuaTgP7aM0Bbdwr,0.552,0.702,9,-5.707,1,0.1570,0.11700,0.000021,0.1050,0.5640,169.994,1
3rUGC1vUpkDG9CZFHMur1t,0.750,0.733,6,-3.180,0,0.0319,0.25600,0.000000,0.1140,0.8440,111.018,1
2IGMVunIBsBLtEQyoI1Mu7,0.868,0.538,5,-8.603,1,0.1740,0.26900,0.000003,0.0901,0.7320,99.968,1
4KULAymBBJcPRpk1yO4dOG,0.429,0.453,0,-7.746,1,0.0459,0.55400,0.000002,0.1020,0.1550,77.639,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1YiFZjmohJD78nlsSjjHGT,0.644,0.301,10,-8.927,0,0.0308,0.03160,0.856000,0.0919,0.0768,119.989,0
7o5E34q2K5cpQfy2P1WEGW,0.603,0.555,8,-10.284,0,0.0277,0.01680,0.907000,0.1070,0.1290,220.036,0
0Wdp9TyN3PqCoxSzH8vRwO,0.626,0.367,8,-11.263,1,0.0343,0.00349,0.956000,0.3730,0.1290,130.002,0
2K7NWYBbqGzNLggH7LIhxy,0.616,0.572,11,-9.371,0,0.0755,0.01100,0.921000,0.1440,0.0389,113.338,0


Scale Data with Standard Scaler

In [8]:
# Apply Standard Scaler Function to the Dataset
X_scaled_df = StandardScaler_(main)

Perform UMAP based on function above / 3 axis

In [9]:
# Apply UMAP transformation to scaled data
X_umap_transformed_df = UMAP_(X_scaled_df)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


HDBSCAN

In [10]:
HDBSCAN_df = HDBSCAN_(X_umap_transformed_df)
HDBSCAN_df

Unnamed: 0,UMAP_1,UMAP_2,UMAP_3,cluster
0,17.956457,4.598516,8.466094,0
1,17.836294,4.631825,8.540279,0
2,18.174124,4.858671,8.519135,0
3,17.980301,4.532536,8.507689,0
4,17.721245,4.761799,9.171669,0
...,...,...,...,...
3095,-7.807477,2.936217,7.947117,1
3096,-7.006412,2.812489,8.343808,1
3097,4.843967,1.364753,6.161686,3
3098,-7.741531,3.079112,7.689047,1


Load Credentials from credentials file

In [11]:
try:
    import credentials as c
    a, b = c.a, c.b
    #Initialize SpotiPy with user credentias #
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=a,
                                                            client_secret=b))
except:
    raise ValueError ("No Credentials or Spotify Connection was made, please make sure all files are in order")

Get Song Audio Features and Song ID from Spotipy

In [28]:
running = True

while running:
    pick = GetSong(sp)
    GetRecommened(pick, X_umap_transformed_df, main_include_name, sp)
    if input("Want another Song? (Y/n)") == "n":
        running = False
    os.system
    

The following Songs were found from Spotify, please indicate the track you would like to select


Unnamed: 0,Songs,Artists,Spotify Song ID
0,League of Legends,Legends Never Die,1FpVJ7HpZInE2GvhVE2TwT
1,Juice WRLD,Legends,1Knctxx9vGZxpZfF66BIEa
2,Sam Tinnesz,Legends Are Made,0yqrhHrcWLMjBylHReDN5u
3,Juice WRLD,Legends,6BjtaWm1T4kDWAqHrf8vEi
4,League of Legends,Legends Never Die - (Remix),66YtIqT0kN4958EXnCnAmE


Song 0 Selected, downloading Spotify Data ...
(1/1) 100.0%
Here are 5 recommended Songs for you to listen to:
(0/5) 0.0%


Unnamed: 0,Song Name,Artists
0,Serve It Out,Kracked Stickz
1,Finer Things,Polo G
2,She Feelin Nice (feat. Jamie Foxx),Pop Smoke
3,body bag (feat. YUNGBLUD & Bert McCracken of T...,Machine Gun Kelly
4,Undecided,Chris Brown
