# Lab | Unsupervised learning intro

In [1]:
# Step 1: Data Collection and Audio Feature Extraction

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd


client_id = 'f5b49755ca164915be68aec842025edf'
client_secret = '7fc1dfcac90d4c1e9e5cfb8446426250'


sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

def get_songs_from_playlist(playlist_id):
    songs = []
    results = sp.playlist_tracks(playlist_id)
    for item in results['items']:
        track = item['track']
        songs.append({
            'name': track['name'],
            'artist': track['artists'][0]['name'],
            'id': track['id']
        })
    while results['next']:
        results = sp.next(results)
        for item in results['items']:
            track = item['track']
            songs.append({
                'name': track['name'],
                'artist': track['artists'][0]['name'],
                'id': track['id']
            })
    return songs


playlist_id = '12Wbv8sIx84T5uh6iOoJ7V'
collected_songs = get_songs_from_playlist(playlist_id)
print(f"Collected {len(collected_songs)} songs.")


def get_audio_features(songs):
    features_list = []
    for song in songs:
        features = sp.audio_features(song['id'])[0]
        if features:
            features_list.append({
                'name': song['name'],
                'artist': song['artist'],
                'id': song['id'],
                'danceability': features['danceability'],
                'energy': features['energy'],
                'key': features['key'],
                'loudness': features['loudness'],
                'mode': features['mode'],
                'speechiness': features['speechiness'],
                'acousticness': features['acousticness'],
                'instrumentalness': features['instrumentalness'],
                'liveness': features['liveness'],
                'valence': features['valence'],
                'tempo': features['tempo']
            })
    return pd.DataFrame(features_list)


collected_features_df = get_audio_features(collected_songs)
print(f"Extracted features for {len(collected_features_df)} songs.")


Collected 217 songs.
Extracted features for 217 songs.


In [3]:
# Step 2: Data Loading
import pandas as pd


file_path = r"C:\Users\ashis\Downloads\data for ironheck\spotify songs.csv"

spotify_songs_df = pd.read_csv(file_path)


print(spotify_songs_df.head())

print(f"Loaded {len(spotify_songs_df)} songs.")


        date  rank           song                         artist  last-week  \
0  11/6/2021     1     Easy On Me                          Adele        1.0   
1  11/6/2021     2           Stay  The Kid LAROI & Justin Bieber        2.0   
2  11/6/2021     3  Industry Baby        Lil Nas X & Jack Harlow        3.0   
3  11/6/2021     4     Fancy Like                   Walker Hayes        4.0   
4  11/6/2021     5     Bad Habits                     Ed Sheeran        5.0   

   peak-rank  weeks-on-board  
0          1               3  
1          1              16  
2          1              14  
3          3              19  
4          2              18  
Loaded 330087 songs.


In [2]:
#Step 3:Combine with a Larger Dataset
import pandas as pd
import os


local_file_path = r"C:\Users\ashis\Downloads\data for ironheck\spotify songs.csv"


if os.path.exists(local_file_path):
    kaggle_songs_df = pd.read_csv(local_file_path)
    print("Kaggle songs file loaded successfully.")
else:
    print("Kaggle songs file not found. Please check the file path.")


collected_features_df = pd.DataFrame({
    'name': [],      
    'artist': [],
   
})


combined_songs_df = pd.concat([collected_features_df, kaggle_songs_df], ignore_index=True)


combined_songs_df.drop_duplicates(subset=['name', 'artist'], inplace=True)


print(f"Combined dataset contains {len(combined_songs_df)} songs.")


Kaggle songs file loaded successfully.
Combined dataset contains 10205 songs.


In [9]:
# Step 4: Clustering Using Unsupervised Learning
# We'll use K-means to cluster the songs based on their audio features.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import os


local_file_path = r"C:\Users\ashis\Downloads\data for ironheck\spotify songs.csv"


if os.path.exists(local_file_path):
    kaggle_songs_df = pd.read_csv(local_file_path)
    print("File loaded successfully.")
else:
    print("File not found. Please check the file path.")

#
combined_songs_df = pd.concat([collected_features_df, kaggle_songs_df], ignore_index=True)


combined_songs_df.drop_duplicates(subset=['name', 'artist'], inplace=True)
print(f"Combined dataset contains {len(combined_songs_df)} songs.")


print("Available columns in combined_songs_df:")
print(combined_songs_df.columns.tolist())


features = ['danceability', 'energy', 'loudness', 'speechiness', 
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']


if all(feature in combined_songs_df.columns for feature in features):
 
    scaler = StandardScaler()
    combined_songs_df[features] = scaler.fit_transform(combined_songs_df[features])

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=10, random_state=42)  # Adjust the number of clusters as needed
    combined_songs_df['cluster'] = kmeans.fit_predict(combined_songs_df[features])

    print("Clustering complete. Songs are assigned to clusters.")
else:
    print("One or more features are missing from the combined dataset.")


File loaded successfully.
Combined dataset contains 10422 songs.
Available columns in combined_songs_df:
['name', 'artist', 'id', 'date', 'rank', 'song', 'last-week', 'peak-rank', 'weeks-on-board']
One or more features are missing from the combined dataset.


In [10]:
print("Available columns in collected_features_df:")
print(collected_features_df.columns.tolist())


Available columns in collected_features_df:
['name', 'artist', 'id']


In [1]:
# Step 5: spotify API
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd

client_id = 'f5b49755ca164915be68aec842025edf'
client_secret = '7fc1dfcac90d4c1e9e5cfb8446426250'
redirect_uri = 'http://localhost:9090/' 

from spotipy.oauth2 import SpotifyClientCredentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))
  

track_ids = track_ids = [
    '0A0eOcimSNRs2EQQlH7FFJ',  # Track ID 1
    '0MtVmhAx6CxNuxFIUc6Mj9',  # Track ID 2
    '6aUAF8JOd8zEl41B6I18xL', ] # Track ID 3
#['12Wbv8sIx84T5uh6iOoJ7V' ]
   


def fetch_audio_features(track_ids):
    audio_features_list = []

    for i in range(0, len(track_ids), 100):
        batch_ids = track_ids[i:i + 100]
        audio_features = sp.audio_features(batch_ids)
        audio_features_list.extend(audio_features)

    return audio_features_list

audio_features = fetch_audio_features(track_ids)


audio_features_df = pd.DataFrame(audio_features)


audio_features_df.to_csv('audio_features.csv', index=False)

print(audio_features_df.head())


   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.291   0.319    7   -10.465     1       0.0306         0.791   
1         0.171   0.626    8    -8.677     1       0.0486         0.873   
2         0.324   0.776    0    -6.784     1       0.0346         0.151   

   instrumentalness  liveness  valence    tempo            type  \
0            0.4680    0.0692    0.038  103.793  audio_features   
1            0.0252    0.0681    0.457  180.098  audio_features   
2            0.9170    0.0728    0.317  101.964  audio_features   

                       id                                   uri  \
0  0A0eOcimSNRs2EQQlH7FFJ  spotify:track:0A0eOcimSNRs2EQQlH7FFJ   
1  0MtVmhAx6CxNuxFIUc6Mj9  spotify:track:0MtVmhAx6CxNuxFIUc6Mj9   
2  6aUAF8JOd8zEl41B6I18xL  spotify:track:6aUAF8JOd8zEl41B6I18xL   

                                          track_href  \
0  https://api.spotify.com/v1/tracks/0A0eOcimSNRs...   
1  https://api.spotify.com/v1/tracks/0MtVmhAx6CxN

In [3]:

# Main Code: This includes your Spotify API setup, audio feature fetching, and KMeans model training.
# Scaler and Model Saving: This code block will handle saving the scaler and KMeans model to disk.
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pickle


client_id = 'f5b49755ca164915be68aec842025edf'
client_secret = '7fc1dfcac90d4c1e9e5cfb8446426250'
redirect_uri = 'http://localhost:9090/' 

from spotipy.oauth2 import SpotifyClientCredentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))




track_ids = [
    '0A0eOcimSNRs2EQQlH7FFJ',  # Track ID 1
    '0MtVmhAx6CxNuxFIUc6Mj9',  # Track ID 2
    '6aUAF8JOd8zEl41B6I18xL'    # Track ID 3
]


def fetch_audio_features(track_ids):
    audio_features_list = []
    for i in range(0, len(track_ids), 100):
        batch_ids = track_ids[i:i + 100]
        audio_features = sp.audio_features(batch_ids)
        audio_features_list.extend(audio_features)

    return audio_features_list


audio_features = fetch_audio_features(track_ids)


audio_features_df = pd.DataFrame(audio_features)

 
audio_features_df.to_csv('audio_features.csv', index=False)


print(audio_features_df.head())


sample_data = {
    'danceability': [0.7, 0.8],
    'energy': [0.9, 0.6],
    'loudness': [-5, -10],
}


sample_features = pd.DataFrame(sample_data)


scaler = StandardScaler()
scaler.fit(sample_features)

kmeans = KMeans(n_clusters=2)  
kmeans.fit(sample_features)


combined_songs_df = pd.DataFrame({
    'name': ['Song A', 'Song B', 'Song C'],
    'artist': ['Artist A', 'Artist B', 'Artist C'],
    'cluster': [0, 1, 0] 
})




with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

print("Scaler and KMeans model saved successfully.")

def recommend_songs(song_name, artist_name):
    # Check if the song is in the Billboard Hot 200 (mockup example)
    billboard_songs = ['Shape of You', 'Song B']  
    if song_name in billboard_songs:
        print(f"{song_name} is on the Billboard Hot 200!")

   
    results = sp.search(q=f"{song_name} {artist_name}", limit=1, type='track')
    if not results['tracks']['items']:
        return "Song not found on Spotify."

    track_id = results['tracks']['items'][0]['id']
    features = sp.audio_features([track_id])[0]

   
    if features is None:
        return "Features not available for the song."

   
    song_features = scaler.transform([[features['danceability'], features['energy'], features['loudness']]])


    cluster_label = kmeans.predict(song_features)[0]


    similar_songs = combined_songs_df[combined_songs_df['cluster'] == cluster_label]

 
    num_recommendations = min(len(similar_songs), 5) 
    if num_recommendations == 0:
        return "No similar songs found in the same cluster."

    recommendations = similar_songs[['name', 'artist']].sample(num_recommendations)

    return recommendations

# Example usage
recommendations = recommend_songs('Shape of You', 'Ed Sheeran')
print("Recommended songs:")
print(recommendations)


   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.291   0.319    7   -10.465     1       0.0306         0.791   
1         0.171   0.626    8    -8.677     1       0.0486         0.873   
2         0.324   0.776    0    -6.784     1       0.0346         0.151   

   instrumentalness  liveness  valence    tempo            type  \
0            0.4680    0.0692    0.038  103.793  audio_features   
1            0.0252    0.0681    0.457  180.098  audio_features   
2            0.9170    0.0728    0.317  101.964  audio_features   

                       id                                   uri  \
0  0A0eOcimSNRs2EQQlH7FFJ  spotify:track:0A0eOcimSNRs2EQQlH7FFJ   
1  0MtVmhAx6CxNuxFIUc6Mj9  spotify:track:0MtVmhAx6CxNuxFIUc6Mj9   
2  6aUAF8JOd8zEl41B6I18xL  spotify:track:6aUAF8JOd8zEl41B6I18xL   

                                          track_href  \
0  https://api.spotify.com/v1/tracks/0A0eOcimSNRs...   
1  https://api.spotify.com/v1/tracks/0MtVmhAx6CxN



In [None]:
# 1.Modular Structure: The saving of the scaler and KMeans model is included in the main code but clearly separated by comments. We could even create a separate function for saving if desired.
# 2.Flexibility: We can call the saving code whenever you've completed training and wish to save your models.
# 3.Easier Maintenance: This structure makes it clear where each part of the code lives, allowing for easier updates and debugging.