## Imports

In [30]:
import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
from scipy.spatial.distance import cdist
import plotly.express as px
import plotly.express as px
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean

## Créer un modèle prédictif

In [31]:
client_id = 'a390c89e42c947fd806e973ffb0015e6'
client_secret = 'e2d01d5846e44084923d09afac2ef3a1'
redirect_uri = 'http://localhost:8888/callback'

spotify_client = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri, scope='user-library-read playlist-modify-public'))

## Ajout des moyennes des caractéristiques des playlists au dataset

In [32]:
csv_filename = 'data/prepared_data.csv'
spotify_data = pd.read_csv(csv_filename)

## K-means clustering

In [33]:
numeric_columns = ['key', 'danceability', 'liveness', 'valence', 'year', 'acousticness', 'duration_ms',
                   'energy', 'explicit', 'instrumentalness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=70, random_state=42))
])

X = spotify_data[numeric_columns]
song_cluster_pipeline.fit(X)

# Predict cluster labels for each song
spotify_data['cluster_label'] = song_cluster_pipeline.predict(X)

# Count the number of songs in each cluster for each playlist
playlist_cluster_distribution = spotify_data.groupby(['playlistname', 'cluster_label']).size().unstack(fill_value=0)

In [34]:
def recommend_playlist(song_list, spotify_data, playlist_cluster_distribution):
    # Get the cluster labels for the given songs
    song_clusters = spotify_data[spotify_data['trackname'].isin(song_list)]['cluster_label']
    
    # Calculate the distribution of the given songs across clusters
    song_cluster_dist = song_clusters.value_counts(normalize=True).reindex(range(70), fill_value=0)
    
    # Find the most similar playlist based on Euclidean distance
    closest_playlist = None
    closest_distance = float('inf')
    for playlist in playlist_cluster_distribution.index:
        print(closest_playlist)
        playlist_dist = playlist_cluster_distribution.loc[playlist].values
        distance = euclidean(song_cluster_dist, playlist_dist)
        if distance < closest_distance:
            closest_distance = distance
            closest_playlist = playlist
    
    return closest_playlist

def visualize_clusters(spotify_data):
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(spotify_data[numeric_columns])
    pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
    pca_df['cluster_label'] = spotify_data['cluster_label']
    pca_df['name'] = spotify_data['trackname']
    pca_df['artists'] = spotify_data['artistname']
    
    fig = px.scatter(
        pca_df, 
        x='PC1', 
        y='PC2', 
        color='cluster_label', 
    )
    fig.update_layout(title='Clusters of Songs', xaxis_title='PC1', yaxis_title='PC2')
    fig.show()

visualize_clusters(spotify_data)

In [35]:
example_song_list = [
    "Für Elise",
    "Hungarian Dance No. 5",
    "Sonata No. 23 In F Minor, Op. 57 Appassionata"" Assai Allegro""",
]



recommended_playlist = recommend_playlist(example_song_list, spotify_data, playlist_cluster_distribution)
print(f"The recommended playlist is: {recommended_playlist}")

None
C418
C418
Classique
Classique
Classique
Ghibli songs
Ghibli songs
Ghibli songs
Ghibli songs
Ghibli songs
The recommended playlist is: Ghibli songs


## Créer un modèle de recommandation aléatoire

## Comparer le score du modèle prédictif à celui du modèle aléatoire

In [36]:
# dans une fonction qui s'occupe des tests