# Ce fichier a pour but de préparer les datasets de test et d'entrainement

In [10]:
import numpy as np
import pandas as pd
import csv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy import SpotifyException

## Retirer colonnes inutiles et doublons

In [3]:
# Location of the raw playlist export, relative to the working directory.
csv_filename = 'data/spotify_dataset.csv'

# Lines the CSV parser cannot read are skipped instead of aborting the load.
df = pd.read_csv(csv_filename, on_bad_lines='skip')

# Header clean-up: trim surrounding whitespace, then remove double quotes.
df.columns = (
    df.columns
    .str.strip()
    .str.replace('"', '')
)

# Discard rows containing a missing value, then exact duplicate rows.
df_cleaned = df.dropna().drop_duplicates()

# Ajouter l'id de l'user aux playlists 'Starred' et 'Liked from Radio'
def update_playlist_name(row):
    """Return the playlist name of *row*, suffixed with its user id when generic.

    Playlists named exactly 'Starred' or 'Liked from Radio' get the row's
    'user_id' appended directly (no separator); every other name is returned
    unchanged.

    Args:
        row: Mapping-like object indexable by 'playlistname' and, for the two
            generic names, by 'user_id'.

    Returns:
        The possibly suffixed playlist name.
    """
    name = row['playlistname']
    if name not in ('Starred', 'Liked from Radio'):
        return name
    return name + row['user_id']

# Run update_playlist_name on every row and store the result back into the
# 'playlistname' column.
df_cleaned['playlistname'] = df_cleaned.apply(update_playlist_name, axis='columns')

# The 'user_id' column is not useful past this point, so it is removed.
df_cleaned = df_cleaned.drop(columns=['user_id'])

In [4]:
# Preview the first rows of the cleaned frame (shown as the cell's result).
df_cleaned.head()

Unnamed: 0,artistname,trackname,playlistname
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,Elvis Costello,Alison,HARD ROCK 2010


In [5]:
# Print summary statistics (count / unique / top / freq) for each column.
print(df_cleaned.describe())

       artistname trackname                             playlistname
count    12856828  12856828                                 12856828
unique     289602   2004520                                   165459
top     Daft Punk     Intro  Starred4398de6902abde3351347b048fcdc287
freq        36086      6675                                    47362


## Compter le nombre de sons par playlist

In [6]:
# Number of rows (tracks) recorded under each playlist name.
value_counts = df_cleaned['playlistname'].value_counts()

# Summary statistics of the playlist sizes. `median` is read again by the
# filtering cell that follows.
min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

print("Minimum frequency:", min_frequency)
print("Maximum frequency:", max_frequency)
print("Average frequency:", average)
print("Median frequency:", median)

Minimum frequency: 1
Maximum frequency: 47362
Average frequency: 77.70401126563075
Median frequency: 20.0


## Enlever les playlists qui n'ont pas plus de sons que la valeur médiane

In [8]:
# Keep only the playlists whose row count is strictly greater than the
# module-level `median`, then print summary statistics of what remains.
filtered_df = df_cleaned.groupby('playlistname').filter(
    lambda playlist_rows: playlist_rows.shape[0] > median
)
print(filtered_df.describe())

       artistname trackname                             playlistname
count    11972305  11972305                                 11972305
unique     276342   1900562                                    81916
top     Daft Punk     Intro  Starred4398de6902abde3351347b048fcdc287
freq        34553      5859                                    47362


In [9]:
# Recompute the playlist sizes on the filtered frame. This rebinds
# value_counts, min_frequency, max_frequency, average and median.
value_counts = filtered_df['playlistname'].value_counts()

min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

print("Minimum frequency:", min_frequency)
print("Maximum frequency:", max_frequency)
print("Average frequency:", average)
print("Median frequency:", median)

Minimum frequency: 21
Maximum frequency: 47362
Average frequency: 146.1534376678549
Median frequency: 51.0


## Récupérer les caractéristiques des sons grâce à l'API de Spotify

### credentials

In [11]:
import os

# Spotify application credentials. They are read from the environment
# (the SPOTIPY_* variable names spotipy itself recognises) so real secrets
# never have to be written into the notebook. The placeholder literals are
# kept as fallbacks, so behaviour is unchanged when the variables are unset.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'client id')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'cleint secret')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback')

# Authenticated client shared by the Spotify lookups below.
spotify_client = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=redirect_uri,
        scope='user-library-read playlist-modify-public',
    )
)

In [12]:
# Small sample: the first 100 rows of the filtered frame
# (presumably for trying the Spotify lookups on a reduced set; confirm).
test_df = filtered_df.head(100)

In [23]:
# pas encore prêt
def get_track_features(track_name, artist_name):
    """Look up one track on Spotify and collect its metadata and audio features.

    The module-level ``spotify_client`` is queried for the best match of
    ``track:<track_name> artist:<artist_name>``. For that match, the audio
    features and the track details are fetched and flattened into one dict.

    Args:
        track_name: Track title used in the search query; also stored as
            the 'name' field of the result.
        artist_name: Artist name used in the search query.

    Returns:
        A list containing one dict with the keys 'id', 'name', 'artists',
        'key', 'danceability', 'liveness', 'valence', 'year',
        'acousticness', 'duration_ms', 'energy', 'explicit',
        'instrumentalness', 'loudness', 'mode', 'popularity', 'speechiness',
        'tempo' and 'release_date'. The list is empty when the search has
        no match or no audio features are available for the match.
    """
    results = spotify_client.search(
        q=f'track:{track_name} artist:{artist_name}', type='track', limit=1
    )

    items = results['tracks']['items']
    if not items:
        return []

    track_id = items[0]['id']

    # The features entry may be missing or None for a track; guard against it
    # instead of failing on a None subscript below.
    features = spotify_client.audio_features(track_id)
    audio_feature = features[0] if features else None
    if audio_feature is None:
        return []

    details = spotify_client.track(track_id)
    artists = ', '.join(artist['name'] for artist in details['artists'])
    release_date = details['album']['release_date']

    song_info = {
        'id': track_id,
        'name': track_name,
        'artists': artists,
        'key': audio_feature['key'],
        'danceability': audio_feature['danceability'],
        'liveness': audio_feature['liveness'],
        'valence': audio_feature['valence'],
        # The year is the first four characters of the release date string.
        'year': release_date[:4],
        'acousticness': audio_feature['acousticness'],
        'duration_ms': audio_feature['duration_ms'],
        'energy': audio_feature['energy'],
        'explicit': int(details['explicit']),
        'instrumentalness': audio_feature['instrumentalness'],
        'loudness': audio_feature['loudness'],
        'mode': audio_feature['mode'],
        'popularity': details['popularity'],
        'speechiness': audio_feature['speechiness'],
        'tempo': audio_feature['tempo'],
        'release_date': release_date,
    }

    # The original built this list but never returned it.
    return [song_info]

{'id': '08F16baYbciTva9P4BvpiI', 'name': 'MAFIA', 'artists': 'Travis Scott', 'key': 0, 'danceability': 0.855, 'liveness': 0.111, 'valence': 0.385, 'year': '2021', 'acousticness': 0.0775, 'duration_ms': 240000, 'energy': 0.492, 'explicit': 1, 'instrumentalness': 6.71e-06, 'loudness': -7.578, 'mode': 0, 'popularity': 69, 'speechiness': 0.0483, 'tempo': 109.975, 'release_date': '2021-11-05'}


## Diviser le dataset en un set de test et un set d'entrainement

In [None]:
# enregistrer directement dans un csv