# Ce fichier a pour but de préparer les datasets de test et d'entrainement

In [2]:
import csv
import os

import numpy as np
import pandas as pd
import spotipy
from spotipy import SpotifyException
from spotipy.oauth2 import SpotifyOAuth

## Retirer colonnes inutiles et doublons

In [3]:
# Path of the raw playlist export that is read below.
csv_filename = 'data/spotify_dataset.csv'

# Lines that cannot be parsed are skipped instead of aborting the load.
df = pd.read_csv(csv_filename, on_bad_lines='skip')

# Normalise the header: trim surrounding whitespace, then remove double quotes.
stripped_names = df.columns.str.strip()
df.columns = stripped_names.str.replace('"', '')

# Drop rows containing any missing value, then exact duplicate rows.
df_cleaned = df.dropna().drop_duplicates()

# Add the user's id to the 'Starred' and 'Liked from Radio' playlists
def update_playlist_name(row):
    """Return the playlist name of ``row``, suffixed with the user id when needed.

    When ``row['playlistname']`` is exactly 'Starred' or 'Liked from Radio',
    ``row['user_id']`` is appended directly to it (no separator). Any other
    name is returned unchanged.

    Args:
        row: mapping-like object exposing 'playlistname' and 'user_id'.

    Returns:
        The (possibly suffixed) playlist name.
    """
    name = row['playlistname']
    if name in ('Starred', 'Liked from Radio'):
        return name + row['user_id']
    return name

# Append the owner's user id to the 'Starred' / 'Liked from Radio' playlist
# names. This applies the same rule as `update_playlist_name` (name + user_id,
# no separator), but with a boolean mask on whole columns instead of a
# row-wise `apply(..., axis=1)`, which calls a Python function once per row.
is_generic = df_cleaned['playlistname'].isin(['Starred', 'Liked from Radio'])
df_cleaned.loc[is_generic, 'playlistname'] = (
    df_cleaned.loc[is_generic, 'playlistname'] + df_cleaned.loc[is_generic, 'user_id']
)

# la colonne user_id n'est pas utile
# df_cleaned = df_cleaned.drop('user_id', axis=1)

In [4]:
df_cleaned.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [5]:
print(df_cleaned.describe())

                                 user_id artistname trackname  \
count                           12856828   12856828  12856828   
unique                             15914     289602   2004520   
top     4398de6902abde3351347b048fcdc287  Daft Punk     Intro   
freq                              295274      36086      6675   

                                   playlistname  
count                                  12856828  
unique                                   165459  
top     Starred4398de6902abde3351347b048fcdc287  
freq                                      47362  


## Compter le nombre de sons par playlist

In [6]:
# Number of rows recorded for each playlist name.
value_counts = df_cleaned['playlistname'].value_counts()

# Summary statistics of those per-playlist counts.
min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

for caption, statistic in (
    ("Minimum frequency:", min_frequency),
    ("Maximum frequency:", max_frequency),
    ("Average frequency:", average),
    ("Median frequency:", median),
):
    print(caption, statistic)

Minimum frequency: 1
Maximum frequency: 47362
Average frequency: 77.70401126563075
Median frequency: 20.0


## Enlever les playlists dont le nombre de sons ne dépasse pas la valeur médiane

In [7]:
# Keep only the rows of playlists that have strictly more rows than `median`.
# `transform('size')` gives every row the size of its playlist group, so the
# comparison yields a row-level boolean mask. This selects the same rows (same
# order, same index) as `groupby(...).filter(lambda x: len(x) > median)`
# without calling a Python lambda once per playlist.
playlist_sizes = df_cleaned.groupby('playlistname')['playlistname'].transform('size')
filtered_df = df_cleaned[playlist_sizes > median]
print(filtered_df.describe())

                                 user_id artistname trackname  \
count                           11972305   11972305  11972305   
unique                             14694     276342   1900562   
top     4398de6902abde3351347b048fcdc287  Daft Punk     Intro   
freq                              295274      34553      5859   

                                   playlistname  
count                                  11972305  
unique                                    81916  
top     Starred4398de6902abde3351347b048fcdc287  
freq                                      47362  


In [8]:
# Per-playlist row counts, recomputed on the filtered frame.
# NOTE(review): this rebinds `value_counts` and `median`, which the earlier
# filtering cell reads; re-running that cell after this one would filter
# against the new median. Restart and run top to bottom to stay consistent.
value_counts = filtered_df['playlistname'].value_counts()

min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

summary_lines = [
    ("Minimum frequency:", min_frequency),
    ("Maximum frequency:", max_frequency),
    ("Average frequency:", average),
    ("Median frequency:", median),
]
for caption, statistic in summary_lines:
    print(caption, statistic)

Minimum frequency: 21
Maximum frequency: 47362
Average frequency: 146.1534376678549
Median frequency: 51.0


## Récupérer les caractéristiques des sons grâce à l'API de Spotify

### credentials

In [9]:
# Spotify API credentials. They are read from the environment so that real
# secrets never have to be written into (or committed with) the notebook.
# The fallbacks are the notebook's original placeholder values.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'clientid')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'clientsecret')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback')

# Authenticated client used by the lookup cells below.
spotify_client = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=redirect_uri,
        scope='user-library-read playlist-modify-public',
    )
)

In [109]:
# Work on the first 1000 rows only: per the original note, resolving the ids
# of 1000 tracks takes about 3-4 minutes.
# `.copy()` makes the sample an independent frame, so columns added to it later
# are not written onto a slice of `filtered_df` (SettingWithCopyWarning).
test_df = filtered_df.head(1000).copy()
test_df.head(10)

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
5,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010
6,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
7,9cc0cfd4d7d7885102480dd99e7a90d6,Joe Echo,Beautiful,HARD ROCK 2010
8,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,"Blackbird - Live at CitiField, NYC - Digital A...",HARD ROCK 2010
9,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010


In [112]:
def get_track_ids(track_names, artist_names, dataframe, client=None):
    """Look up a Spotify track id for every (track name, artist name) pair.

    Each pair is searched with ``track:<name> artist:<artist>`` and the id of
    the first hit is kept. Pairs without any hit are skipped, and the row at
    the same position in ``dataframe`` is left out of the returned frame.

    Args:
        track_names: iterable of track names.
        artist_names: iterable of artist names, parallel to ``track_names``.
        dataframe: frame expected to hold one row per pair, in the same order.
            It is not modified.
        client: object exposing ``search(q=..., type=..., limit=...)``.
            Defaults to the module-level ``spotify_client``.

    Returns:
        ``(track_ids, dataframe)`` where ``track_ids[k]`` belongs to row ``k``
        of the returned frame. The returned frame is always a new object with
        a fresh ``0..n-1`` index, whether or not rows were removed.
    """
    if client is None:
        client = spotify_client

    track_ids = []
    kept_positions = []
    for position, (track_name, artist_name) in enumerate(zip(track_names, artist_names)):
        results = client.search(q=f'track:{track_name} artist:{artist_name}', type='track', limit=1)
        items = results['tracks']['items']
        if items:
            track_ids.append(items[0]['id'])
            kept_positions.append(position)

    # Select rows by position rather than dropping by label: a label-based drop
    # removes every row sharing a duplicated index label. Resetting the index
    # unconditionally keeps row k aligned with track_ids[k] in every case.
    matched = dataframe.iloc[kept_positions].reset_index(drop=True)
    return track_ids, matched

In [113]:
# Resolve one Spotify id per sampled row; `test_df` is rebound to the frame
# returned by the lookup.
track_ids, test_df = get_track_ids(
    track_names=test_df['trackname'].tolist(),
    artist_names=test_df['artistname'].tolist(),
    dataframe=test_df,
)
print(test_df)

                              user_id                        artistname  \
0    9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
1    9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
2    9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
3    9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
4    9cc0cfd4d7d7885102480dd99e7a90d6                            Lissie   
..                                ...                               ...   
885  07f0fc3be95dcd878966b1f9572ff670                          Deftones   
886  07f0fc3be95dcd878966b1f9572ff670                        Audioslave   
887  07f0fc3be95dcd878966b1f9572ff670                            John 5   
888  07f0fc3be95dcd878966b1f9572ff670                               C2C   
889  07f0fc3be95dcd878966b1f9572ff670                 Infected Mushroom   

                                             trackname    playlistname  
0                 (The Ang

In [114]:
# Query the API in batches of 50 track ids.
split = [track_ids[i:i+50] for i in range(0, len(track_ids), 50)]
song_data = []
fieldnames = ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
              'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 
              'popularity', 'release_date', 'speechiness', 'tempo', 'uri']

# The loop variable is deliberately NOT named `track_ids`: reusing that name
# would leave only the last batch in `track_ids` after the loop, so re-running
# this cell would silently process 50 ids at most.
for batch_ids in split:
    audio_features = spotify_client.audio_features(batch_ids)
    track_details = spotify_client.tracks(batch_ids)['tracks']

    for audio_feature, details in zip(audio_features, track_details):
        # Guard against a missing (None) audio-features entry: the feature
        # columns are then left empty instead of raising a TypeError.
        # NOTE(review): confirm against the API docs that null entries occur.
        audio_feature = audio_feature or {}
        release_date = details['album']['release_date']

        song_info = {
            'artists': ', '.join(artist['name'] for artist in details['artists']),
            'key': audio_feature.get('key'),
            'danceability': audio_feature.get('danceability'),
            'liveness': audio_feature.get('liveness'),
            'valence': audio_feature.get('valence'),
            'year': release_date[:4],
            'acousticness': audio_feature.get('acousticness'),
            'duration_ms': audio_feature.get('duration_ms'),
            'energy': audio_feature.get('energy'),
            'explicit': int(details['explicit']),
            'instrumentalness': audio_feature.get('instrumentalness'),
            'loudness': audio_feature.get('loudness'),
            'mode': audio_feature.get('mode'),
            'popularity': details['popularity'],
            'speechiness': audio_feature.get('speechiness'),
            'tempo': audio_feature.get('tempo'),
            'release_date': release_date,
        }

        song_data.append(song_info)

# Attach the features to `test_df` by position (row k <-> k-th track id), one
# whole column at a time. This does not depend on `test_df` having a 0..n-1
# index and keeps numeric dtypes instead of filling object columns cell by cell.
features_df = pd.DataFrame(song_data)
if len(features_df) != len(test_df):
    raise ValueError(
        f"Expected one feature record per row of test_df, got {len(features_df)} "
        f"records for {len(test_df)} rows"
    )
for col in features_df.columns:
    test_df[col] = features_df[col].to_numpy()


In [116]:
# Print the row count first; a notebook cell only renders its last expression,
# so the preview has to come last to be displayed at all.
print(len(test_df))
test_df.head()

890


## Diviser le dataset en un set de test et un set d'entrainement

In [12]:
# enregistrer directement dans un csv