# Ce notebook a pour but de préparer les datasets de test et d'entraînement

In [14]:
import numpy as np
import pandas as pd
import csv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy import SpotifyException
import os

## Retirer les lignes incomplètes et les doublons

In [15]:
# Location of the raw playlist export, relative to the notebook.
csv_filename = 'data/spotify_dataset.csv'

# Rows that pandas flags as "bad lines" are skipped instead of aborting the load.
df = pd.read_csv(csv_filename, on_bad_lines='skip')

# Normalise the header: trim surrounding whitespace and remove double quotes.
df.columns = [column.strip().replace('"', '') for column in df.columns]

# Discard rows with any missing value, then exact duplicate rows.
df_cleaned = df.dropna().drop_duplicates()

# Append the user's id to the 'Starred' and 'Liked from Radio' playlist names
def update_playlist_name(row):
    """Return the playlist name of ``row``, made user-specific for generic names.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            'playlistname' and 'user_id' keys.

    Returns:
        The playlist name with the user id appended (no separator) when the
        name is exactly 'Starred' or 'Liked from Radio'; the unchanged
        playlist name otherwise.
    """
    playlist = row['playlistname']
    if playlist in ('Starred', 'Liked from Radio'):
        return playlist + row['user_id']
    return playlist

# Make the generic playlist names user-specific by appending the user id.
# This is the vectorised equivalent of applying update_playlist_name to every
# row: a row-wise apply(axis=1) calls a Python function once per row, whereas
# only the rows named 'Starred' / 'Liked from Radio' need to be touched.
generic_mask = df_cleaned['playlistname'].isin(['Starred', 'Liked from Radio'])
df_cleaned.loc[generic_mask, 'playlistname'] = (
    df_cleaned.loc[generic_mask, 'playlistname'] + df_cleaned.loc[generic_mask, 'user_id']
)

# la colonne user_id n'est pas utile
# df_cleaned = df_cleaned.drop('user_id', axis=1)

In [16]:
# Preview the first rows of the cleaned frame.
df_cleaned.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [17]:
# Summary statistics (count / unique / top / freq) of every column of the cleaned frame.
print(df_cleaned.describe())

                                 user_id artistname trackname  \
count                           12856828   12856828  12856828   
unique                             15914     289602   2004520   
top     4398de6902abde3351347b048fcdc287  Daft Punk     Intro   
freq                              295274      36086      6675   

                                   playlistname  
count                                  12856828  
unique                                   165459  
top     Starred4398de6902abde3351347b048fcdc287  
freq                                      47362  


## Compter le nombre de sons par playlist

In [18]:
# Number of rows per playlist name in the cleaned frame.
value_counts = df_cleaned['playlistname'].value_counts()

# Spread of the playlist sizes.
min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

for label, stat in (
    ("Minimum", min_frequency),
    ("Maximum", max_frequency),
    ("Average", average),
    ("Median", median),
):
    print(f"{label} frequency:", stat)

Minimum frequency: 1
Maximum frequency: 47362
Average frequency: 77.70401126563075
Median frequency: 20.0


## Enlever les playlists dont le nombre de sons ne dépasse pas la valeur médiane

In [19]:
# Keep only the playlists that hold strictly more rows than the median playlist size.
# The threshold is derived from df_cleaned inside this cell: the module-level
# `median` is reassigned by a later cell (computed there from filtered_df), so
# reading it here would make a re-run of this cell filter on a different value.
playlist_sizes = df_cleaned['playlistname'].value_counts()
median_playlist_size = playlist_sizes.median()
kept_playlists = playlist_sizes.index[playlist_sizes > median_playlist_size]

# A membership test on the playlist name selects the same rows, in the same
# order, as a per-group Python predicate, without one Python call per playlist.
filtered_df = df_cleaned[df_cleaned['playlistname'].isin(kept_playlists)]
print(filtered_df.describe())

                                 user_id artistname trackname  \
count                           11972305   11972305  11972305   
unique                             14694     276342   1900562   
top     4398de6902abde3351347b048fcdc287  Daft Punk     Intro   
freq                              295274      34553      5859   

                                   playlistname  
count                                  11972305  
unique                                    81916  
top     Starred4398de6902abde3351347b048fcdc287  
freq                                      47362  


In [20]:
# Number of rows per playlist name, this time in the filtered frame.
# NOTE(review): this rebinds value_counts, min_frequency, max_frequency, average
# and median, which an earlier cell computed from df_cleaned.
value_counts = filtered_df['playlistname'].value_counts()

min_frequency, max_frequency = value_counts.min(), value_counts.max()
average, median = value_counts.mean(), value_counts.median()

for label, stat in (
    ("Minimum", min_frequency),
    ("Maximum", max_frequency),
    ("Average", average),
    ("Median", median),
):
    print(f"{label} frequency:", stat)

Minimum frequency: 21
Maximum frequency: 47362
Average frequency: 146.1534376678549
Median frequency: 51.0


## Récupérer les caractéristiques des sons grâce à l'API de Spotify

### credentials

In [21]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError here instead of
# failing later during authentication.
# NOTE: the client id / secret that used to be hardcoded in this cell must be
# considered leaked and should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
# The redirect URI is not a secret; the previous value is kept as the default.
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback')

spotify_client = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=redirect_uri,
        scope='user-library-read',
    )
)

In [22]:
# Number of rows remaining after the playlist-size filter.
len(filtered_df)

11972305

In [23]:
# Work on the first 1000 rows only: 1000 tracks = 3-4 minutes to fetch the ids.
# .copy() makes test_df independent of filtered_df, so the columns added to it
# in a later cell are not written into a slice of the big frame.
test_df = filtered_df.head(1000).copy()
test_df.head(10)

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
5,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010
6,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
7,9cc0cfd4d7d7885102480dd99e7a90d6,Joe Echo,Beautiful,HARD ROCK 2010
8,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,"Blackbird - Live at CitiField, NYC - Digital A...",HARD ROCK 2010
9,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010


In [24]:
def get_track_ids(track_names, artist_names, dataframe):
    """Look up the Spotify id of each (track, artist) pair and drop unmatched rows.

    One search request (limited to a single result) is sent through the
    module-level ``spotify_client`` for every pair; the first hit provides the id.

    Args:
        track_names: Track titles, in the same order as the rows of ``dataframe``.
        artist_names: Artist names, parallel to ``track_names``.
        dataframe: Frame whose i-th row corresponds to the i-th pair.

    Returns:
        A ``(track_ids, dataframe)`` tuple: the ids that were found, and a copy
        of ``dataframe`` without the rows whose search returned nothing. The
        returned frame always has a fresh 0..n-1 index, so the i-th id belongs
        to the row labelled i.
    """
    track_ids = []
    missing_positions = set()
    for position, (track_name, artist_name) in enumerate(zip(track_names, artist_names)):
        results = spotify_client.search(q=f'track:{track_name} artist:{artist_name}', type='track', limit=1)
        items = results['tracks']['items']
        if items:
            track_ids.append(items[0]['id'])
        else:
            missing_positions.add(position)

    # Select by position rather than by label, so that duplicate index labels
    # cannot remove extra rows. The index is reset even when nothing was
    # dropped: the input is usually a slice with a non-contiguous index, and
    # callers address the result by the labels 0..n-1.
    kept_positions = [pos for pos in range(len(dataframe)) if pos not in missing_positions]
    dataframe = dataframe.iloc[kept_positions].reset_index(drop=True)

    return track_ids, dataframe

In [25]:
# Resolve the Spotify id of every sampled row; rows without a search hit are removed from test_df.
# NOTE(review): test_df is rebound to the reduced frame, so re-running this cell
# searches the already-reduced frame again.
track_ids, test_df = get_track_ids(test_df['trackname'].tolist(), test_df['artistname'].tolist(), test_df)
print(test_df)

                              user_id                        artistname  \
0    9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
1    9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
2    9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
3    9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
4    9cc0cfd4d7d7885102480dd99e7a90d6                            Lissie   
..                                ...                               ...   
885  07f0fc3be95dcd878966b1f9572ff670                          Deftones   
886  07f0fc3be95dcd878966b1f9572ff670                        Audioslave   
887  07f0fc3be95dcd878966b1f9572ff670                            John 5   
888  07f0fc3be95dcd878966b1f9572ff670                               C2C   
889  07f0fc3be95dcd878966b1f9572ff670                 Infected Mushroom   

                                             trackname    playlistname  
0                 (The Ang

In [36]:
# Query the API in batches of 50 ids.
# NOTE(review): 50 is presumably the per-request id limit of the tracks endpoint — confirm.
split = [track_ids[i:i+50] for i in range(0, len(track_ids), 50)]
song_data = []
fieldnames = ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
              'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 
              'popularity', 'release_date', 'speechiness', 'tempo', 'uri']

# The loop variable must not be called `track_ids`: that would overwrite the
# full id list with the last batch and corrupt any re-run of this cell.
for batch_ids in split:
    audio_features = spotify_client.audio_features(batch_ids)
    track_details = spotify_client.tracks(batch_ids)['tracks']

    for track_id, audio_feature, details in zip(batch_ids, audio_features, track_details):
        # The API can return None in place of an entry it has no data for.
        # Fall back to empty mappings so that the row is kept, with missing
        # values, and stays aligned with the matching row of test_df.
        audio_feature = audio_feature or {}
        details = details or {}
        release_date = (details.get('album') or {}).get('release_date')
        explicit = details.get('explicit')

        artists = ', '.join(artist['name'] for artist in details.get('artists') or [])

        song_info = {
            'id': track_id,
            'artists': artists,
            'key': audio_feature.get('key'),
            'danceability': audio_feature.get('danceability'),
            'liveness': audio_feature.get('liveness'),
            'valence': audio_feature.get('valence'),
            # The release date starts with the four-digit year.
            'year': release_date[:4] if release_date else None,
            'acousticness': audio_feature.get('acousticness'),
            'duration_ms': audio_feature.get('duration_ms'),
            'energy': audio_feature.get('energy'),
            'explicit': int(explicit) if explicit is not None else None,
            'instrumentalness': audio_feature.get('instrumentalness'),
            'loudness': audio_feature.get('loudness'),
            'mode': audio_feature.get('mode'),
            'popularity': details.get('popularity'),
            'speechiness': audio_feature.get('speechiness'),
            'tempo': audio_feature.get('tempo'),
            'release_date': release_date
        }

        song_data.append(song_info)

# song_data[i] describes the i-th row of test_df. Fail loudly if the two are
# out of step instead of silently attaching features to the wrong rows.
if len(song_data) != len(test_df):
    raise ValueError(
        f"{len(song_data)} tracks were fetched but test_df has {len(test_df)} rows; "
        "re-run the track id lookup before this cell"
    )

# Attach the features positionally, whatever the index labels of test_df are.
# dtype=object keeps the new columns as object columns holding plain Python values.
song_features = pd.DataFrame(song_data, index=test_df.index, dtype=object)
for col in song_features.columns:
    test_df[col] = song_features[col]


In [27]:
# Row count and preview of the sample frame.
print(len(test_df))
test_df.head()

890


Unnamed: 0,user_id,artistname,trackname,playlistname,id,artists,key,danceability,liveness,valence,...,duration_ms,energy,explicit,instrumentalness,loudness,mode,popularity,speechiness,tempo,release_date
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010,0GmLrYUBXDC5vti77zBZfJ,Elvis Costello,4,0.577,0.226,0.901,...,167680,0.636,0,0.00016,-9.825,1,38,0.0491,134.768,1977-07-22
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010,77XzsYwTkvLoveW01Lanrk,Elvis Costello & The Attractions,7,0.381,0.321,0.368,...,211630,0.918,0,0.0511,-9.451,1,39,0.0803,142.34,2020-11-06
2,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010,2Rv3hGdKSLeX30V55asE3s,Elvis Costello & The Attractions,0,0.608,0.285,0.736,...,181720,0.597,0,0.00342,-11.115,1,37,0.0276,120.077,1979-01-05
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,1v98rfd0an913AzHvMNG8a,Elvis Costello,1,0.557,0.112,0.377,...,204533,0.32,0,4e-05,-10.792,0,52,0.0438,176.647,1977-07-22
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,6wM5ZSUB7RMr7zYfGiO6d9,Lissie,3,0.582,0.0974,0.533,...,250069,0.643,0,0.0,-6.047,1,6,0.0309,122.084,2021-06-04


In [28]:
# Export the prepared frame to a CSV file so that it can be used from another file
test_df.to_csv('data/prepared_data.csv', index=False)

# Ajout de la colonne URI de la track dans copy_test_df

In [29]:
# test_df.head(1)

# print("colonnes avant : \n", test_df.columns)

# # Ajout une colonne URI
# def get_track_uri(track_name, artist_name, spotify_client):
#     max_length = 100  # Limite de longueur arbitraire pour éviter des requêtes trop longues
#     track_name = track_name[:max_length]
#     artist_name = artist_name[:max_length]
    
#     try:
#         results = spotify_client.search(q=f'track:{track_name} artist:{artist_name}', type='track', limit=1)
#         if results['tracks']['items']:
#             track_uri = results['tracks']['items'][0]['uri']
#             return track_uri
#         else:
#             return None
#     except spotipy.exceptions.SpotifyException as e:
#         print(f"Spotify error: {e}")
#         return None

# # Ajouter une colonne 'uri' contenant les identifiants URI dans copy_test_df
# #test_df['uri'] = test_df.apply(lambda row: get_track_uri(row['trackname'], row['artistname'], spotify_client), axis=1)

# print("colonnes apères : \n ", test_df.columns)


In [30]:
# print(test_df.columns)
# test_df.head(1)

# Ajout de la colonne URI de la track dans spotify_data

In [31]:
# spotify_data = pd.read_csv('./data/data.csv.zip')

# # Data format
# spotify_data['artists'] = spotify_data['artists'].str.replace("['", "", regex=False)
# spotify_data['artists'] = spotify_data['artists'].str.replace("']", "", regex=False)
# spotify_data['artists'] = spotify_data['artists'].str.replace("'", "", regex=False)

# spotify_data = spotify_data.rename(columns=lambda x: x.strip().replace('"', ''))
# spotify_data = spotify_data.rename(columns={'name': 'trackname', 'artists': 'artistname'})

# print("taille : ", len(spotify_data))
# print("colonnes avant : \n", spotify_data.columns)

In [32]:
# print("taille de spotify_data :", len(spotify_data))
# print("colonnes avant : \n", spotify_data.columns)

# # Déterminer où reprendre le traitement
# output_filename = 'new_spotify_data.csv'
# if os.path.exists(output_filename):
#     processed_data = pd.read_csv(output_filename)
#     processed_uris = processed_data['uri'].dropna().tolist()
#     last_processed_index = len(processed_uris)
# else:
#     last_processed_index = 0

# print(f"Reprenant à partir de l'index {last_processed_index}")

# # Traiter les données par lots de 10 chansons à partir du dernier point de reprise
# batch_size = 10
# for start in range(last_processed_index, len(spotify_data), batch_size):
#     end = start + batch_size
#     batch = spotify_data.iloc[start:end].copy()

#     # Ajouter la colonne 'uri' dans le batch
#     batch['uri'] = batch.apply(lambda row: get_track_uri(row['trackname'], row['artistname'], spotify_client), axis=1)

#     # Enregistrer le batch dans le fichier CSV
#     if start == last_processed_index and start == 0:
#         batch.to_csv(output_filename, index=False, mode='w', header=True)
#     else:
#         batch.to_csv(output_filename, index=False, mode='a', header=False)

#     print(f"Processed and saved batch {start} to {end}")

# print("Le fichier spotify_data_with_uri.csv a été créé avec les URI ajoutés.")
# print("colonnes après : \n", spotify_data.columns)

In [33]:
# new_spotify_data = pd.read_csv('new_spotify_data.csv')
# data_clean = pd.read_csv('data_clean.csv')

# # Afficher le nombre de chansons dans le nouveau fichier CSV
# nombre_de_chansons = len(new_spotify_data)
# print("Nombre de chansons dans le nouveau CSV :", nombre_de_chansons)

# print("colonnes après : \n", new_spotify_data.columns)


Nettoyage des URIs


# CHERCHER LES SONS EN COMMUN ENTRE LES DONNÉES DES PLAYLISTS ET L'AUTRE JEU DE DONNÉES

à faire

In [34]:
# # Filtrer les lignes avec des URIs valides
# new_spotify_data = new_spotify_data.dropna(subset=['uri'])
# data_clean = data_clean.dropna(subset=['uri'])

# # Rechercher les pistes en commun en utilisant l'URI
# common_tracks = pd.merge(new_spotify_data, data_clean, on='uri', how='inner', suffixes=('_spotify', '_data'))

# # Afficher le nombre de pistes en commun et les titres et artistes des pistes communes
# print(f"Nombre de pistes en commun: {len(common_tracks)}")

# ENREGISTRER test_df DANS UN NOUVEAU FICHIER CSV

In [35]:
# # Sauvegarder test_df en tant que nouveau fichier CSV
# output_filename = 'data_clean.csv'
# test_df.to_csv(output_filename, index=False)
# print(f"Le fichier {output_filename} a été créé.")