In [None]:
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)


In [None]:
# Définir le chemin du fichier (modifiez si nécessaire)
candidates = ['track_data_final.csv', 'spotify_data clean.csv', 'spotify_data_clean.csv']
DATA_PATH = next((p for p in candidates if os.path.exists(p)), None)
if DATA_PATH is None:
    raise FileNotFoundError(f"Aucun des fichiers attendus trouvés : {candidates}")
print('Chargement de :', DATA_PATH)
df = pd.read_csv(DATA_PATH, low_memory=False)
df.head()


                 track_id                           track_name  track_number  \
0  6pymOcrCnMuCWdgGVTvUgP                                    3            57   
1  2lWc1iJlz2NVcStV5fbtPG                               Clouds             1   
2  1msEuwSBneBKpVCZQcFTsU  Forever & Always (Taylor’s Version)            11   
3  7bcy34fBT2ap1L4bfPsl9q            I Didn't Change My Number             2   
4  0GLfodYacy3BJE7AI3A8en                             Man Down             7   

   track_popularity  track_duration_ms  explicit     artist_name  \
0                61             213173     False  Britney Spears   
1                67             158760     False           BUNT.   
2                63             225328     False    Taylor Swift   
3                72             158463      True   Billie Eilish   
4                57             267013     False         Rihanna   

   artist_popularity  artist_followers      artist_genres  \
0               80.0        17755451.0           

In [3]:
dm=pd.read_csv('spotify_data_clean.csv')
print(dm.head())
print(dm.info())
print(dm.isnull().sum())
print(dm.describe())


                 track_id                                       track_name  \
0  3EJS5LyekDim1Tf5rBFmZl                    Trippy Mane (ft. Project Pat)   
1  1oQW6G2ZiwMuHqlPpP27DB                                             OMG!   
2  7mdkjzoIYlf1rx9EtBpGmU                                      Hard 2 Find   
3  67rW0Zl7oB3qEpD5YWWE5w  Still Get Like That (ft. Project Pat & Starrah)   
4  15xptTfRBrjsppW0INUZjf                            ride me like a harley   

   track_number  track_popularity  explicit artist_name  artist_popularity  \
0             4                 0      True       Diplo                 77   
1             1                 0      True    Yelawolf                 64   
2             1                 4      True   Riff Raff                 48   
3             8                30      True       Diplo                 77   
4             2                 0      True     Rumelis                 48   

   artist_followers                      artist_genres  \
0   

# Exploration et nettoyage des données
Ce notebook effectue : exploration (colonnes, shape, dtypes, valeurs manquantes) et nettoyage (doublons, valeurs NULL, conversions de types, suppression de caractères spéciaux).

In [4]:
# Exploration rapide
print('Colonnes disponibles :')
print(list(df.columns))
print('\nNombre de lignes, colonnes :', df.shape)
print('\nTypes de données :')
print(df.dtypes)
print('\nRésumé info() :')
print(df.info())
# Valeurs manquantes
missing = df.isnull().sum().sort_values(ascending=False)
missing_percent = (missing / len(df) * 100).round(2)
pd.DataFrame({'missing_count': missing, 'missing_percent': missing_percent}).head(30)

Colonnes disponibles :
['track_id', 'track_name', 'track_number', 'track_popularity', 'track_duration_ms', 'explicit', 'artist_name', 'artist_popularity', 'artist_followers', 'artist_genres', 'album_id', 'album_name', 'album_release_date', 'album_total_tracks', 'album_type']

Nombre de lignes, colonnes : (8778, 15)

Types de données :
track_id               object
track_name             object
track_number            int64
track_popularity        int64
track_duration_ms       int64
explicit                 bool
artist_name            object
artist_popularity     float64
artist_followers      float64
artist_genres          object
album_id               object
album_name             object
album_release_date     object
album_total_tracks      int64
album_type             object
dtype: object

Résumé info() :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8778 entries, 0 to 8777
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              -

Unnamed: 0,missing_count,missing_percent
artist_name,4,0.05
artist_followers,4,0.05
artist_popularity,4,0.05
artist_genres,4,0.05
album_name,2,0.02
track_name,2,0.02
track_id,0,0.0
explicit,0,0.0
track_duration_ms,0,0.0
track_popularity,0,0.0


In [5]:
# 1) Supprimer les doublons
# Si les colonnes 'artist' et 'title' existent, utiliser comme subset sinon supprimer lignes entièrement dupliquées
dup_subset = None
if set(['artist','title']).issubset(df.columns):
    dup_subset = ['artist','title']
before = len(df)
if dup_subset:
    df = df.drop_duplicates(subset=dup_subset)
else:
    df = df.drop_duplicates()
after = len(df)
print(f'Doublons supprimés : {before - after}')
df.shape

Doublons supprimés : 0


(8778, 15)

In [6]:
# 2) Gérer les valeurs NULL - options : suppression ou imputation simple
# Par défaut : supprimer les lignes si les colonnes clefs manquent (artist/title) sinon garder
key_cols = [c for c in ['artist','title'] if c in df.columns]
if key_cols:
    before = len(df)
    df = df.dropna(subset=key_cols)
    print('Lignes supprimées pour valeurs NULL sur', key_cols, ':', before - len(df))
# Pour d'autres colonnes numériques, on peut imputer par la médiane si souhaité
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
for col in numeric_cols:
    if df[col].isnull().any():
        med = df[col].median()
        df[col] = df[col].fillna(med)
        print(f"Imputation médiane pour {col}: {med}")


Imputation médiane pour artist_popularity: 74.0
Imputation médiane pour artist_followers: 6272266.5


In [7]:
# 3) Convertir les types (exemples courants)
# Convertir 'year' en entier si présent
if 'year' in df.columns:
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')
    print('Converted year -> Int64')
# Convertir 'popularity' en numérique
if 'popularity' in df.columns:
    df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce').fillna(0).astype(int)
    print('Converted popularity -> int')
# Convertir une colonne date courante si nommée 'release_date' ou similaire
date_candidates = [c for c in df.columns if 'date' in c.lower()]
for dc in date_candidates:
    df[dc] = pd.to_datetime(df[dc], errors='coerce')
    print(f'Converted {dc} -> datetime')
# Convertir durée (si en ms) vers secondes si colonne 'duration_ms' existe
if 'duration_ms' in df.columns:
    df['duration_sec'] = pd.to_numeric(df['duration_ms'], errors='coerce')/1000.0
    print('Ajout duration_sec à partir de duration_ms')
df.dtypes

Converted album_release_date -> datetime


track_id                      object
track_name                    object
track_number                   int64
track_popularity               int64
track_duration_ms              int64
explicit                        bool
artist_name                   object
artist_popularity            float64
artist_followers             float64
artist_genres                 object
album_id                      object
album_name                    object
album_release_date    datetime64[ns]
album_total_tracks             int64
album_type                    object
dtype: object