# Configurações

## Importações

In [2]:
import pandas as pd

## Definição do Dataframe

In [3]:
import os
from pathlib import Path
import shutil
import kagglehub

# Download the Spotify dataset through Kaggle Hub and keep a local copy named
# spotify_songs.csv inside ../datasets (the following cell reads that file).

# Local folder that holds the notebook's datasets.
datasets_dir = Path("../datasets")
# parents=True: also create missing intermediate folders instead of failing.
datasets_dir.mkdir(parents=True, exist_ok=True)

# Final location of the CSV; defined up front so it can be checked even when
# the download step below fails.
destination_csv = datasets_dir / "spotify_songs.csv"

print("Iniciando download do dataset via Kaggle Hub...")

try:
    # Download using kagglehub; `path` is the folder holding the dataset files.
    path = kagglehub.dataset_download("bwandowando/spotify-songs-with-attributes-and-lyrics")
    print(f"Dataset baixado para: {path}")

    # Only songs_with_attributes_and_lyrics.csv is needed; look it up directly
    # at the top level of the download folder.
    candidate = Path(path) / "songs_with_attributes_and_lyrics.csv"
    source_file = candidate if candidate.is_file() else None

    if source_file:
        print(f"\nArquivo encontrado: {source_file.name}")
        # Copy into the datasets folder under the name spotify_songs.csv.
        shutil.copy(source_file, destination_csv)
        print(f"Arquivo copiado para: {destination_csv.name}")
    else:
        print("❌ Arquivo songs_with_attributes_and_lyrics.csv não encontrado!")
        print("\nArquivos disponíveis no download:")
        for file in Path(path).glob("*.csv"):
            print(f"- {file.name}")

except Exception as e:
    # Best effort: a failed download is tolerated as long as a copy from a
    # previous run is still available (checked right below).
    print(f"Erro ao fazer download: {e}")

# Fail fast with a clear message instead of letting a later cell crash on a
# missing file with an unrelated-looking error.
if not destination_csv.is_file():
    raise FileNotFoundError(
        f"Dataset local não encontrado em {destination_csv}; "
        "o download/cópia acima não foi concluído."
    )

  from .autonotebook import tqdm as notebook_tqdm


Iniciando download do dataset via Kaggle Hub...
Dataset baixado para: /home/dti-dev/.cache/kagglehub/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics/versions/19

Arquivo encontrado: songs_with_attributes_and_lyrics.csv
Dataset baixado para: /home/dti-dev/.cache/kagglehub/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics/versions/19

Arquivo encontrado: songs_with_attributes_and_lyrics.csv
Arquivo copiado para: spotify_songs.csv
Arquivo copiado para: spotify_songs.csv


In [4]:
# Load the local copy of the dataset into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../datasets/spotify_songs.csv')

# Conhecendo as Features

In [5]:
# Preview the first five rows (the default of head(), spelled out).
df.head(n=5)

Unnamed: 0,id,name,album_name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,lyrics
0,0Prct5TDjAnEgIqbxcldY9,!,UNDEN!ABLE,['HELLYEAH'],0.415,0.605,7,-11.157,1,0.0575,0.00116,0.838,0.471,0.193,100.059,79500.0,"He said he came from Jamaica,\n he owned a cou..."
1,2ASl4wirkeYm3OWZxXKYuq,!!,,Yxngxr1,0.788,0.648,7,-9.135,0,0.315,0.9,0.0,0.176,0.287,79.998,114000.0,"Fucked a bitch, now she running with my kids\n..."
2,69lcggVPmOr9cvPx9kLiiN,!!! - Interlude,Where I Belong EP,['Glowie'],0.0,0.0354,7,-20.151,0,0.0,0.908,0.0,0.479,0.0,0.0,11413.0,"Oh, my God, I'm going crazy\n"
3,4U7dlZjg1s9pjdppqZy0fm,!!De Repente!!,Un Palo Al Agua (20 Grandes Canciones),['Rosendo'],0.657,0.882,5,-6.34,1,0.0385,0.0074,1.3e-05,0.0474,0.939,123.588,198173.0,Continuamente se extraña la gente si no puede ...
4,4v1IBp3Y3rpkWmWzIlkYju,!!De Repente!!,Fuera De Lugar,['Rosendo'],0.659,0.893,5,-8.531,1,0.0411,0.0922,1.9e-05,0.0534,0.951,123.6,199827.0,Continuamente se extraña la gente si no puede ...


In [6]:
# Column labels of the loaded DataFrame.
columns = df.columns

# Print the labels as a numbered list, counting from 1.
print('As features do dataset são:')
for position, label in enumerate(columns, start=1):
    print(f'{position}. {label}')

As features do dataset são:
1. id
2. name
3. album_name
4. artists
5. danceability
6. energy
7. key
8. loudness
9. mode
10. speechiness
11. acousticness
12. instrumentalness
13. liveness
14. valence
15. tempo
16. duration_ms
17. lyrics


# Limpeza dos dados

## Análise de nulos

In [7]:
def verificar_nulos(dataframe):
    """Summarise the missing values of each column.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        A DataFrame with one row per column of ``dataframe`` and two columns:
        ``'Nulos'`` (number of null entries) and ``'Porcentagem (%)'``
        (that number as a percentage of the row count).
    """
    contagem_nulos = dataframe.isna().sum()
    n_linhas = len(dataframe)
    return pd.DataFrame({
        'Nulos': contagem_nulos,
        'Porcentagem (%)': contagem_nulos / n_linhas * 100,
    })

In [8]:
# Show the null count and null percentage of every column of `df`.
verificar_nulos(dataframe=df)

Unnamed: 0,Nulos,Porcentagem (%)
id,0,0.0
name,11,0.001151
album_name,569763,59.641063
artists,2,0.000209
danceability,0,0.0
energy,0,0.0
key,0,0.0
loudness,0,0.0
mode,0,0.0
speechiness,0,0.0


## Análise descritiva

In [9]:
# Descriptive statistics with the default quartiles written out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0
mean,0.55071,0.652442,-7.833732,0.083638,0.282962,0.081875,0.22019,0.488119,122.226093,234144.1
std,0.169784,0.238824,3.792018,0.092929,0.3118,0.212789,0.195938,0.251468,29.536303,90683.68
min,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,1586.0
25%,0.436,0.482,-9.75,0.0345,0.0119,0.0,0.0989,0.282,99.021,184933.0
50%,0.558,0.687,-7.041,0.0478,0.142,3.9e-05,0.137,0.477,120.661,221307.0
75%,0.675,0.857,-5.148,0.087625,0.518,0.00866,0.285,0.69,140.094,265640.0
max,0.993,1.0,4.882,0.966,0.996,1.0,1.0,1.0,246.13,5764624.0


# Gerar o dataframe

In [10]:
# Write the DataFrame to a UTF-8 CSV file, leaving the row index out.
df.to_csv(
    path_or_buf='../datasets/data_clean.csv',
    encoding='utf-8',
    index=False,
)

# Resultados

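Aparentemente, os dados deste dataset de 2000 músicas estão tratados. Foi encontrada apenas uma inconsistência: algumas músicas possuem o gênero "set()", o que muito provavelmente foi um erro de input.

**Nota (revisão):** este resumo não condiz com as análises acima, que mostram 955.320 registros, nenhuma coluna de gênero e cerca de 59,6% de valores nulos em `album_name`; a conclusão precisa ser revisada.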