# Configurações

## Importações

In [1]:
import pandas as pd

## Definição do Dataframe

In [2]:
import os
from pathlib import Path
import shutil
import kagglehub

# Configure the datasets directory (parents=True so missing parent folders
# do not abort the cell)
datasets_dir = Path("../datasets")
datasets_dir.mkdir(parents=True, exist_ok=True)

# Final location of the raw dataset. Defined before the download so the
# availability check at the end of the cell works even when the download fails.
destination_csv = datasets_dir / "spotify_songs.csv"

print("Iniciando download do dataset via Kaggle Hub...")

try:
    # Download using kagglehub; the returned value is used below as the
    # local directory that holds the downloaded files
    path = kagglehub.dataset_download("fcpercival/160k-spotify-songs-sorted")
    print(f"Dataset baixado para: {path}")

    # ONLY data.csv is wanted: look it up directly instead of scanning every
    # CSV. is_file() also rejects a directory that happens to carry that name.
    download_dir = Path(path)
    candidate = download_dir / "data.csv"
    source_file = candidate if candidate.is_file() else None

    if source_file:
        print(f"\nArquivo encontrado: {source_file.name}")
        # Copy into the datasets folder as spotify_songs.csv
        shutil.copy(source_file, destination_csv)
        print(f"Arquivo copiado para: {destination_csv.name}")
    else:
        print("❌ Arquivo data.csv não encontrado!")
        print("\nArquivos disponíveis no download:")
        # Sorted so the diagnostic listing is stable between runs
        for file in sorted(download_dir.glob("*.csv")):
            print(f"- {file.name}")

except Exception as e:
    # Best effort: report the problem and fall through to the availability
    # check, so a copy left by a previous run can still be used.
    print(f"Erro ao fazer download: {e}")

# Fail fast here, with a clear message, instead of letting a later cell break
# on pd.read_csv when no copy of the dataset exists at all.
if not destination_csv.is_file():
    raise FileNotFoundError(
        f"Dataset não disponível em {destination_csv}; verifique o download acima."
    )

  from .autonotebook import tqdm as notebook_tqdm


Iniciando download do dataset via Kaggle Hub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/fcpercival/160k-spotify-songs-sorted?dataset_version_number=1...


100%|██████████| 13.5M/13.5M [00:01<00:00, 10.3MB/s]

Extracting files...





Dataset baixado para: /home/dti-dev/.cache/kagglehub/datasets/fcpercival/160k-spotify-songs-sorted/versions/1

Arquivo encontrado: data.csv
Arquivo copiado para: spotify_songs.csv


In [3]:
# Load the Spotify songs CSV from the datasets folder into a dataframe
df = pd.read_csv(filepath_or_buffer='../datasets/spotify_songs.csv')

# Conhecendo as Features

In [4]:
# Preview the first five rows (5 is the default, spelled out for the reader)
df.head(n=5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],220560,11/1/66,1966,0.525,0.6,0.54,0.00305,0.1,-11.803,0.0328,125.898,0.547,1,9,26,0
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],157840,2/28/83,1983,0.228,0.368,0.48,0.707,0.159,-11.605,0.0306,150.166,0.338,1,8,21,0
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],226200,8/20/83,1983,0.0998,0.272,0.684,0.0145,0.946,-9.728,0.0505,143.079,0.279,1,8,41,0
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,0.183,-9.315,0.0307,150.316,0.31,1,8,37,0
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,0.183,-9.315,0.0307,150.316,0.31,1,8,35,0


In [5]:
columns = df.columns

# Print a numbered (1-based) list with the name of every feature
print('As features do dataset são:')
for position, feature in enumerate(columns, start=1):
    print(f'{position}. {feature}')

As features do dataset são:
1. id
2. name
3. artists
4. duration_ms
5. release_date
6. year
7. acousticness
8. danceability
9. energy
10. instrumentalness
11. liveness
12. loudness
13. speechiness
14. tempo
15. valence
16. mode
17. key
18. popularity
19. explicit


# Limpeza dos dados

## Análise de nulos

In [8]:
def verificar_nulos(dataframe):
    """Summarise the missing values of each column.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``dataframe`` with two
        columns: ``'Nulos'`` (number of null entries) and
        ``'Porcentagem (%)'`` (that number as a percentage of the row count).
    """
    contagem = dataframe.isna().sum()
    n_linhas = len(dataframe)
    return pd.DataFrame({
        'Nulos': contagem,
        'Porcentagem (%)': contagem / n_linhas * 100,
    })

In [7]:
# Show the null count and null percentage of every column of the dataset
verificar_nulos(dataframe=df)

Unnamed: 0,Nulos,Porcentagem (%)
id,0,0.0
name,0,0.0
artists,0,0.0
duration_ms,0,0.0
release_date,0,0.0
year,0,0.0
acousticness,0,0.0
danceability,0,0.0
energy,0,0.0
instrumentalness,0,0.0


## Análise descritiva

In [9]:
# Summary statistics of the numeric columns; the quartiles listed here are
# the ones describe() uses by default, written out explicitly
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,duration_ms,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit
count,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0,169907.0
mean,231407.1,1977.223234,0.493217,0.538147,0.488591,0.161939,0.206692,-11.370311,0.094058,116.94785,0.532091,0.708552,5.200498,31.556681,0.084864
std,121322.3,25.593318,0.376628,0.175345,0.267391,0.30933,0.176797,5.666795,0.149938,30.727079,0.262407,0.454431,3.515272,21.58273,0.27868
min,5108.0,1921.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,171040.0,1957.0,0.0945,0.417,0.263,0.0,0.0984,-14.47,0.0349,93.516,0.322,0.0,2.0,12.0,0.0
50%,208600.0,1978.0,0.492,0.548,0.481,0.000204,0.135,-10.474,0.045,114.777,0.544,1.0,5.0,33.0,0.0
75%,262966.5,1999.0,0.888,0.667,0.71,0.0868,0.263,-7.118,0.0754,135.712,0.749,1.0,8.0,48.0,0.0
max,5403500.0,2020.0,0.996,0.988,1.0,1.0,1.0,3.855,0.969,244.091,1.0,1.0,11.0,100.0,1.0


# Gerar o dataframe

In [10]:
# Write the dataframe to the datasets folder as a UTF-8 CSV without the index
df.to_csv(
    path_or_buf='../datasets/data_clean.csv',
    encoding='utf-8',
    index=False,
)

# Resultados

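Aparentemente, os dados deste dataset de 169.907 músicas já estão tratados: a análise de nulos não encontrou valores ausentes. Foi encontrada apenas uma inconsistência: algumas músicas possuem o gênero "set()", o que muito provavelmente foi um erro de input. (Observação: as features listadas neste notebook não incluem uma coluna de gênero; confirmar se essa inconsistência se aplica a esta versão do dataset.)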