# Configurações

## Importações

In [9]:
import pandas as pd

## Definição do Dataframe

In [10]:
import requests
import zipfile
import os
from pathlib import Path

# URL do dataset
url = "https://www.kaggle.com/api/v1/datasets/download/abdelrahman16/spotify-analysis-and-visualization"

# Criar diretório datasets se não existir
datasets_dir = Path("../datasets")
datasets_dir.mkdir(exist_ok=True)

# Caminho temporário para salvar o arquivo zip
temp_zip_path = Path("/tmp/spotify-analysis-and-visualization.zip")

print("Iniciando download do dataset...")

try:
    # Fazer o download
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Levanta exceção se houver erro HTTP
    
    # Salvar o arquivo zip temporariamente
    with open(temp_zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    
    print(f"Download concluído!")
    
    # Extrair o arquivo zip
    print("Extraindo arquivos...")
    with zipfile.ZipFile(temp_zip_path, 'r') as zip_ref:
        # Listar arquivos no zip
        file_list = zip_ref.namelist()
        print(f"Arquivos encontrados no ZIP: {file_list}")
        
        # Extrair todos os arquivos
        zip_ref.extractall(datasets_dir)
    
    print("Extração concluída!")
    
    # Procurar por arquivos CSV e renomear o principal
    csv_files = list(datasets_dir.glob("*.csv"))
    if csv_files:
        # Assumindo que queremos o primeiro arquivo CSV encontrado
        original_csv = csv_files[0]
        new_csv_path = datasets_dir / "spotify_top_2000_songs.csv"
        
        # Renomear o arquivo se necessário
        if original_csv.name != "spotify_top_2000_songs.csv":
            original_csv.rename(new_csv_path)
            print(f"Arquivo renomeado de '{original_csv.name}' para 'spotify_top_2000_songs.csv'")
        else:
            print("Arquivo já possui o nome correto: 'spotify_top_2000_songs.csv'")
    
    # Remover o arquivo zip temporário
    temp_zip_path.unlink()
    print("Arquivo ZIP temporário removido.")
    
    # Listar arquivos finais na pasta datasets
    print("\nArquivos na pasta datasets:")
    for file in datasets_dir.iterdir():
        if file.is_file():
            print(f"- {file.name}")
        
except requests.exceptions.RequestException as e:
    print(f"Erro ao fazer download: {e}")
except zipfile.BadZipFile as e:
    print(f"Erro ao extrair arquivo zip: {e}")
except Exception as e:
    print(f"Erro inesperado: {e}")
finally:
    # Garantir que o arquivo temporário seja removido mesmo em caso de erro
    if temp_zip_path.exists():
        temp_zip_path.unlink()
        print("Arquivo ZIP temporário removido (cleanup).")

Iniciando download do dataset...
Download concluído!
Extraindo arquivos...
Arquivos encontrados no ZIP: ['songs_normalize.csv']
Extração concluída!
Arquivo já possui o nome correto: 'spotify_top_2000_songs.csv'
Arquivo ZIP temporário removido.

Arquivos na pasta datasets:
- spotify_top_2000_songs.csv
- songs_normalize.csv
Download concluído!
Extraindo arquivos...
Arquivos encontrados no ZIP: ['songs_normalize.csv']
Extração concluída!
Arquivo já possui o nome correto: 'spotify_top_2000_songs.csv'
Arquivo ZIP temporário removido.

Arquivos na pasta datasets:
- spotify_top_2000_songs.csv
- songs_normalize.csv


In [11]:
df = pd.read_csv('../datasets/spotify_top_2000_songs.csv')

# Conhecendo as Features

In [12]:
df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [13]:
columns = df.columns

print('As features do dataset são:')
for(i, col) in enumerate(columns):
    print(f'{i+1}. {col}')

As features do dataset são:
1. artist
2. song
3. duration_ms
4. explicit
5. year
6. popularity
7. danceability
8. energy
9. key
10. loudness
11. mode
12. speechiness
13. acousticness
14. instrumentalness
15. liveness
16. valence
17. tempo
18. genre


# Limpeza do dados

## Análise de nulos

In [14]:
def verificar_nulos(dataframe):
    nulos = dataframe.isnull().sum()
    total = dataframe.shape[0]
    porcentagem = (nulos / total) * 100
    resultado = pd.DataFrame({'Nulos': nulos, 'Porcentagem (%)': porcentagem})
    return resultado

In [15]:
verificar_nulos(df)

Unnamed: 0,Nulos,Porcentagem (%)
artist,0,0.0
song,0,0.0
duration_ms,0,0.0
explicit,0,0.0
year,0,0.0
popularity,0,0.0
danceability,0,0.0
energy,0,0.0
key,0,0.0
loudness,0,0.0


## Análise descritiva

In [16]:
df.describe()

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,228748.1245,2009.494,59.8725,0.667438,0.720366,5.378,-5.512434,0.5535,0.103568,0.128955,0.015226,0.181216,0.55169,120.122558
std,39136.569008,5.85996,21.335577,0.140416,0.152745,3.615059,1.933482,0.497254,0.096159,0.173346,0.087771,0.140669,0.220864,26.967112
min,113000.0,1998.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,203580.0,2004.0,56.0,0.581,0.622,2.0,-6.49025,0.0,0.0396,0.014,0.0,0.0881,0.38675,98.98575
50%,223279.5,2010.0,65.5,0.676,0.736,6.0,-5.285,1.0,0.05985,0.0557,0.0,0.124,0.5575,120.0215
75%,248133.0,2015.0,73.0,0.764,0.839,8.0,-4.16775,1.0,0.129,0.17625,6.8e-05,0.241,0.73,134.2655
max,484146.0,2020.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851


# Gerar o dataframe

In [17]:
df.to_csv('../datasets/data_clean.csv', index=False, encoding='utf-8')

# Resultados

Aparentemente os dados desse dataset de 2000 músicas está tratado. Foi encontrado apenas uma inconsistência de algumas musicas possirem o gênero "set()", que muito provavelmente foi erro de input.