# Configurações

## Importações

In [34]:
import os

import pandas as pd

from sklearn.preprocessing import StandardScaler
import joblib

## Definição do Dataframe

In [35]:
# Read the dataset from data_clean.csv. The path is relative, so it resolves
# against the kernel's current working directory.
df = pd.read_csv("../datasets/data_clean.csv")

In [36]:
# Show the column index so the fields available to later cells are visible.
df.columns

Index(['id', 'name', 'artists', 'duration_ms', 'year', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'speechiness', 'valence',
       'popularity', 'explicit'],
      dtype='object')

# Pré-processamento

## Seleção de Features

In [37]:
# Work on a copy so the frame loaded from the CSV is never modified.
df_selecao = df.copy()

In [38]:
# Audio descriptors that are passed through the scaler.
features_normalized = ["acousticness", "danceability", "energy", "valence"]

# Full list of modelling inputs: the two raw columns that are re-encoded later
# (year, popularity), the scaled audio descriptors, and the explicit flag.
features = ["year", "popularity", *features_normalized, "explicit"]

In [39]:
# Apply the feature selection: keep only the columns named in `features`.
# Previously the list was defined but never used, so this frame was just a
# second full copy of the dataset. Every column read from `df_features` in the
# cells below (year, popularity, explicit and the four audio descriptors) is
# part of `features`, so their results are unchanged.
df_features = df_selecao[features].copy()

## Normalização

In [40]:
print("=" * 90, "ðŸ”§ CONSTRUINDO PIPELINE DE NORMALIZAÃ‡ÃƒO", "=" * 90, sep="\n")

# Fit the scaler on the audio descriptors and transform them in a single call.
# The fitted object is kept in `scaler_reduced` so it can be reused afterwards.
scaler_reduced = StandardScaler()
scaled_values = scaler_reduced.fit_transform(df_features[features_normalized])

# Wrap the resulting array in a frame that carries the feature names again.
df_features_normalized_reduced_scaled = pd.DataFrame(
    scaled_values, columns=features_normalized
)

print(
    "âœ… Scaler criado e treinado",
    f"   Features: {features_normalized}",
    f"   Shape: {df_features_normalized_reduced_scaled.shape}",
    "\n   MÃ©dia (deve estar prÃ³xima a 0):",
    sep="\n",
)
print(df_features_normalized_reduced_scaled.mean())

# pandas' .std() defaults to the sample estimator (ddof=1) while StandardScaler
# divides by the population standard deviation, so the value printed here is
# marginally above 1 rather than exactly 1.
print("\n   Desvio padrÃ£o (deve estar prÃ³ximo a 1):")
print(df_features_normalized_reduced_scaled.std())

print("\n" + "=" * 90)

ðŸ”§ CONSTRUINDO PIPELINE DE NORMALIZAÃ‡ÃƒO
âœ… Scaler criado e treinado
   Features: ['acousticness', 'danceability', 'energy', 'valence']
   Shape: (169907, 4)

   MÃ©dia (deve estar prÃ³xima a 0):
acousticness   -1.391753e-16
danceability    3.104680e-16
energy          1.498811e-16
valence         1.927043e-16
dtype: float64

   Desvio padrÃ£o (deve estar prÃ³ximo a 1):
acousticness    1.000003
danceability    1.000003
energy          1.000003
valence         1.000003
dtype: float64



## Binarização

In [41]:
print("=" * 90)
print("ðŸ”„ APLICANDO BINARIZAÃ‡ÃƒO DE POPULARITY")
print("=" * 90)

# Cut-off separating the two classes; tracks strictly above it are "popular".
POPULARITY_THRESHOLD = 33

print(f"\nThreshold utilizado: {POPULARITY_THRESHOLD}")

print("\nðŸ“Š BinarizaÃ§Ã£o: df_features['popularity']")
print("-" * 90)

# 1 when popularity is strictly greater than the threshold, otherwise 0.
is_popular = (df_features["popularity"] > POPULARITY_THRESHOLD).astype(int)

# Reindex to both class labels so the summaries always contain entries for
# 0 and 1. Without this, a dataset where one class is empty raises KeyError
# in the report below. Reindexing also fixes the order to [0, 1].
class_labels = [0, 1]
binary_distribution = is_popular.value_counts().reindex(class_labels, fill_value=0)
binary_pcts = (
    is_popular.value_counts(normalize=True).reindex(class_labels, fill_value=0.0) * 100
)

print("âœ… BinarizaÃ§Ã£o aplicada")
print(
    f"   â†’ Classe 0 (â‰¤ {POPULARITY_THRESHOLD}): {binary_distribution.loc[0]} ({binary_pcts.loc[0]:.2f}%)"
)
print(
    f"   â†’ Classe 1 (> {POPULARITY_THRESHOLD}): {binary_distribution.loc[1]} ({binary_pcts.loc[1]:.2f}%)"
)
print(f"   â†’ Total: {len(is_popular)} registros")

# Single-column frame holding only the binary target.
df_popularity_binary = pd.DataFrame({"is_popular": is_popular})

print("   â†’ Dataframe final: apenas coluna 'is_popular'")
print(f"   â†’ Shape: {df_popularity_binary.shape}")

print("\n" + "=" * 90)

ðŸ”„ APLICANDO BINARIZAÃ‡ÃƒO DE POPULARITY

Threshold utilizado: 33

ðŸ“Š BinarizaÃ§Ã£o: df_features['popularity']
------------------------------------------------------------------------------------------
âœ… BinarizaÃ§Ã£o aplicada
   â†’ Classe 0 (â‰¤ 33): 84964 (50.01%)
   â†’ Classe 1 (> 33): 84943 (49.99%)
   â†’ Total: 169907 registros
   â†’ Dataframe final: apenas coluna 'is_popular'
   â†’ Shape: (169907, 1)



In [42]:
print("=" * 90)
print("ðŸŽµ CRIANDO FEATURES DECADES COM ONE-HOT ENCODING")
print("=" * 90)

# Derive a 'decade' column from 'year' (e.g. 1987 -> 1980).
df_decades = df_features.copy()
df_decades["decade"] = (df_decades["year"] // 10 * 10).astype(int)

# .tolist() turns the NumPy scalars into plain Python ints, so the list prints
# as [1920, 1930, ...] instead of [np.int64(1920), np.int64(1930), ...].
decades_found = sorted(df_decades["decade"].unique().tolist())

print("\nðŸ“Š ExtraÃ§Ã£o de dÃ©cadas")
print("-" * 90)
print(f"   DÃ©cadas encontradas no dataset: {decades_found}")
print(f"   Intervalo: {df_decades['decade'].min()} a {df_decades['decade'].max()}")

# One indicator column per decade, from the 1920s through the 2020s
# (the range stops before 2030).
all_decades = list(range(1920, 2030, 10))

print(f"\nðŸ“Œ DÃ©cadas para One-Hot Encoding: {all_decades}")

# A decade outside `all_decades` would get an all-zero row without any warning,
# so fail loudly instead of exporting rows that belong to no decade.
uncovered_decades = sorted(set(decades_found) - set(all_decades))
assert not uncovered_decades, (
    f"Decades without a one-hot column: {uncovered_decades}"
)

# Build every indicator column in one pass. 'decade' already holds the decade
# start, so an equality test selects exactly the rows of that decade.
decades_ohe = pd.DataFrame(
    {
        f"{decade}s": (df_decades["decade"] == decade).astype(int)
        for decade in all_decades
    }
)

print("\nâœ… One-Hot Encoding aplicado")
print(f"   Shape: {decades_ohe.shape}")
print(f"   Features criadas: {list(decades_ohe.columns)}")

print("\nðŸ“Š DistribuiÃ§Ã£o de dÃ©cadas:")
for col in decades_ohe.columns:
    count = decades_ohe[col].sum()
    pct = (count / len(decades_ohe)) * 100
    print(f"   {col}: {int(count)} ({pct:.2f}%)")

print("\n" + "=" * 90)

ðŸŽµ CRIANDO FEATURES DECADES COM ONE-HOT ENCODING

ðŸ“Š ExtraÃ§Ã£o de dÃ©cadas
------------------------------------------------------------------------------------------
   DÃ©cadas encontradas no dataset: [np.int64(1920), np.int64(1930), np.int64(1940), np.int64(1950), np.int64(1960), np.int64(1970), np.int64(1980), np.int64(1990), np.int64(2000), np.int64(2010), np.int64(2020)]
   Intervalo: 1920 a 2020

ðŸ“Œ DÃ©cadas para One-Hot Encoding: [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]

âœ… One-Hot Encoding aplicado
   Shape: (169907, 11)
   Features criadas: ['1920s', '1930s', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

ðŸ“Š DistribuiÃ§Ã£o de dÃ©cadas:
   1920s: 4446 (2.62%)
   1930s: 8889 (5.23%)
   1940s: 14968 (8.81%)
   1950s: 19950 (11.74%)
   1960s: 20000 (11.77%)
   1970s: 19998 (11.77%)
   1980s: 20000 (11.77%)
   1990s: 20000 (11.77%)
   2000s: 20000 (11.77%)
   2010s: 19900 (11.71%)
   2020s: 1756 (1.03%)



## Mesclagem de dataframes

In [43]:
print("=" * 90, "ðŸ”— MESCLANDO DATAFRAMES", "=" * 90, sep="\n")
print("\nðŸ“Š Criando dataframes finais", "-" * 90, sep="\n")

# --- Full dataset ------------------------------------------------------------
# Start from every original column except the two that are re-encoded, then
# overwrite the audio descriptors with their scaled values. `.values` hands
# pandas a bare array, so the assignment is positional rather than label-based.
df_temp_full = df.drop(columns=["year", "popularity"]).copy()
df_temp_full[features_normalized] = df_features_normalized_reduced_scaled[
    features_normalized
].values

# Append the binary target and the decade indicators as extra columns.
df_selecao_normalized_with_decades = pd.concat(
    [df_temp_full, df_popularity_binary, decades_ohe], axis="columns"
)

print(
    f"\nðŸ”¹ df_selecao_normalized_with_decades: {df_selecao_normalized_with_decades.shape}",
    "   â†’ Todas as colunas originais (exceto year e popularity)",
    f"   â†’ Features normalizadas: {features_normalized}",
    "   â†’ Coluna binarizada: is_popular",
    f"   â†’ Features decades (OHE): {len(decades_ohe.columns)}",
    sep="\n",
)

# --- Features-only dataset ---------------------------------------------------
# Every piece gets a fresh 0..n-1 index before being placed side by side.
explicit_col = df_features[["explicit"]].reset_index(drop=True)
scaled_part, popularity_part, decades_part = (
    frame.reset_index(drop=True)
    for frame in (
        df_features_normalized_reduced_scaled,
        df_popularity_binary,
        decades_ohe,
    )
)

df_features_normalized_with_decades = pd.concat(
    [scaled_part, popularity_part, explicit_col, decades_part], axis="columns"
)

print(
    f"\nðŸ”¹ df_features_normalized_with_decades: {df_features_normalized_with_decades.shape}",
    f"   â†’ Features normalizadas: {features_normalized}",
    "   â†’ Coluna binarizada: is_popular",
    "   â†’ Coluna explÃ­cita: explicit",
    f"   â†’ Features decades (OHE): {len(decades_ohe.columns)}",
    sep="\n",
)

ðŸ”— MESCLANDO DATAFRAMES

ðŸ“Š Criando dataframes finais
------------------------------------------------------------------------------------------

ðŸ”¹ df_selecao_normalized_with_decades: (169907, 23)


   â†’ Todas as colunas originais (exceto year e popularity)
   â†’ Features normalizadas: ['acousticness', 'danceability', 'energy', 'valence']
   â†’ Coluna binarizada: is_popular
   â†’ Features decades (OHE): 11

ðŸ”¹ df_features_normalized_with_decades: (169907, 17)
   â†’ Features normalizadas: ['acousticness', 'danceability', 'energy', 'valence']
   â†’ Coluna binarizada: is_popular
   â†’ Coluna explÃ­cita: explicit
   â†’ Features decades (OHE): 11


# Gerar o dataframe

In [44]:
print("=" * 90, "ðŸ’¾ EXPORTANDO DATAFRAMES FINAIS", "=" * 90, sep="\n")

# Make sure both output folders exist before anything is written.
for output_dir in ("../datasets", "../models"):
    os.makedirs(output_dir, exist_ok=True)

# (frame, destination path, confirmation line) for each CSV to be written.
csv_exports = (
    (
        df_selecao_normalized_with_decades,
        "../datasets/selecao_normalized_with_decades.csv",
        "\nâœ… selecao_normalized_with_decades.csv",
    ),
    (
        df_features_normalized_with_decades,
        "../datasets/features_normalized_with_decades.csv",
        "\nâœ… features_normalized_with_decades.csv",
    ),
)

for frame, csv_path, done_message in csv_exports:
    # The index is not written; only the data columns go to disk.
    frame.to_csv(csv_path, index=False, encoding="utf-8")
    print(done_message)
    print(f"   Shape: {frame.shape[0]} linhas Ã— {frame.shape[1]} colunas")

# Persist the fitted scaler next to the data.
joblib.dump(scaler_reduced, "../models/scaler.joblib")
print("\nâœ… scaler.joblib")
print(f"   Scaler salvo com as features: {features_normalized}")

print("\n" + "=" * 90)

ðŸ’¾ EXPORTANDO DATAFRAMES FINAIS



âœ… selecao_normalized_with_decades.csv
   Shape: 169907 linhas Ã— 23 colunas

âœ… features_normalized_with_decades.csv
   Shape: 169907 linhas Ã— 17 colunas

âœ… scaler.joblib
   Scaler salvo com as features: ['acousticness', 'danceability', 'energy', 'valence']



# Resultados

Basicamente, normalizei as 4 features de áudio (acousticness, danceability, energy, valence) usando StandardScaler para padronizar os valores.

Depois transformei a coluna de popularity em uma coluna binária (is_popular), usando 33 como threshold, valor obtido na análise exploratória.

No final, o dataframe de features ficou com 17 colunas: as 4 normalizadas + 1 binária de popularity + 1 binária de explicit + 11 features de década (one-hot encoding, de 1920s a 2020s).

Exportei tudo em 3 arquivos: 2 CSVs (um completo com todos os dados, outro só com as features) e o scaler em joblib para usar depois.