# Configurações

## Importações

In [1]:
import os
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import joblib

## Definição do Dataframe

In [2]:
# Load the feature table that every later cell (split, fit, export) works from.
df = pd.read_csv(
    filepath_or_buffer="../datasets/features_normalized_with_decades.csv",
)

# Modelagem

In [3]:
# Hold out 30% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test = train_test_split(df, random_state=42, test_size=0.3)

# Index the training rows in a KD-tree. Each query returns the 100 closest
# training rows under Euclidean distance. `fit` returns the estimator itself,
# so construction and fitting can be chained.
model = NearestNeighbors(
    algorithm="kd_tree",
    metric="euclidean",
    n_neighbors=100,
).fit(X_train)

print(f"Modelo treinado: {X_train.shape[0]} amostras")
print(f"Teste: {X_test.shape[0]} amostras")

Modelo treinado: 118934 amostras
Teste: 50973 amostras


# Avaliação do modelo

In [4]:
# Offline evaluation of the fitted NearestNeighbors recommender on the
# held-out rows: a small neighbour preview, query latency, distance
# statistics, a distance-variance "serendipity" score and the long-tail share.

n_preview = 5  # test rows shown in the preview, and neighbours shown per row
heavy_rule = "=" * 90
light_rule = "-" * 90

distances, indices = model.kneighbors(X_test.iloc[:n_preview])

print(heavy_rule)
print("📊 AVALIAÇÃO DE AMOSTRAS")
print(heavy_rule)

# Iterate over the rows that were actually returned, so a test set with fewer
# than n_preview rows cannot raise IndexError.
for i, (sample_distances, sample_indices) in enumerate(zip(distances, indices)):
    print(f"\n🎵 Amostra {i}:")
    print(
        f"   Distâncias dos {n_preview} vizinhos mais próximos: "
        f"{np.round(sample_distances[:n_preview], 4)}"
    )
    print(f"   Índices: {sample_indices[:n_preview]}")

print("\n" + heavy_rule)
print("📈 MÉTRICAS DO MODELO")
print(heavy_rule)

print("\n⏱️  TEMPO DE RECOMENDAÇÃO")
print(light_rule)
# perf_counter is monotonic and high-resolution; time.time() can jump when the
# system clock is adjusted.
start_time = time.perf_counter()
distances_all, indices_all = model.kneighbors(X_test)
total_time = time.perf_counter() - start_time

# Take the counts from the result itself instead of hard-coding a neighbour
# count that can drift from the model's n_neighbors setting.
n_queries, n_neighbors_found = indices_all.shape
total_recommendations = indices_all.size

avg_time_per_sample = (total_time / n_queries) * 1000
avg_time_per_recommendation = (total_time / total_recommendations) * 1000

print(f"✅ Tempo total de recomendação: {total_time:.4f}s")
print(f"✅ Tempo médio por amostra: {avg_time_per_sample:.4f}ms")
print(f"✅ Tempo médio por recomendação: {avg_time_per_recommendation:.4f}ms")

print("\n📏 DISTÂNCIAS EUCLIDIANAS")
print(light_rule)
print(f"✅ Distância média: {distances_all.mean():.4f}")
print(f"✅ Distância mínima: {distances_all.min():.4f}")
print(f"✅ Distância máxima: {distances_all.max():.4f}")
print(f"✅ Desvio padrão: {distances_all.std():.4f}")

print("\n🎲 SERENDIPIDADE")
print(light_rule)
# Serendipity proxy: variance of the neighbour distances of each query,
# averaged over all queries.
variance_per_sample = np.var(distances_all, axis=1)
serendipity_score = variance_per_sample.mean()

print(f"✅ Score de Serendipidade: {serendipity_score:.4f}")
print("   (Quanto maior, mais diversas são as recomendações)")
print(f"   → Variância média das distâncias por amostra: {serendipity_score:.4f}")
print(f"   → Min: {variance_per_sample.min():.4f}, Max: {variance_per_sample.max():.4f}")

median_distance = np.median(distances_all)
close_recommendations = np.sum(distances_all < median_distance) / distances_all.size * 100
# Complement of "< median", so ties with the median are counted as far.
far_recommendations = 100 - close_recommendations

print("\n   Distribuição de proximidade:")
print(f"   → Recomendações próximas (< mediana): {close_recommendations:.2f}%")
print(f"   → Recomendações distantes (≥ mediana): {far_recommendations:.2f}%")

print("\n🎯 CAUDA LONGA (Long-Tail)")
print(light_rule)

# kneighbors returns row positions in the matrix passed to fit(), i.e. X_train.
# The popularity flag must therefore be read from X_train by position; reading
# it from X_test would describe unrelated rows. One vectorised lookup replaces
# the per-element .iloc loop.
train_is_popular = X_train["is_popular"].to_numpy()
recommended_is_popular = train_is_popular[indices_all]

popular_recommendations = int(np.count_nonzero(recommended_is_popular == 1))
rare_recommendations = int(recommended_is_popular.size - popular_recommendations)

total_recs = popular_recommendations + rare_recommendations
pct_popular = (popular_recommendations / total_recs * 100) if total_recs > 0 else 0
pct_rare = (rare_recommendations / total_recs * 100) if total_recs > 0 else 0

print(f"✅ Recomendações de Cauda Longa (músicas não-populares): {pct_rare:.2f}%")
print(f"✅ Recomendações Populares: {pct_popular:.2f}%")
print("   → Um bom modelo deve ter alto % de cauda longa (descoberta)")
print(f"   → Total de recomendações analisadas: {total_recs}")

print("\n" + heavy_rule)
print("📊 RESUMO GERAL")
print(heavy_rule)
print(f"✅ Amostras de teste: {n_queries}")
print(f"✅ Vizinhos encontrados por amostra: {n_neighbors_found}")
print(f"✅ Total de recomendações: {total_recommendations}")
print(f"✅ Tempo total: {total_time:.4f}s")
print(f"✅ Score de Serendipidade: {serendipity_score:.4f}")
print(f"✅ % Cauda Longa: {pct_rare:.2f}%")

üìä AVALIA√á√ÉO DE AMOSTRAS

üéµ Amostra 0:
   Dist√¢ncias dos 5 vizinhos mais pr√≥ximos: [0.1049 0.1212 0.1397 0.1616 0.1971]
   √çndices: [ 82603 105944 117978 101321  82282]

üéµ Amostra 1:
   Dist√¢ncias dos 5 vizinhos mais pr√≥ximos: [0.1898 0.2448 0.2503 0.2518 0.2596]
   √çndices: [80494 57900 39396 88828 13360]

üéµ Amostra 2:
   Dist√¢ncias dos 5 vizinhos mais pr√≥ximos: [0.2257 0.2986 0.3187 0.3428 0.3456]
   √çndices: [38405 24485 99111 30134 50325]

üéµ Amostra 3:
   Dist√¢ncias dos 5 vizinhos mais pr√≥ximos: [0.1393 0.1421 0.156  0.1636 0.1663]
   √çndices: [44466 59319 70010 88130 72590]

üéµ Amostra 4:
   Dist√¢ncias dos 5 vizinhos mais pr√≥ximos: [0.1768 0.2046 0.2153 0.2277 0.2312]
   √çndices: [54353 28463 82308 71324 38513]

üìà M√âTRICAS DO MODELO

‚è±Ô∏è  TEMPO DE RECOMENDA√á√ÉO
------------------------------------------------------------------------------------------
‚úÖ Tempo total de recomenda√ß√£o: 8.3266s
‚úÖ Tempo m√©dio por amostra: 0.1634ms
‚úÖ Tempo

# Gerar o modelo

In [5]:
# Persist the fitted model together with the ordered list of feature columns.
os.makedirs("../models", exist_ok=True)

artifacts = (
    (model, "../models/music_recommender_model.joblib"),
    (df.columns.tolist(), "../models/music_model_features.pkl"),
)
for payload, destination in artifacts:
    joblib.dump(payload, destination)

print("Modelo salvo com sucesso")

Modelo salvo com sucesso
