In [1]:
# General libraries
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import altair as alt
import random

# Metrics libraries
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors

# Clustering libraries
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# Data procressing libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE

from sklearn.model_selection import train_test_split

alt.renderers.enable('kaggle')
warnings.simplefilter("ignore")


In [2]:
# Read Dataset
data = pd.read_csv("final_dataset.csv")

In [3]:
# Drop unnecessary 'target' column
data.drop('mode', axis=1, inplace=True)
# data.drop('track_id', axis=1, inplace=True)
# data.drop('album_id', axis=1, inplace=True)
# data.drop('artist_id', axis=1, inplace=True)

In [4]:
# Categorical data
categorical_data = data.select_dtypes(include=['object'])
print("Categorical columns: ", categorical_data.columns.tolist())

# Numeric data
numeric_data = data.select_dtypes(include=['int64', 'float64'])
print("Numeric columns: ", numeric_data.columns.tolist())


Categorical columns:  ['track_name', 'track_id', 'album_name', 'album_id', 'artist_name', 'artist_id', 'release_date']
Numeric columns:  ['artist_popularity', 'track_popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'energy_danceability', 'acoustic_instrumental_diff', 'mood_index']


In [5]:
# Create new attributes
data['energy_loudness_ratio'] = data['energy'] / data['loudness']
# Append the new features to numeric_data
numeric_data = pd.concat([numeric_data, data[['energy_loudness_ratio']]], axis=1)

data.columns

Index(['track_name', 'track_id', 'album_name', 'album_id', 'artist_name',
       'artist_id', 'artist_popularity', 'track_popularity', 'release_date',
       'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'energy_danceability', 'acoustic_instrumental_diff',
       'mood_index', 'energy_loudness_ratio'],
      dtype='object')

In [6]:
# Continuous Columns
continuous_cols = ['artist_popularity', 'track_popularity', 'danceability', 'energy',
       'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'energy_danceability',
       'acoustic_instrumental_diff', 'mood_index', 'energy_loudness_ratio']
# Discrete Columns
discrete_cols = ['key']

In [7]:
# Perform one-hot encoding
one_hot_encoded_data = pd.get_dummies(data, columns=discrete_cols, drop_first=True)
data = pd.concat([data.drop(columns=discrete_cols), one_hot_encoded_data], axis=1)

# Check for duplicate columns
duplicate_columns = data.columns[data.columns.duplicated()]

# Drop duplicate columns
data = data.loc[:, ~data.columns.duplicated()]

data = data.dropna()

# Numeric Data
numeric_data = data.select_dtypes(include=[np.number])

numeric_data.columns

# Print the DataFrame after dropping duplicate columns
print(data.columns)

Index(['track_name', 'track_id', 'album_name', 'album_id', 'artist_name',
       'artist_id', 'artist_popularity', 'track_popularity', 'release_date',
       'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'energy_danceability', 'acoustic_instrumental_diff', 'mood_index',
       'energy_loudness_ratio', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0',
       'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0',
       'key_11.0'],
      dtype='object')


In [9]:
# MinMaxScaler 객체 생성
# 데이터의 값을 0과 1 사이로 스케일링 수행
scaler = MinMaxScaler()
# 연속형 변수에 스케일링 적용
scaled_features = scaler.fit_transform(numeric_data[continuous_cols])

# 스케일링된 연속형 변수로 새로운 데이터프레임 생성
data[continuous_cols] = pd.DataFrame(scaled_features, columns=data[continuous_cols].columns)

In [10]:
data.columns

Index(['track_name', 'track_id', 'album_name', 'album_id', 'artist_name',
       'artist_id', 'artist_popularity', 'track_popularity', 'release_date',
       'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'energy_danceability', 'acoustic_instrumental_diff', 'mood_index',
       'energy_loudness_ratio', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0',
       'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0',
       'key_11.0'],
      dtype='object')

In [12]:
# track_id 컬럼의 고유 값 출력
track_ids = data['track_id'].unique()

# track_id 컬럼의 값 출력
print(f"track_id 컬럼의 고유 값들: {track_ids}")
print(f"총 {len(track_ids)}개의 track_id가 존재합니다.")

track_id 컬럼의 고유 값들: ['4OkTLeJWGwSTNa8iSqLxzL' '0ByC7DPj6qJK3FfcpNoWWg'
 '3ijbZnKjENuSizMVny9mkm' ... '0bPWgbmEYjTFuZpC1zZZpN'
 '3RU6Ylhg9JmYtzSASm7vAJ' '5nk7jPeoSEpm8MxEemcQUf']
총 27355개의 track_id가 존재합니다.


In [14]:
import numpy as np
import joblib
from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity

# 모델을 로드하는 함수
def load_models(encoder_path, kmeans_path):
    # Autoencoder 모델 로드 (컴파일 옵션 제외)
    autoencoder_model = load_model(encoder_path, compile=False)
    
    # KMeans 모델 로드
    kmeans_model = joblib.load(kmeans_path)
    
    return autoencoder_model, kmeans_model

# track_id로 노래 추천을 위한 함수
def recommend_songs_by_track_id(input_track_id, data, numeric_data, top_n=10):
    # 입력된 track_id에 해당하는 노래의 벡터를 가져옴
    input_song_vector = numeric_data[data['track_id'] == input_track_id]
    
    if input_song_vector.shape[0] == 0:
        print(f"No song found with track_id '{input_track_id}'")
        return []
    
    # track_id에 해당하는 track_name 출력
    input_track_name = data[data['track_id'] == input_track_id]['track_name'].values[0]
    print(f"Track ID: {input_track_id}, Track Name: {input_track_name}")
    
    # 차원 축소
    encoded_input_song = encoder_model.predict(input_song_vector)
    
    # 전체 데이터에 대해 인코딩된 벡터 생성
    encoded_data = encoder_model.predict(numeric_data)
    
    # 코사인 유사도 계산
    similarities = cosine_similarity(encoded_input_song, encoded_data)
    
    # 유사도가 높은 순서대로 상위 top_n 추천
    top_n_indices = np.argsort(similarities[0])[-top_n:]
    
    # 관련도 높은 track_id와 track_name 출력
    recommended_tracks = data[['track_id', 'track_name']].iloc[top_n_indices]
    
    print(f"\nTop {top_n} similar songs based on Cosine Similarity:")
    for idx, row in recommended_tracks.iterrows():
        print(f"Track ID: {row['track_id']}, Track Name: {row['track_name']}")
    
    return recommended_tracks

# 모델 로드 (저장된 Autoencoder 모델과 KMeans 모델)
encoder_model, kmeans_model = load_models(
    "./weight_5/autoencoder_model_dims_2048_1024_512_256_128_64_32_16_8_4.keras",  # 저장된 Autoencoder 모델 경로
    "./weight_5/kmeans_model_clusters_15_dims_2048_1024_512_256_128_64_32_16_8_4.pkl"  # 저장된 KMeans 모델 경로
)

# track_id를 입력받아 추천
input_track_id = '4OkTLeJWGwSTNa8iSqLxzL'  # 예시 track_id를 입력
recommend_songs_by_track_id(input_track_id, data, numeric_data, top_n=10)

Track ID: 4OkTLeJWGwSTNa8iSqLxzL, Track Name: Keyboard Sonata in E Minor, Wq. 49/3, H. 33, "Wurttemberg Sonata No. 3": I. Allegro
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step

Top 10 similar songs based on Cosine Similarity:
Track ID: 3dTz4jd3DGuqGLQyPtGyEE, Track Name: DAUM
Track ID: 2zAcVe9SPjEps2M087r9P6, Track Name: Diary
Track ID: 2qTAE4kS2ea1CYF9ND7tMn, Track Name: Late Autumn
Track ID: 7sNPQxJLeBNOQY1pEDZl1K, Track Name: Falling Leaves are Beautiful
Track ID: 6YqsIERz8ZznclqMyhNRiu, Track Name: Sugar Plum Fairy Introlude - Acapella
Track ID: 0VeBp2T7wETkwVyGs6v8pW, Track Name: Joy to the World - Flava Mix
Track ID: 0AOazPAzLjmnQAwspzFqtZ, Track Name: Joy to the World - Celebration Mix
Track ID: 4I5Hn34v6foGP3ta9Xx0rN, Track Name: All I Want for Christmas Is You - Mariah's New Dance Mix Extended 2009
Track ID: 40u3Iw7h7JAkonYNCVe46M, Track Name: Santa Claus Is Comin' to Town - A

Unnamed: 0,track_id,track_name
10473,3dTz4jd3DGuqGLQyPtGyEE,DAUM
10472,2zAcVe9SPjEps2M087r9P6,Diary
10471,2qTAE4kS2ea1CYF9ND7tMn,Late Autumn
10470,7sNPQxJLeBNOQY1pEDZl1K,Falling Leaves are Beautiful
10469,6YqsIERz8ZznclqMyhNRiu,Sugar Plum Fairy Introlude - Acapella
10468,0VeBp2T7wETkwVyGs6v8pW,Joy to the World - Flava Mix
10467,0AOazPAzLjmnQAwspzFqtZ,Joy to the World - Celebration Mix
10466,4I5Hn34v6foGP3ta9Xx0rN,All I Want for Christmas Is You - Mariah's New...
10464,40u3Iw7h7JAkonYNCVe46M,Santa Claus Is Comin' to Town - Anniversary Mix
28940,5nk7jPeoSEpm8MxEemcQUf,On the Edge
