<a href="https://www.kaggle.com/code/suyashthakur08/song-recommender?scriptVersionId=202626984" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Importing Data**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# **Reading Data**

In [None]:
# Load the track-level table, then the per-genre and per-year tables, in that order.
data, genre_data, year_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spotify-dataset/data/data.csv",
        "../input/spotify-dataset/data/data_by_genres.csv",
        "../input/spotify-dataset/data/data_by_year.csv",
    )
)

# **Data Cleaning**

In [None]:
def preprocess_data(df):
    """Return a cleaned copy of ``df`` with incomplete rows removed.

    Every row containing at least one missing value is dropped. The index is
    then rebuilt as 0..n-1: later cells copy columns from the cleaned frames
    into newly created frames (which have a fresh 0..n-1 index), and pandas
    aligns such assignments on index labels, so a gapped index would
    mis-attach or blank out values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without missing values and with a contiguous integer index.
    """
    df_cleaned = df.dropna().reset_index(drop=True)
    return df_cleaned

# Clean the track, genre and year tables; each name is rebound to its cleaned version.
data, genre_data, year_data = (
    preprocess_data(frame) for frame in (data, genre_data, year_data)
)

In [None]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
data.info()

In [None]:
# Preview the first rows of the cleaned track table.
data.head()

In [None]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
genre_data.info()

In [None]:
# Preview the first rows of the cleaned per-genre table.
genre_data.head()

In [None]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
year_data.info()

In [None]:
# Preview the first rows of the cleaned per-year table.
year_data.head()

In [None]:
# from yellowbrick.target import FeatureCorrelation

# feature_names = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence','duration_ms','explicit','key','mode','year']

# X, y = data[feature_names], data['popularity']

# # Create a list of the feature names
# features = np.array(feature_names)

# # Instantiate the visualizer
# visualizer = FeatureCorrelation(labels=features)

# plt.rcParams['figure.figsize']=(20,20)
# visualizer.fit(X, y)     # Fit the data to the visualizer
# visualizer.show()

# **Data Understanding by Visualization and EDA**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Columns of `data` to correlate with one another and with the 'popularity' target
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms', 'explicit',
    'key', 'mode', 'year',
]

X = data[feature_names]
y = data['popularity']

# Features and target side by side, so one corr() call yields every pairwise coefficient
df = pd.concat([X, y], axis=1)
correlation_matrix = df.corr()

# Annotated heatmap of the full correlation matrix
plt.figure(figsize=(20, 20))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
).set_title('Feature Correlation with Seaborn')
plt.show()

# Correlation With Popularity

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Candidate columns of `data` to compare against the 'popularity' target
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms', 'explicit',
    'key', 'mode', 'year',
]

X = data[feature_names]
y = data['popularity']

# Correlation of each feature column with popularity (one coefficient per feature)
correlations = X.corrwith(y)

# One row per feature, ordered from the most positive to the most negative
# coefficient (signed value, not magnitude)
corr_df = pd.DataFrame({'Feature': feature_names, 'Correlation': correlations})
corr_df = corr_df.sort_values(by='Correlation', ascending=False)

# Horizontal bar chart of the sorted coefficients
plt.figure(figsize=(12, 8))
sns.barplot(
    data=corr_df,
    x='Correlation',
    y='Feature',
    palette='coolwarm',
).set_title('Feature Correlation with Popularity')
plt.show()

# Music Over Time

In [None]:
def get_decade(year):
    """Return the decade label for ``year``, e.g. ``1995`` -> ``'1990s'``.

    The year is divided by ten, truncated to an integer and multiplied back
    by ten to get the first year of its decade; an ``'s'`` suffix is appended.
    """
    return f"{int(year / 10) * 10}s"

# Label every track with its decade, then chart how many tracks fall in each decade
data['decade'] = data['year'].map(get_decade)

sns.set(rc={'figure.figsize':(11 ,6)})
sns.countplot(data=data, x='decade').set_title('Count of Songs per Decade')

In [None]:
# Plot the per-year values of these columns of year_data, one trace per column
sound_features = [
    'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'valence',
]
fig = px.line(data_frame=year_data, x='year', y=sound_features)
fig.show()

# Characteristics of Top 10 Genres

In [None]:
# The ten rows of genre_data with the highest 'popularity'
top10_genres = genre_data.nlargest(n=10, columns='popularity')

# Grouped bars: four audio traits side by side for each of those genres
fig = px.bar(
    data_frame=top10_genres,
    x='genres',
    y=['valence', 'energy', 'danceability', 'acousticness'],
    barmode='group',
)
fig.show()

# **Clustering Genres with K-Means**

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Numeric genre-level columns only. The 'cluster' column written at the end of
# this cell is numeric too, so it is excluded explicitly: otherwise re-running
# the cell would cluster on the labels produced by the previous run.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')

# Standardise the features, then group genres into 10 clusters.
# random_state fixes the k-means initialisation so labels are reproducible.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10, random_state=42))])
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
# Visualizing the Clusters with t-SNE

from sklearn.manifold import TSNE

# Standardise the genre features and embed them in 2-D with t-SNE.
# random_state pins the otherwise random layout so the plot is repeatable.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1, random_state=42))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)

# Attach labels by position rather than by index label: `projection` has a
# fresh 0..n-1 index, whereas genre_data's index is not guaranteed to be
# 0..n-1, and index-aligned assignment would then mismatch rows or yield NaN.
projection['genres'] = genre_data['genres'].to_numpy()
projection['cluster'] = genre_data['cluster'].to_numpy()

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

# **Clustering Songs with K-Means**

In [None]:
# Numeric song columns only. 'cluster_label' (written at the end of this cell)
# is excluded so that re-running the cell neither clusters on old labels nor
# changes the number of columns the scaler is fitted on; recommend_songs
# reuses this fitted scaler.
X = data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)

# Standardise the features, then group songs into 20 clusters.
# random_state fixes the k-means initialisation so labels are reproducible.
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=42))], verbose=False)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [None]:
# Visualizing the Clusters with PCA

from sklearn.decomposition import PCA

# Standardise the song features and project them onto two principal components.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)

# Attach titles and cluster labels by position rather than by index label:
# `projection` has a fresh 0..n-1 index, whereas data's index is not
# guaranteed to be 0..n-1, and index-aligned assignment would then mismatch
# rows or yield NaN.
projection['title'] = data['name'].to_numpy()
projection['cluster'] = data['cluster_label'].to_numpy()

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

# **Build Recommender System**

In [None]:
!pip install spotipy

In [None]:
from collections import defaultdict

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from kaggle_secrets import UserSecretsClient

# Credentials are read from Kaggle's secret store, so none are written in the notebook
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("SPOTIFY_CLIENT_ID")
secret_value_1 = user_secrets.get_secret("SPOTIFY_CLIENT_SECRET")

# Spotify client authenticated with the client-credentials pair above;
# find_song uses it through the global name `sp`
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=secret_value_0,
        client_secret=secret_value_1,
    )
)

def find_song(name, year):
    """Look a track up on Spotify and return its data as a one-row DataFrame.

    Uses the module-level client ``sp``: one search for the track, then one
    audio-features request for the first hit.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used in the search query.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame holding ``name``, ``year``, ``explicit`` (0/1),
        ``popularity`` and every key of the audio-features payload (its
        ``duration_ms`` replaces the one from the search hit). ``None`` when
        the search returns no track or Spotify provides no audio features
        for the track.
    """
    # NOTE(review): the query is sent with a space after each colon, exactly as
    # before; confirm against the Spotify search docs that the field filters
    # are honoured in this form.
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    features = sp.audio_features(track['id'])
    # The features list can be empty or hold None for a track without analysis;
    # treat that as "not found" instead of failing on None.items().
    if not features or features[0] is None:
        return None

    # List values give the frame its single row; the scalar audio features
    # added below are broadcast onto that row by the DataFrame constructor.
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    song_data.update(features[0])

    return pd.DataFrame(song_data)

In [None]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric columns that make up a song's feature vector, in the order they are
# read from a song record and passed to the fitted scaler.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]


def get_song_data(song, spotify_data):
    """Fetch the record for ``song`` from the local table, else from Spotify.

    ``song`` is a mapping with ``'name'`` and ``'year'`` keys. The first row of
    ``spotify_data`` whose name and year both match is returned as a Series.
    When no row matches, the lookup is delegated to ``find_song`` and its
    result is returned unchanged.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # No local match: fall back to the Spotify lookup
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Return the element-wise mean feature vector of the given songs.

    Each song is resolved with ``get_song_data``; songs that cannot be
    resolved are reported with a printed warning and skipped.

    Parameters
    ----------
    song_list : list of dict
        Songs to average, each with ``'name'`` and ``'year'`` keys.
    spotify_data : pandas.DataFrame
        Local song table passed through to ``get_song_data``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in ``number_cols``.

    Raises
    ------
    ValueError
        If none of the songs could be resolved, since there is nothing to
        average.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A local hit is a Series (shape (n,)) while a Spotify hit is a one-row
        # DataFrame (shape (1, n)); flatten both to (n,) floats so they stack.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the requested songs could be found in Spotify or in the database')

    song_matrix = np.vstack(song_vectors)
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Every key that occurs in any of the dicts maps to the list of values it
    had, in input order. Dicts do not need to share the same keys, and an
    empty ``dict_list`` yields an empty result.

    Parameters
    ----------
    dict_list : list of dict
        Records to merge.

    Returns
    -------
    collections.defaultdict
        Mapping of key -> list of values. Because the default factory is
        ``list``, reading a key that never occurred returns an empty list.
    """
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest to the average of the given seed songs.

    The seeds are averaged into one feature vector, that vector and every
    row of ``spotify_data`` are standardised with the scaler fitted inside
    ``song_cluster_pipeline``, and rows are ranked by cosine distance to the
    seed vector.

    Parameters
    ----------
    song_list : list of dict
        Seed songs, each with ``'name'`` and ``'year'`` keys.
    spotify_data : pandas.DataFrame
        Candidate songs; must contain the ``number_cols`` feature columns and
        the ``name``, ``year`` and ``artists`` columns.
    n_songs : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of dict
        Up to ``n_songs`` records with ``name``, ``year`` and ``artists``,
        nearest first. Rows whose name equals a seed name are never returned.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank every candidate, drop the seeds, and only then cut to n_songs.
    # Cutting first meant each seed found among the nearest rows cost one
    # recommendation, so fewer than n_songs came back.
    nearest_first = np.argsort(distances)
    is_seed = spotify_data['name'].isin(song_dict['name']).to_numpy()
    index = nearest_first[~is_seed[nearest_first]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [None]:
from sklearn.preprocessing import StandardScaler

# `df` is the feature + popularity frame assembled for the correlation heatmap.
# Standardise each of its columns before the decomposition.
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)

# Reduce the standardised data to five principal components
pca = PCA(n_components=5)
df_pca = pca.fit_transform(df_scaled)

# **Silhouette Score**

In [None]:
from sklearn.metrics import silhouette_score

# NOTE(review): this evaluates a separate 5-cluster KMeans fitted on `df_pca`,
# not the 20-cluster `song_cluster_pipeline` — confirm this is the clustering
# that is meant to be scored.
# random_state fixes the k-means initialisation so the reported scores are
# the same on every run.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(df_pca)

# Cluster assignment of every row of df_pca
labels = kmeans.labels_

# Silhouette score of that assignment
sil_score = silhouette_score(df_pca, labels)

print(f'Silhouette Score: {sil_score}')

# **Davies Bouldin Score**

In [None]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index of the same cluster labels on the PCA-reduced data
db_score = davies_bouldin_score(df_pca, labels)
print('Davies-Bouldin Index: {}'.format(db_score))

# **Song Recommendation**

In [None]:
# Seed the recommender with a single track (name + year). get_song_data first
# looks for a row of `data` with that exact name and year and falls back to a
# Spotify search through find_song when no row matches.
recommend_songs([{'name': 'Gods Plan', 'year':2018}],  data)