In [1]:
# Install required packages
!pip install -q kagglehub pandas numpy matplotlib seaborn yellowbrick plotly scikit-learn

# Import necessary libraries
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the dataset using kagglehub
try:
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        "tonygordonjr/spotify-dataset-2023",
        "spotify_data_12_20_2023.csv"
    )
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    import sys
    sys.exit(1)

# Selecting relevant features
df = df[['track_name', 'artists', 'artist_genres', 'explicit','album_name', 'release_year', 'danceability',
           'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness',
           'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']]

df['artists'] = df['artists'].str.replace(r"['\[\]]", '', regex=True)
# Processing artists genres to fit the purpose of the project
import ast

list_of_lists = [ast.literal_eval(s) for s in df['artist_genres']]
df['artist_genres'] = [sublist[0] if sublist else None for sublist in list_of_lists]
df['artist_genres'].fillna("unknown", inplace=True)
# Renaming column
df.rename(columns={'track_name':'name'}, inplace=True)
df.info() # Summary of DataFrame information

print('\nNumber of unique values in each column')
for i in df.columns:
    print(f'{i} - {df[i].nunique()}')

print('\nNumber of missing values in each column\n', df.isnull().sum())

print('\nNumber of duplicated rows\n', df.duplicated().sum())
# Dropping rows with empty values
df = df.dropna()
print('Length of the dataset:', len(df))
# Setting value type to integer
df['explicit'] = df['explicit'].astype(int)
df['release_year'] = df['release_year'].astype(int)
# Applying Label Encoder to artist genre values
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['artist_genres_encoded'] = label_encoder.fit_transform(df['artist_genres'])
df.describe()
# Reseting dataframe index
df.reset_index(drop=True, inplace=True)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8-bright')
colors = sns.color_palette('bright')

fig, axes = plt.subplots(1, 4, figsize=(15, 4))  # 1 row, 4 columns

for i, name in enumerate(['explicit', 'key', 'mode', 'time_signature']):
    axes[i].pie(df[name].value_counts(), labels=df[name].unique(), autopct='%1.1f%%', startangle=45)
    axes[i].set_title(f'{name}')

# Display the plot
plt.tight_layout()
plt.show()
# #Top 10 genres based on frequency
# plt.figure(figsize=(15, 4))
# df['artist_genres'].value_counts().head(10).iloc[1:].plot(kind='barh',title='Top genres')
# plt.ylabel('Music Genres')
# plt.grid(True)
# plt.show()

# #Top 10 artists based on frequency
# plt.figure(figsize=(15, 4))
# df['artists'].value_counts().head(10).plot(kind='barh',title='Top artists')
# plt.ylabel('Artists')
# plt.grid(True)
# plt.show()

# Histogram Analysis of Continuous Variables
for i in df.drop(['explicit', 'key', 'mode', 'time_signature', 'name', 'artists', 'album_name', 'artist_genres', 'artist_genres_encoded'],axis=1):
    fig, ax = plt.subplots(figsize=(15, 4))
    fig = sns.histplot(data=df, x=i, bins=50, kde=True)
    fig.set_title(f'{i} Count')
    fig.grid(True)
    plt.show()

# Correlation matrix
import numpy as np

plt.figure(figsize=(15,6))
plt.imshow(df[df.select_dtypes(np.number).columns].corr(), vmin=-1, vmax=1)
plt.xticks(range(len(df.select_dtypes(np.number).columns)), df.select_dtypes(np.number).columns, rotation=45, ha='right')
plt.yticks(range(len(df.select_dtypes(np.number).columns)), df.select_dtypes(np.number).columns)
plt.title('Correlation matrix')
plt.grid(True)
plt.colorbar()

plt.show()
# Selecting training features (numerate columns)
features = df.select_dtypes(np.number).columns
# Rescaling data using Standard Scaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

df_scaled = pd.DataFrame(df_scaled, columns=features)
df_scaled.head()
# Elbow Method to determine the number of clusters to be formed
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

plt.figure(figsize=(15,5))
Elbow_M = KElbowVisualizer(KMeans(n_init='auto'), k=20)
Elbow_M.fit(df_scaled)
Elbow_M.show()
# Using KMeans model 11 clusters
model = KMeans(n_clusters=11, n_init='auto')
model.fit(df_scaled)

df_scaled["Cluster"] = model.labels_
# Visualizing the Clusters with PCA
import plotly.express as px
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
embedding = pca.fit_transform(df_scaled)
projection = pd.DataFrame(columns=['x', 'y'], data=embedding)
projection['title'] = df['name']
projection['genre'] = df['artist_genres']
projection['cluster'] = df_scaled['Cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genre'])
fig.show()
# Re-adding string features (Name, Artist, Genre, Album)
df_scaled = df_scaled.join(df[df.drop(features, axis=1).columns])
# Re-adding Year feature
df_scaled['release_year'] = df['release_year']
df_scaled.head()

  from .autonotebook import tqdm as notebook_tqdm


Error loading dataset: Error reading file: 'utf-8' codec can't decode byte 0xc0 in position 10: invalid start byte


SystemExit: 1

In [None]:
from sklearn.neighbors import NearestNeighbors

# Selecting only numerical features for KNN
numerical_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                      'speechiness', 'acousticness', 'instrumentalness', 
                      'liveness', 'valence', 'tempo', 'time_signature']

# Train KNN on scaled data
knn = NearestNeighbors(n_neighbors=6, metric='euclidean')
knn.fit(df_scaled[numerical_features])

def recommend_songs(song_name, df, df_scaled, knn, numerical_features):
    """
    Recommend similar songs based on the input song name.
    """
    # Find the song in the dataset
    song_index = df[df['name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset. Please try another."

    song_index = song_index[0]
    
    # Get song features
    song_features = df_scaled.iloc[song_index][numerical_features].values.reshape(1, -1)
    
    # Find nearest neighbors
    distances, indices = knn.kneighbors(song_features)
    
    # Get recommended songs (excluding the first one as it's the input song itself)
    recommended_songs = df.iloc[indices[0][1:]][['name',     'artist_genres']]
    
    return recommended_songs

# Example usage:
song_input = "Don'T Call"  # Replace with your desired song
recommendations = recommend_songs(song_input, df, df_scaled, knn, numerical_features)
print(recommendations)
