# Load the Data

In [None]:
import pandas as pd
from google.oauth2 import service_account

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, accuracy_score


import pyarrow.parquet as pq


In [None]:
from google.colab import auth

# Colab-only helper: collects the current user's Google credentials.
# Presumably required so the BigQuery query in the next cell is authorized —
# confirm if this notebook is ever run outside Colab.
auth.authenticate_user()

In [None]:
# Query BigQuery: pull every column of the cleaned song table into a DataFrame.
query = "SELECT * FROM `music-recommendation-system-24.30k_songlist.spotify_songs_cleaned_V1_csv`"
project = "music-recommendation-system-24"

# NOTE(review): pandas.read_gbq is reported as deprecated in recent pandas
# releases in favour of pandas_gbq.read_gbq — confirm against the installed
# version before upgrading pandas.
df = pd.read_gbq(query=query, project_id=project)

  df = pd.read_gbq(query=query, project_id=project)


In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

(32833, 23)

# Clean the Data

In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
track_id,0
track_name,0
track_artist,0
track_popularity,0
track_album_id,0
track_album_name,0
track_album_release_year,0
playlist_name,0
playlist_id,0
playlist_genre,0


In [None]:
# Drop rows missing any of the identifying text fields in a single pass
# (equivalent to three consecutive single-column dropna calls), then rebuild a
# contiguous 0..n-1 index. Without the reset, any dropped row would leave gaps
# in the index, and later cells in this notebook use index labels as
# positional (iloc) offsets.
df = (
    df
    .dropna(subset=['track_name', 'track_artist', 'track_album_name'])
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) after removing rows with missing text fields; compare with
# the shape shown right after loading to see how many rows were dropped.
df.shape

(32833, 23)

In [None]:
# Count fully duplicated rows. Displaying the raw boolean mask (one entry per
# row) does not answer whether duplicates exist; the sum of the mask does.
df.duplicated().sum()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
32828,False
32829,False
32830,False
32831,False


In [None]:
# Audio-feature columns that are re-parsed from their textual form.
text_encoded_features = [
    "valence", "duration_ms", "loudness", "danceability", "energy",
    "acousticness", "instrumentalness", "liveness", "speechiness",
]

for feature in text_encoded_features:
    # Normalise the text: strip any '%' sign and turn decimal commas into dots.
    as_text = df[feature].astype(str)
    cleaned = as_text.str.replace('%', '').str.replace(',', '.')

    # Parse to a numeric dtype; values that still cannot be parsed become NaN
    # (errors='coerce') rather than raising.
    df[feature] = pd.to_numeric(cleaned, errors='coerce')

# Exploring the Data

In [None]:
# Column dtypes and non-null counts, to verify the numeric conversion above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32833 non-null  object 
 2   track_artist              32833 non-null  object 
 3   track_popularity          32833 non-null  Int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32833 non-null  object 
 6   track_album_release_year  32833 non-null  Int64  
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key_name                  32833 non-null  object 
 14  loudne

In [None]:
# List the available column names.
print(df.columns)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_year',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key_name', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


# Preprocessing the Data

In [None]:
# Min-max scaling maps each numeric column onto a common range (MinMaxScaler's
# default is [0, 1]). K-NN is distance-based, so without a common range the
# columns with the largest raw magnitudes would dominate the distance
# computation.
#
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, and the scaled columns include 'track_popularity', which is later
# used as the regression target.
scaler = MinMaxScaler()
numeric_columns = ['track_popularity','track_album_release_year', 'danceability','energy','loudness', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'duration_ms','valence','tempo']
# Overwrites the original values in df with their scaled versions.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df[numeric_columns].head()

Unnamed: 0,track_popularity,track_album_release_year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,duration_ms,valence,tempo
0,0.64,0.984127,0.587996,0.785957,0.875532,0.141612,0.2334,0.000107,0.12249,0.341121,0.350151,0.443515
1,0.67,0.984127,0.45473,0.683937,0.874358,0.098039,0.176056,0.0,0.47992,0.332944,0.440969,0.527197
2,0.81,0.968254,0.704985,0.574915,0.826645,0.087255,0.376258,0.0,0.253012,0.403037,0.496468,0.493724
3,0.56,0.984127,0.649034,0.817964,0.878633,0.054466,0.131791,1.8e-05,0.10743,0.318925,0.493441,0.527197
4,0.7,0.984127,0.803662,0.745949,0.848396,0.054684,0.242455,0.0,0.317269,0.405374,0.542886,0.523013


# Build and Train K-NN Model

In [None]:
# NOTE(review): numeric_columns contains 'track_popularity', so the target y is
# also one of the feature columns in X (target leakage). Any score computed
# from this X/y pair is therefore optimistic. Removing the column here also
# requires the recommendation cell to query the model with the same reduced
# column list, otherwise the feature counts will not match.
X = df[numeric_columns]  # Features (all scaled numeric columns)
y = df['track_popularity']  # Regression target

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity-check the sizes of the train and test partitions.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(26266, 12) (6567, 12) (26266,) (6567,)


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Initialize the K-NN Regressor; each prediction averages the targets of the
# 10 nearest training rows.
knn = KNeighborsRegressor(n_neighbors=10)

# Train the model on the training partition only. Neighbor indices returned by
# knn.kneighbors() therefore refer to row positions within X_train, not to row
# positions within df.
knn.fit(X_train, y_train)



# Evaluate the Model





In [None]:
# Evaluate the model on the held-out rows. For a regressor, score() reports
# the coefficient of determination (R^2).
# NOTE(review): X contains 'track_popularity', which is also the target y, so
# this figure is inflated by target leakage and should not be read as real
# predictive skill.
score = knn.score(X_test, y_test)
print(f'R-squared: {score:.2f}')

R-squared: 0.98


# Use Model for Recommendations

In [None]:
def recommend_songs(song_index, df, knn, numeric_columns, n_neighbors=20):
    """Return the songs in ``df`` whose features are closest to a chosen song.

    Similarity is the Euclidean distance over ``numeric_columns``, computed
    directly against the rows of ``df`` so that every returned position is
    guaranteed to refer to a row of ``df``.

    Args:
        song_index: Positional (0-based, ``iloc``-style) row number of the
            query song in ``df``.
        df: DataFrame holding ``track_name``, ``track_artist`` and every column
            named in ``numeric_columns``.
        knn: Accepted for backward compatibility and not used. A model fitted
            on a train split reports neighbor indices relative to its training
            matrix, which cannot be used to index ``df``.
        numeric_columns: Names of the feature columns to compare.
        n_neighbors: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the ``track_name`` and ``track_artist`` of up to
        ``n_neighbors`` rows, ordered from most to least similar, or ``None``
        (after printing an error) when ``song_index`` is out of bounds.
    """
    # Ensure the song_index is within the bounds of the DataFrame
    if song_index < 0 or song_index >= len(df):
        print("Error: song_index is out of bounds")
        return None

    # Plain float matrix of all candidate rows (also converts nullable dtypes).
    features = df[numeric_columns].astype(float).to_numpy()

    # Squared Euclidean distance from the query row to every row; squaring
    # preserves the ordering, so the square root is not needed for ranking.
    deltas = features - features[song_index]
    distances = (deltas ** 2).sum(axis=1)

    # Stable sort keeps tied rows in their original order. The query row is
    # removed by position rather than by assuming it is always the first hit,
    # which does not hold when identical feature rows exist.
    ranked = distances.argsort(kind="stable")
    ranked = ranked[ranked != song_index][:max(n_neighbors, 0)]

    return df.iloc[ranked][['track_name', 'track_artist']]

def get_song_index_by_name_and_artist(song_name, artist_name, df):
    """Find the row position of the first song matching a title and an artist.

    Both inputs are matched as case-insensitive literal substrings. They are
    not interpreted as regular expressions, so characters such as ``(``, ``?``
    or ``.`` in a title are matched as typed.

    Args:
        song_name: Text that must appear in ``track_name``.
        artist_name: Text that must appear in ``track_artist``.
        df: DataFrame with ``track_name`` and ``track_artist`` columns.

    Returns:
        The 0-based position (suitable for ``df.iloc``) of the first matching
        row as an ``int``, or ``None`` (after printing an error) when no row
        matches. For a frame with the default 0..n-1 index this equals the
        index label.
    """
    name_hit = df['track_name'].str.contains(song_name, case=False, na=False, regex=False)
    artist_hit = df['track_artist'].str.contains(artist_name, case=False, na=False, regex=False)

    # Convert the combined mask into row positions so the result stays valid
    # even when the index labels are not a contiguous 0..n-1 range.
    positions = (name_hit & artist_hit).to_numpy(dtype=bool).nonzero()[0]

    if positions.size == 0:
        print("Error: Song or artist not found in the database")
        return None
    return int(positions[0])  # First match

# Example usage: interactively ask for a song and an artist, then print the
# recommendations for the first matching row.
song_name = input("Enter the name of the song: ")  # Song title (or part of it)
artist_name = input("Enter the name of the artist: ")  # Artist name (or part of it)

# Look up the song's row; None means nothing matched (the helper has already
# printed an error in that case).
song_index = get_song_index_by_name_and_artist(song_name, artist_name, df)

if song_index is not None:
    recommended_songs = recommend_songs(song_index, df, knn, numeric_columns, n_neighbors=20)
    # recommend_songs returns None when the index is out of bounds.
    if recommended_songs is not None:
        print("Recommended songs:")
        print(recommended_songs)


Enter the name of the song: Foolish
Enter the name of the artist: Ashanti
Recommended songs:
                             track_name      track_artist
19083                        This Place     Logan Pollard
6019                      Crazy In Love   Anthony Dircson
12267  Lonely Together (feat. Rita Ora)            Avicii
5423                              Yummy     Justin Bieber
3118                        Death Route  Sidhu Moose Wala
20179                Smile - EP Version     Janelle Monáe
3619                Insane in the Brain      Cypress Hill
15561             You're The Reason Why      Aaron Taylor
7343                          Price Tag          Jessie J
5201                            Emotion     Mia Rodriguez
18287                        Ottolenghi      Loyle Carner
686                   All Day And Night         Jax Jones
10706                    Thick And Thin              LANY
15903                        Sweatshirt          X Lovers
21895                              Bu

