In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import sqlite3
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the track table from the CSV file next to the notebook and preview
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='database.csv')
df.head(n=5)


Unnamed: 0,name,popularity,duration_ms,explicit,artists,danceability,loudness,speechiness,acousticness,instrumentalness,energy,liveness,valence,tempo,time_signature,year,genres
0,drivers license,99,242014.0,1,Olivia Rodrigo,0.585,-8.761,0.0601,0.721,1.3e-05,0.436,0.105,0.132,143.874,4,2021,"['pop', 'post-teen pop']"
1,Astronaut In The Ocean,98,132780.0,0,Masked Wolf,0.778,-6.865,0.0913,0.175,0.0,0.695,0.15,0.472,149.996,4,2021,['australian hip hop']
2,Save Your Tears,97,215627.0,1,The Weeknd,0.68,-5.487,0.0309,0.0212,1.2e-05,0.826,0.543,0.644,118.051,4,2020,"['canadian contemporary r&b', 'canadian pop', ..."
3,Blinding Lights,96,200040.0,0,The Weeknd,0.514,-5.934,0.0598,0.00146,9.5e-05,0.73,0.0897,0.334,171.005,4,2020,"['canadian contemporary r&b', 'canadian pop', ..."
4,The Business,95,164000.0,0,Tiësto,0.798,-7.079,0.232,0.414,0.0192,0.62,0.112,0.235,120.031,4,2020,"['big room', 'brostep', 'dance pop', 'dutch ed..."


In [3]:
# Build the bag-of-words vocabulary from the same text that get_similarities
# later passes to `song_vectorizer.transform` (genres and artists joined by a
# space). Fitting on song titles instead would silently drop every genre or
# artist token that never appears in a title.
# (The original `#capture` line was presumably meant to be the `%%capture`
# cell magic; this cell produces no output, so it is not needed.)
song_text = (
    df['genres'].fillna('').astype(str)
    + ' '
    + df['artists'].fillna('').astype(str)
)
song_vectorizer = CountVectorizer()
song_vectorizer.fit(song_text)

# Keep only the 10,000 most popular tracks for the rest of the notebook.
df = df.sort_values(by=['popularity'], ascending=False).head(10000)

In [4]:
def get_similarities(song_name, data, text_weight=0.3, num_weight=0.7):
  """Score every row of ``data`` by its similarity to ``song_name``.

  The score is a weighted sum of two cosine similarities: one over the
  bag-of-words vector of the row's ``genres`` + ``artists`` text (built with
  the module-level ``song_vectorizer``) and one over the numeric audio
  features listed in ``num_cols``.

  Parameters
  ----------
  song_name : str
      Title to compare against. If several rows share this title, the first
      one in ``data`` is used as the query.
  data : pandas.DataFrame
      Must contain ``name``, ``genres``, ``artists`` and the numeric columns
      listed in ``num_cols``.
  text_weight, num_weight : float, optional
      Weights of the text and numeric similarities (defaults 0.3 and 0.7).

  Returns
  -------
  list of float
      One score per row of ``data``, in row order.

  Raises
  ------
  ValueError
      If ``data`` is non-empty but contains no row named ``song_name``.
  """
  num_cols = ['duration_ms', 'explicit', 'danceability', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'energy', 'liveness', 'valence', 'tempo']

  # Nothing to score.
  if len(data) == 0:
    return []

  # Position (not index label) of the query row; the first match wins.
  matches = np.flatnonzero((data['name'] == song_name).to_numpy())
  if matches.size == 0:
    raise ValueError(f"Song {song_name!r} not found in data")
  query_pos = int(matches[0])

  # Vectorise all rows once. Each row is scored from its own features, so
  # rows that share a title no longer all inherit the first match's score,
  # and the work is a single pass instead of one lookup/transform per row.
  text = data['genres'].fillna('').astype(str) + ' ' + data['artists'].fillna('').astype(str)
  text_matrix = song_vectorizer.transform(text)
  num_matrix = data[num_cols].to_numpy()

  # List indexing keeps the query as a 2-D (1 x n_features) slice.
  text_sim = cosine_similarity(text_matrix[[query_pos]], text_matrix)[0]
  # NOTE(review): the numeric features are not scaled; duration_ms is orders
  # of magnitude larger than the other columns, so this cosine is likely
  # close to 1 for every pair. Consider normalising the columns first.
  num_sim = cosine_similarity(num_matrix[[query_pos]], num_matrix)[0]

  # Combine text and numeric similarities using weights
  total_sim = (text_weight * text_sim) + (num_weight * num_sim)

  return total_sim.tolist()

In [5]:
def recommend_songs(song_name, data=df, top_n=5):
    """Recommend songs similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Title of the song to base the recommendations on.
    data : pandas.DataFrame, optional
        Track table with at least ``name``, ``artists`` and ``popularity``
        columns plus whatever ``get_similarities`` reads. The default is the
        ``df`` object that exists when this definition is executed. ``data``
        is not modified.
    top_n : int, optional
        Maximum number of songs to return (default 5).

    Returns
    -------
    tuple of (str, list of tuple)
        A message and a list of ``(name, artists)`` pairs:

        * unknown title: the fallback message and the ``top_n`` most popular
          songs in ``data``;
        * known title: the ``top_n`` most similar songs by other artists,
          ordered by similarity and then popularity;
        * any error: the error message (also printed) and an empty list.
    """
    def _as_pairs(frame):
        # (name, artists) tuples in row order; also works on an empty frame.
        return list(frame[['name', 'artists']].itertuples(index=False, name=None))

    try:
        # Base case
        if data[data['name'] == song_name].shape[0] == 0:
            message = 'This song is either not so popular or you have entered an invalid name. Some songs you may like:'
            suggestions = _as_pairs(
                data.sort_values(by=['popularity'], ascending=False).head(top_n)
            )
            return message, suggestions

        # assign() returns a new frame, so the caller's DataFrame (by default
        # the notebook-level df) does not gain a similarity_factor column.
        scored = data.assign(similarity_factor=get_similarities(song_name, data))

        # Filter out songs from the same artist (this also drops the query song).
        input_artist = scored.loc[scored['name'] == song_name, 'artists'].iloc[0]
        ranked = scored[scored['artists'] != input_artist].sort_values(
            by=['similarity_factor', 'popularity'], ascending=[False, False]
        )

        return f"Songs similar to '{song_name}':", _as_pairs(ranked.head(top_n))
    except Exception as e:
        # Deliberately best-effort: report the problem without aborting the
        # notebook, and keep the (message, list) return shape.
        message = f"This song has no recommendations or is not availiable : {e}"
        print(message)
        return message, []



In [None]:
# Example query: run the recommender for a single title against the track table.
recommend_songs(song_name='Shape of You', data=df)