Music Recommendation System

✅ Step 1: Import Required Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

✅ Step 2: Load the Dataset

In [2]:
df = pd.read_csv('spotify_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


✅ Step 3: Select Important Features for Recommendation

In [3]:
features = ['danceability', 'energy', 'valence', 'tempo', 'loudness',
            'speechiness', 'acousticness', 'instrumentalness', 'liveness']

df_selected = df[features].copy()


✅ Step 4: Handle Missing Values (if any)

In [4]:
df_selected.dropna(inplace=True)

✅ Step 5: Normalize the Features

In [5]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_selected)

✅ Step 6: Compute Similarity Matrix

In [6]:
# Sample only 5000 rows from scaled data
df_sampled = df_scaled[:5000]  # Assuming df_scaled is based on df_selected

from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(df_sampled)

# Get distances and indices for top 6 nearest neighbors
distances, indices = model.kneighbors(df_sampled, n_neighbors=6)

Option 1: Reduce Dataset Size (Recommended for Testing)

In [7]:
# Step 6 - Sample and Compute Similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

features = ['danceability', 'energy', 'valence', 'tempo', 'popularity']  # adjust as per your dataset

df_sample = df.sample(n=20000, random_state=88)  # reduced sample
df_selected = df_sample[features].copy()

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_selected)

similarity_matrix = cosine_similarity(df_scaled)

✅ Step 7: Build Recommendation Function

In [8]:
import numpy as np

# Reset index to align row numbers with song index
df_sample = df_sample.reset_index(drop=True)

def recommend(song_name):
    # Find index of the song
    index = df_sample[df_sample['track_name'].str.lower() == song_name.lower()].index
    
    if len(index) == 0:
        return f"❌ Song '{song_name}' not found in dataset."
    
    index = index[0]

    # Get pairwise similarity scores
    similarity_scores = list(enumerate(similarity_matrix[index]))

    # Sort songs by similarity score (excluding the first one - itself)
    sorted_songs = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:6]

    # Fetch top 5 similar songs
    recommendations = []
    for i in sorted_songs:
        recommended_song = df_sample.iloc[i[0]]['track_name']
        recommendations.append(recommended_song)
    
    return recommendations


✅ Step 8: Test the Recommender

In [9]:
recommend("perfect")

['Weekend!',
 'Us',
 'RADIO SONG (feat. Jeremy McKinnon)',
 'Brighter',
 'Leverage']

In [10]:
df_sample['track_name'].sample(10).tolist()

['Sandakozhi',
 'Development',
 'Sleep 4',
 'Way Beyond - Original',
 '(Is This the Way To) Amarillo',
 'carpool',
 'This Is What It Feels Like - David Guetta Remix',
 'E Samba 2018 - Original Mix',
 'Completou',
 'Shine On']

✅ Step 9: Add Artist or Genre Filter (for more accurate recommendations)

Modify the Recommend Function with Filters

In [11]:
def recommend_with_filters(song_name, filter_by='genre'):
    # Find index of the song
    index = df_sample[df_sample['track_name'].str.lower() == song_name.lower()].index

    if len(index) == 0:
        return f"❌ Song '{song_name}' not found in dataset."
    
    index = index[0]

    # Base song details
    base_song = df_sample.iloc[index]
    base_genre = base_song['genre']
    
    # Get similarity scores
    similarity_scores = list(enumerate(similarity_matrix[index]))

    # Sort songs by similarity score
    sorted_songs = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Filter songs by genre or mood
    recommendations = []
    for i in sorted_songs:
        candidate_song = df_sample.iloc[i[0]]

        # Skip the same song
        if candidate_song['track_name'].lower() == song_name.lower():
            continue

        # Filter by genre
        if filter_by == 'genre':
            if candidate_song['genre'] == base_genre:
                recommendations.append(candidate_song['track_name'])
        
        # Filter by mood (valence, energy, danceability similarity)
        elif filter_by == 'mood':
            mood_features = ['valence', 'energy', 'danceability']
            base_mood = base_song[mood_features].values
            candidate_mood = candidate_song[mood_features].values

            diff = np.linalg.norm(base_mood - candidate_mood)
            if diff < 0.3:  # you can tune this threshold
                recommendations.append(candidate_song['track_name'])

        # Stop when 5 recommendations are collected
        if len(recommendations) == 5:
            break
    
    return recommendations if recommendations else "No similar songs found with the selected filter."


✅ Step 2: Test Genre and Mood Filters

In [12]:
recommend_with_filters("perfect", filter_by='genre')

recommend_with_filters("perfect", filter_by='mood')

['Weekend!',
 'Us',
 'RADIO SONG (feat. Jeremy McKinnon)',
 'Brighter',
 'Leverage']

✅ Bonus Step: Add Artist Filter

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Scale features
features = ['valence', 'energy', 'danceability', 'tempo', 'acousticness', 'instrumentalness', 'liveness']
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_sample[features])

# Step 2: Define recommendation function
def recommend_songs(song_name, filter_by='mood'):
    recommendations = []
    
    try:
        song_index = df_sample[df_sample['track_name'].str.lower() == song_name.lower()].index[0]
    except IndexError:
        print("❌ Song not found in the dataset.")
        return []

    base_song = df_sample.iloc[song_index]
    similarity_scores = cosine_similarity([df_scaled[song_index]], df_scaled)[0]
    sorted_songs = sorted(list(enumerate(similarity_scores)), key=lambda x: x[1], reverse=True)

    base_genre = base_song['genre']
    mood_features = ['valence', 'energy', 'danceability']
    base_mood = base_song[mood_features].values

    for i in sorted_songs:
        candidate_song = df_sample.iloc[i[0]]

        # Skip same song
        if candidate_song['track_name'].lower() == song_name.lower():
            continue
        
        # (Optional) Skip same artist
        if candidate_song['artist_name'] == base_song['artist_name']:
            continue  # or apply priority logic

        if filter_by == 'genre':
            if candidate_song['genre'] == base_genre:
                recommendations.append(candidate_song['track_name'])

        elif filter_by == 'mood':
            candidate_mood = candidate_song[mood_features].values
            diff = np.linalg.norm(base_mood - candidate_mood)
            if diff < 0.3:
                recommendations.append(candidate_song['track_name'])

        if len(recommendations) == 5:
            break

    return recommendations

# Step 3: Use the function
recommended = recommend_songs('perfect', filter_by='mood')
print("🎵 Recommended Songs:")
for song in recommended:
    print("→", song)


🎵 Recommended Songs:
→ Stargazer
→ Call Your Bluff
→ Leverage
→ (Concerto for) Me and Myself - Remastered
→ My Wish - Remastered Version


✅ Check for Hindi songs by artist or title

In [14]:
# Lowercase all track names for easier searching
df_sample['track_name_lower'] = df_sample['track_name'].str.lower()

# Filter with common Hindi keywords
hindi_keywords = ['love', 'tera', 'dil', 'tum', 'yaar', 'mein', 'aankh', 'pyar', 'sanam', 'zindagi']

# Check for Hindi-sounding track names
hindi_songs = df_sample[df_sample['track_name_lower'].apply(lambda x: any(word in x for word in hindi_keywords))]

# Display the results
print(hindi_songs[['track_name', 'artist_name']].head(10))


                                       track_name            artist_name
38                               Bhakthiyudaiyaar  Sirkazhi Govindarajan
82                     It's Only Love - Radio Mix                Optical
114                              Universe of Love                DJ Shah
157                Everybody's Talking 'bout Love      Silver Convention
173  Love The One You're With (with James Taylor)  Crosby, Stills & Nash
187       Stumblin' In - The Distance & Igi Remix            Ahmet Kilic
206                       Shaker Love Song (Leah)            Josh Ritter
235                                    First Love                   Clon
250                                    Tough Love          The Rival Mob
263                            Tum Jo Mil Gaye Ho             Mika Singh


In [15]:
hindi_artists = ['Arijit Singh', 'Neha Kakkar', 'Badshah', 'Shreya Ghoshal', 'Atif Aslam']
hindi_songs_by_artist = df_sample[df_sample['artist_name'].isin(hindi_artists)]
print(hindi_songs_by_artist[['track_name', 'artist_name']].head(10))

                                         track_name     artist_name
1613                          Milne Hai Mujhse Aayi    Arijit Singh
2187                    Kanninima Neele - Version 1  Shreya Ghoshal
2430   Tujhe Kitna Chahne Lage (From "Kabir Singh")    Arijit Singh
3384                                  Dekhte Dekhte      Atif Aslam
3418                  Tera Hone Laga Hoon - Jhankar      Atif Aslam
3752                        Sudhu Bhalobasha (Solo)  Shreya Ghoshal
7431                                     Chhod Diya    Arijit Singh
7868                               Aai Jo Teri Yaad  Shreya Ghoshal
10100                                     Piya Bina    Arijit Singh
10259                              Dil Diyan Gallan      Atif Aslam


 To see all songs of an artist:

In [16]:
# Example: Arijit Singh ke songs dekhne ke liye
artist_name = 'Neha Kakkar'

# Filter karna dataset me
artist_songs = df_sample[df_sample['artist_name'].str.lower() == artist_name.lower()]

# Display only track names
print(f"🎤 Songs by {artist_name}:")
print(artist_songs['track_name'].unique())


🎤 Songs by Neha Kakkar:
[]


To Find Artist by Song Name

In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Load the dataset
df = pd.read_csv("spotify_data.csv")

# Step 1: Sample small dataset
df_sample = df.sample(n=3000, random_state=42)

# Step 2: Select features for similarity
features = ['danceability', 'energy', 'valence', 'tempo', 'popularity']
df_selected = df_sample[features].copy()

# Step 3: Scale features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_selected)

# Step 4: Fit Nearest Neighbors model
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(df_scaled)

# Step 5: Recommendation function (and artist lookup)
def recommend(song_name):
    df_sample_reset = df_sample.reset_index(drop=True)
    
    # Find the index of the input song
    index = df_sample_reset[df_sample_reset['track_name'].str.lower() == song_name.lower()].index

    if len(index) == 0:
        return f"❌ Song '{song_name}' not found in dataset."
    
    index = index[0]
    
    # Find 5 nearest neighbors
    distances, indices = model.kneighbors([df_scaled[index]], n_neighbors=6)  # 1 is the song itself
    recommendations = []

    for i in indices[0][1:]:  # Skip first (same song)
        track = df_sample_reset.iloc[i]['track_name']
        artist = df_sample_reset.iloc[i]['artist_name']
        recommendations.append(f"{track} by {artist}")

    return recommendations

# Step 6: Artist lookup (normal Jupyter output)
def get_artist(song_name):
    match = df[df['track_name'].str.lower() == song_name.lower()]
    if match.empty:
        return f"❌ Song '{song_name}' not found."
    return match[['track_name', 'artist_name']].drop_duplicates()


In [35]:
recommend("perfect")

['Te Deum H146 - Prélude en Rondeau by Marc-Antoine Charpentier',
 'Narkopop 7 by GAS',
 'The Lakes by James Vincent McMorrow',
 'In Grazia Del Tuo Nome, Con Amore by Alessandra Celletti',
 'Would You Love Me Anyway by Katrina Elam']

In [36]:
get_artist("pee loon")

Unnamed: 0,track_name,artist_name
87351,Pee Loon,Pritam
1101647,Pee Loon,Mohit Chauhan
1101899,Pee Loon,Tulsi Kumar
1141234,Pee Loon,Pinkoo Joseph
