In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

# Number of recommendations produced for each track.
N: int = 12

# Destination of the recommendations table. Despite the "CSV" in the name,
# the value is a pickle path and the table is written with DataFrame.to_pickle.
OUTPUT_CSV: str = "recommendations.pkl"

In [2]:
# Load the track table (metadata, tags and audio features) from the CSV export.
music_info_df = pd.read_csv(filepath_or_buffer='Music_Info.csv')

# Preview the first five rows as a quick sanity check of the load.
music_info_df.head(5)


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [3]:
# Tally the missing values per column once, then reuse the tally.
nan_counts = music_info_df.isna().sum()

# 1) NaN count for every column
print(nan_counts)
# 2) NaN count for the 'tags' column on its own
print(nan_counts['tags'])

track_id                   0
name                       0
artist                     0
spotify_preview_url        0
spotify_id                 0
tags                       0
genre                  27640
year                       0
duration_ms                0
danceability               0
energy                     0
key                        0
loudness                   0
mode                       0
speechiness                0
acousticness               0
instrumentalness           0
liveness                   0
valence                    0
tempo                      0
time_signature             0
dtype: int64
0


In [4]:
# Turn the comma-separated tag string into a space-separated one so that a
# whitespace tokenizer sees each tag as a single token.
# Missing tag strings become '' first, so every element splits into a list.
tag_lists = music_info_df['tags'].fillna('').str.split(r'\s*,\s*')
music_info_df['tags_joined'] = tag_lists.apply(' '.join)

In [5]:
# TF–IDF over the space-joined tag strings.
# The token pattern treats every run of non-space characters as one token,
# so multi-word tags such as 'alternative_rock' stay intact.
tfidf = TfidfVectorizer(lowercase=True, token_pattern=r'[^ ]+')
X_text = tfidf.fit_transform(music_info_df['tags_joined'])

# Report the learned vocabulary (one entry per distinct tag) and the matrix shape.
vocab = list(tfidf.vocabulary_)
print("tags:", len(vocab), vocab)
print(f"Tags TF–IDF matrix: {X_text.shape}")


tags: 100 ['rock', 'alternative', 'indie', 'alternative_rock', 'indie_rock', '00s', 'pop', 'british', '90s', 'love', 'britpop', 'grunge', 'piano', 'beautiful', 'mellow', 'chillout', 'funk', 'electronic', 'dance', 'hip_hop', 'trip_hop', 'metal', 'hard_rock', 'nu_metal', 'progressive_rock', 'classic_rock', '80s', '70s', 'experimental', 'soundtrack', 'female_vocalists', 'heavy_metal', 'rap', '60s', 'post_punk', 'pop_rock', 'punk', 'punk_rock', 'acoustic', 'emo', 'new_wave', 'chill', 'indie_pop', 'gothic_metal', 'gothic', 'thrash_metal', 'folk', 'singer_songwriter', 'country', 'cover', 'blues_rock', 'psychedelic', 'psychedelic_rock', 'guitar', 'american', 'ska', 'ambient', 'blues', 'oldies', 'french', 'male_vocalists', 'reggae', 'instrumental', 'jazz', 'avant_garde', 'industrial', 'german', 'synthpop', 'hardcore', 'metalcore', 'screamo', 'post_rock', 'swedish', 'doom_metal', 'power_metal', 'rnb', 'soul', 'electro', 'house', 'techno', 'post_hardcore', 'progressive_metal', 'russian', 'noise'

In [6]:
# Densify the TF–IDF row of the first track for visual inspection.
first_track_tfidf = X_text[0, :]
first_track_tfidf.toarray()

array([[0.52500743, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37211552, 0.42263951, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.3539553 ,
        0.        , 0.43291579, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [7]:
# Standardise the numeric audio features (zero mean, unit variance per column).
num_cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Any missing value is replaced by 0 before scaling.
audio_features = music_info_df[num_cols].fillna(0).to_numpy()
scaler = StandardScaler()
X_num = scaler.fit_transform(audio_features)
print(f"Numeric feature matrix: {X_num.shape}")


Numeric feature matrix: (49556, 9)


In [8]:
# Join the sparse tag TF–IDF block and the scaled audio-feature block column-wise.
from scipy.sparse import csr_matrix

# Wrap the dense array as a sparse matrix before stacking it next to X_text.
# NOTE(review): TF–IDF rows are L2-normalised by default, while each scaled
# audio column has unit variance, so the audio features may dominate the
# cosine similarity — confirm this weighting is intended.
X_num_sparse = csr_matrix(X_num)
X = hstack([X_text, X_num_sparse])
print(f"Combined feature matrix: {X.shape}")

Combined feature matrix: (49556, 109)


In [9]:
# Brute-force cosine k-nearest-neighbour search over the combined features.
from sklearn.neighbors import NearestNeighbors

# Ask for one neighbour more than N: the rows being queried are the rows the
# index was fitted on, so each track is expected to show up in its own
# neighbour list (normally first, at distance 0) and has to be discarded later.
knn_params = dict(n_neighbors=N + 1, metric='cosine', algorithm='brute')
nbrs = NearestNeighbors(**knn_params)
nbrs.fit(X)

# distances[i] / indices[i] hold the N+1 nearest rows of track i.
distances, indices = nbrs.kneighbors(X)

In [10]:
# Build the recommendations DataFrame: one row per track with columns
# spotify_id, rec_1..rec_N (neighbour ids) and score_1..score_N (similarities).
track_ids = music_info_df['spotify_id'].to_numpy()  # looked up once, not per row

recs = []
for i, track in enumerate(track_ids):
    # Remove the query track by index rather than by position: with exact
    # duplicates or tied distances the track is not guaranteed to be ranked
    # first, and dropping position 0 would then recommend the track to itself.
    not_self = indices[i] != i
    # If the track is missing from its own list, all N+1 rows survive the
    # filter, so cap at N to keep every row the same width.
    idxs = indices[i][not_self][:N]
    sims = 1 - distances[i][not_self][:N]  # cosine distance -> similarity

    row = {'spotify_id': track}
    row.update({f'rec_{k}': track_ids[j] for k, j in enumerate(idxs, start=1)})
    row.update({f'score_{k}': s for k, s in enumerate(sims, start=1)})
    recs.append(row)

recs_df = pd.DataFrame(recs)

In [11]:
# Only the recommended ids are kept; discard every score_* column in place.
score_cols = [name for name in recs_df.columns if name.startswith("score")]
recs_df.drop(labels=score_cols, axis="columns", inplace=True)

In [12]:
# Collapse the per-rank rec_* columns into one list-valued column and persist it.
# The column names are derived from N rather than a hard-coded 13, so changing
# N in the config cell cannot silently truncate the lists or raise a KeyError.
rec_cols = [f"rec_{i}" for i in range(1, N + 1)]
temp_df = music_info_df[["spotify_id"]].copy()
# .values.tolist() yields plain Python lists, so the assignment is positional;
# recs_df rows are in music_info_df order because recs was built by iterating it.
temp_df["recommendations"] = recs_df[rec_cols].values.tolist()
recs_df = temp_df
# Despite its name, OUTPUT_CSV holds a pickle path.
recs_df.to_pickle(OUTPUT_CSV)

In [13]:
# Round-trip check: read the table back from the same path it was written to
# (OUTPUT_CSV) instead of repeating the file name, so the two cannot drift apart.
# Only unpickle files this notebook produced; pickle is unsafe on untrusted data.
recommendations = pd.read_pickle(OUTPUT_CSV)
recommendations.head()

Unnamed: 0,spotify_id,recommendations
0,09ZQ5TmUG8TSL56n0knqrj,"[03ka8w3Z7H0JbE6BPYRjKD, 00745jzyTNmGl4cilzuMD..."
1,06UfBBDISthj1ZJAtX4xjj,"[3sER3wFKjJ0wgTHHUREyCe, 17AVdBSI2y0v95OC9t6Zx..."
2,0keNu0t0tqsWtExGM3nT1D,"[0i481eNk6i57yEltLytwPw, 0tHJqtKyFkLhh6lrMFycp..."
3,0ancVQ9wEcHVd0RrGICTE4,"[1MnQ853cih6Ph4JTQ3ohMc, 09j3qlROInjMlTt5ZHBYv..."
4,01QoK9DA7VTeTSE3MNzp4I,"[3PhZptfWAWXh52xRX9iZJp, 0u2JidstNdCwiU1WmneFy..."


In [14]:
# Columns to keep in the trimmed export; everything else is listed for removal,
# in the frame's existing column order.
keep_cols = ("spotify_id", "name", "artist", "year", "tags")
to_drop = [name for name in music_info_df.columns if name not in keep_cols]
to_drop

['track_id',
 'spotify_preview_url',
 'genre',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'tags_joined']

In [15]:
# Remove the listed columns from music_info_df in place. Later cells rely on
# the trimmed frame, and re-running this cell without recomputing to_drop
# raises a KeyError because the columns are already gone.
music_info_df.drop(labels=to_drop, axis="columns", inplace=True)

In [16]:
# Write the trimmed table to disk without the row index.
music_info_df.to_csv(path_or_buf="Music_Info_trimmed.csv", index=False)

### Test recommendations for missing songs

In [17]:
# Show the first ten rows of the trimmed frame.
music_info_df.head(n=10)

Unnamed: 0,name,artist,spotify_id,tags,year
0,Mr. Brightside,The Killers,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",2004
1,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",2006
2,Come as You Are,Nirvana,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",1991
3,Take Me Out,Franz Ferdinand,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",2004
4,Creep,Radiohead,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",2008
5,Somebody Told Me,The Killers,0FNmIQ7u45Lhdn6RHhSLix,"rock, alternative, indie, pop, alternative_roc...",2005
6,Viva la Vida,Coldplay,08A1lZeyLMWH58DT6aYjnC,"rock, alternative, indie, pop, alternative_roc...",2013
7,Karma Police,Radiohead,01puceOqImrzSfKDAcd1Ia,"rock, alternative, indie, alternative_rock, in...",1996
8,The Scientist,Coldplay,0GSSsT9szp0rJkBrYkzy6s,"rock, alternative, indie, pop, alternative_roc...",2007
9,Clocks,Coldplay,0BCPKOYdS2jbQ8iyB56Zns,"rock, alternative, indie, pop, alternative_roc...",2002


In [19]:
%load_ext autoreload
%autoreload 2
import pipeline
query = {"spotify_id": "06UfBBDISthj1ZJAtX4xjj",   "tags":["rock", "british"]} # conditioned on tags 
recoms = pipeline._get_name_artist_pairs(pipeline.get_recommendations(input_data=query))
for recom in recoms:
    print(f"{recom} tags are: ",music_info_df[(music_info_df["artist"] == recom[1]) & (music_info_df["name"] == recom[0])]["tags"].tolist()[0])
    print()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
('Vince The Loveable Stoner', 'The Fratellis') tags are:  rock, alternative, indie, alternative_rock, indie_rock, british, britpop

("You Don't Love Me", 'The Kooks') tags are:  rock, alternative, indie, alternative_rock, indie_rock, british, britpop

('Fill My Little World', 'The Feeling') tags are:  indie, pop, british, indie_pop, britpop

('I Wanna Hold You', 'McFly') tags are:  pop, british, britpop, pop_rock

('Gone Up In Flames', 'Morning Runner') tags are:  rock, alternative, indie, britpop

('Fat Children', 'Jarvis Cocker') tags are:  rock, alternative, indie, british, britpop

('Everyday I Love You Less and Less', 'Kaiser Chiefs') tags are:  rock, alternative, indie, alternative_rock, indie_rock, british, britpop

('This House Is a Circus', 'Arctic Monkeys') tags are:  rock, alternative, indie, alternative_rock, indie_rock, british, britpop

('If You Wanna', 'The Vaccines') tags are:  rock,

### Histogram of tag frequencies

In [None]:
import plotly.express as px

# 1) one row per (track, tag): split on commas, explode, trim whitespace
tags = music_info_df['tags'].str.split(',').explode().str.strip()

# 2) occurrences per tag, most frequent first, reshaped into a two-column
#    frame with columns ['tag', 'frequency']
counts_by_tag = tags.value_counts()
tag_counts = counts_by_tag.rename_axis('tag').reset_index(name='frequency')

# sanity check: expect Index(['tag', 'frequency'], dtype='object')
print(tag_counts.columns)

# 3) bar chart of the counts; tick labels are rotated and the bottom margin
#    enlarged so that the tag names fit
axis_labels = {'tag': 'Tag', 'frequency': 'Count'}
fig = px.bar(
    tag_counts,
    x='tag',
    y='frequency',
    title='Tag Frequency Histogram',
    labels=axis_labels,
)
fig.update_layout(xaxis_tickangle=-45, margin={'t': 50, 'b': 150})
fig.show()


Index(['tag', 'frequency'], dtype='object')


In [17]:
import pickle

# Distinct tags seen in the dataset, taken from the 'tag' column of tag_counts.
tag_set = set(tag_counts["tag"])

# Serialise the set and write the bytes to all_tags.pkl.
with open("all_tags.pkl", "wb") as tag_file:
    tag_file.write(pickle.dumps(tag_set))


In [22]:
# Read the saved tag set back and display it.
# Only unpickle files this notebook produced; pickle is unsafe on untrusted data.
with open("all_tags.pkl", "rb") as tag_file:
    all_tags = pickle.loads(tag_file.read())
print(all_tags)

{'american', 'guitar', 'house', 'british', 'drum_and_bass', 'japanese', 'grunge', 'german', 'post_hardcore', 'gothic', 'country', 'pop', 'oldies', 'nu_metal', 'chillout', 'singer_songwriter', 'death_metal', 'trance', 'russian', 'avant_garde', '00s', 'idm', 'punk_rock', 'alternative', 'rock', 'hip_hop', 'synthpop', 'instrumental', 'mellow', 'black_metal', 'piano', '70s', 'noise', 'soundtrack', 'metalcore', 'ambient', 'indie_pop', 'love', 'chill', 'doom_metal', 'blues_rock', 'female_vocalists', 'cover', 'gothic_metal', 'grindcore', 'industrial', 'psychedelic', 'metal', 'rap', 'screamo', 'polish', 'electro', 'dance', '90s', 'new_age', 'experimental', '80s', 'lounge', 'post_punk', 'soul', 'emo', 'melodic_death_metal', 'trip_hop', 'progressive_metal', 'classic_rock', 'techno', 'progressive_rock', 'swedish', 'hard_rock', 'acoustic', 'electronic', 'heavy_metal', 'reggae', 'power_metal', 'post_rock', 'french', 'blues', 'pop_rock', 'folk', 'funk', 'jazz', 'punk', 'symphonic_metal', 'thrash_meta