In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [37]:
df = pd.read_csv('Spotify_history.csv', encoding='latin-1')
df.head()

Unnamed: 0,Column 1,Timestamp,date,time,artist_name,track_name,album_name,ms_played,platform,shuffle,...,reason_end,hour,day_of_week,hour_day,seconds_played,popularity,release_year,popularity_bin,listening_time,weekday
0,0,07-08-2013 02:44,07-08-2013,02:44:00,The Mowgli's,"Say It, Just Say It",Waiting For The Dawn,3185,web player,False,...,clickrow,2,2,02:44,3.185,52,2021,51-75,3.185,1
1,1,07-08-2013 02:45,07-08-2013,02:45:00,Calvin Harris,Drinking from the Bottle (feat. Tinie Tempah),18 Months,61865,web player,False,...,clickrow,2,2,02:45,61.865,93,2022,76-100,61.865,1
2,2,07-08-2013 02:50,07-08-2013,02:50:00,Lana Del Rey,Born To Die,Born To Die - The Paradise Edition,285386,web player,False,...,unknown,2,2,02:50,285.386,15,2022,0-25,285.386,1
3,3,07-08-2013 02:52,07-08-2013,02:52:00,Lana Del Rey,Off To The Races,Born To Die - The Paradise Edition,134022,web player,False,...,clickrow,2,2,02:52,134.022,72,2011,51-75,134.022,1
4,4,07-08-2013 03:17,07-08-2013,03:17:00,Empire Of The Sun,Half Mast,Walking On A Dream,0,web player,False,...,nextbtn,3,2,03:17,0.0,61,2005,51-75,0.0,1


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147866 entries, 0 to 147865
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Column 1        147866 non-null  int64  
 1   Timestamp       147866 non-null  object 
 2   date            147866 non-null  object 
 3   time            147866 non-null  object 
 4   artist_name     147866 non-null  object 
 5   track_name      147866 non-null  object 
 6   album_name      147866 non-null  object 
 7   ms_played       147866 non-null  int64  
 8   platform        147866 non-null  object 
 9   shuffle         147866 non-null  bool   
 10  skipped         147866 non-null  int64  
 11  reason_start    147866 non-null  object 
 12  reason_end      147866 non-null  object 
 13  hour            147866 non-null  int64  
 14  day_of_week     147866 non-null  int64  
 15  hour_day        147866 non-null  object 
 16  seconds_played  147866 non-null  float64
 17  popularity

In [39]:
df.shape

(147866, 22)

In [40]:
df.drop(columns= 'Column 1', inplace=True)

In [41]:
df.head(5)

Unnamed: 0,Timestamp,date,time,artist_name,track_name,album_name,ms_played,platform,shuffle,skipped,...,reason_end,hour,day_of_week,hour_day,seconds_played,popularity,release_year,popularity_bin,listening_time,weekday
0,07-08-2013 02:44,07-08-2013,02:44:00,The Mowgli's,"Say It, Just Say It",Waiting For The Dawn,3185,web player,False,0,...,clickrow,2,2,02:44,3.185,52,2021,51-75,3.185,1
1,07-08-2013 02:45,07-08-2013,02:45:00,Calvin Harris,Drinking from the Bottle (feat. Tinie Tempah),18 Months,61865,web player,False,0,...,clickrow,2,2,02:45,61.865,93,2022,76-100,61.865,1
2,07-08-2013 02:50,07-08-2013,02:50:00,Lana Del Rey,Born To Die,Born To Die - The Paradise Edition,285386,web player,False,0,...,unknown,2,2,02:50,285.386,15,2022,0-25,285.386,1
3,07-08-2013 02:52,07-08-2013,02:52:00,Lana Del Rey,Off To The Races,Born To Die - The Paradise Edition,134022,web player,False,0,...,clickrow,2,2,02:52,134.022,72,2011,51-75,134.022,1
4,07-08-2013 03:17,07-08-2013,03:17:00,Empire Of The Sun,Half Mast,Walking On A Dream,0,web player,False,0,...,nextbtn,3,2,03:17,0.0,61,2005,51-75,0.0,1


In [42]:
meaningful_listens_df = df[df['ms_played'] >= 60000].copy()

song_features = meaningful_listens_df.groupby(['artist_name', 'track_name']).agg(
    avg_ms_played=('ms_played', 'mean'),
    listen_count=('ms_played', 'size')
).reset_index()

track_metadata = df[['artist_name', 'track_name', 'popularity', 'release_year']].drop_duplicates(
    subset=['artist_name', 'track_name'], keep='last')


In [43]:
model_df = pd.merge(song_features, track_metadata, on=['artist_name', 'track_name'], how='left')
model_df.dropna(inplace=True)
model_df.reset_index(drop=True, inplace=True)
model_df.head()

Unnamed: 0,artist_name,track_name,avg_ms_played,listen_count,popularity,release_year
0,"""Weird Al"" Yankovic",Bob,65300.0,1,41,2013
1,*NSYNC,Bye Bye Bye - From Deadpool and Wolverine Soun...,200389.0,1,66,2008
2,*NSYNC,"Merry Christmas, Happy Holidays",255306.0,1,41,2016
3,.Sinh,Coastal Walks,82894.0,1,26,2021
4,070 Shake,Guilty Conscience - Tame Impala Remix,214971.0,2,40,2019


In [44]:
FEATURE_COLS = ['avg_ms_played', 'popularity', 'release_year', 'listen_count']
X = model_df[FEATURE_COLS]

In [45]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

### **Spotify Recommendations by `track_name`**

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(X_scaled)
indices = pd.Series(model_df.index, index=model_df['track_name']).drop_duplicates()

In [47]:
def get_recommendations(title, N=10, data = model_df, cosine_sim=cosine_sim, indices=indices):
    if title not in indices.index:
        print(f"❌ Error: Song '{title}' not found in the meaningful listen history.")
        return pd.DataFrame()

    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:N+1]
    song_indices = [i[0] for i in sim_scores]

    recommendations = data.iloc[song_indices].copy()
    recommendations['Similarity_Score'] = [i[1] for i in sim_scores]

    return recommendations[['artist_name', 'track_name', 'Similarity_Score', 'popularity', 'release_year']]

In [48]:
new_recommendations = get_recommendations('Merry Christmas, Happy Holidays', N=10)
new_recommendations = new_recommendations.sort_values(by='popularity', ascending=False) 
new_recommendations

Unnamed: 0,artist_name,track_name,Similarity_Score,popularity,release_year
7430,No Te Va Gustar,Chau,0.999908,58,2023
11024,The War On Drugs,Red Eyes,0.999897,52,2020
2186,DJ Kelvin El Sacamostro,Candy Perreo,0.999934,47,2018
3613,Gioachino Rossini,Il barbiere di Siviglia (The Barber of Seville...,1.0,46,2018
928,Bee Gees,You Should Be Dancing,0.999948,43,2017
5582,Kevin Johansen,Modern Love,0.999937,40,2016
6856,Michael BublÃ©,Have Yourself a Merry Little Christmas,0.999901,39,2015
7056,Morat,CuÃ¡nto Me Duele,0.999899,39,2015
9208,SofÃ­a Reyes,Mal de Amores,0.999933,29,2011
9027,Sheppard,Let Me Down Easy,0.999926,16,2006


### **Spotify Recommendations by `artist_name` & `track_name`**

In [49]:
model_df['combined_key'] = model_df['artist_name'] + ' - ' + model_df['track_name']
indices = pd.Series(model_df.index, index=model_df['combined_key']).drop_duplicates()

In [50]:
def get_recommendations_combined(artist, title, N=10, data=model_df, cosine_sim=cosine_sim, indices=indices):
    search_key = artist + ' - ' + title
    if search_key not in indices.index:
        print(f"❌ Error: Song '{title}' by '{artist}' not found in the meaningful listen history.")
        return pd.DataFrame()

    idx = indices[search_key]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:N+1]
    song_indices = [i[0] for i in sim_scores]

    recommendations = data.iloc[song_indices].copy()
    recommendations['Similarity_Score'] = [i[1] for i in sim_scores]

    return recommendations[['artist_name', 'track_name', 'Similarity_Score', 'popularity', 'release_year']]


In [52]:
recommendations_df = get_recommendations_combined('Lana Del Rey', 'Born To Die', N=5)
recommendations_df = recommendations_df.sort_values(by = 'popularity', ascending = False)
recommendations_df

Unnamed: 0,artist_name,track_name,Similarity_Score,popularity,release_year
5394,Kanye West,Breathe In Breathe Out,0.999894,79,2011
6799,Maverick City Music,Kingdom (feat. Naomi Raine & Chandler Moore),0.999904,71,2010
9353,TEEKS,Never Be Apart,0.999912,59,2008
8458,Ramin Djawadi,Khaleesi,0.999903,52,2007
5763,La Arrolladora Banda El LimÃ³n De Rene Camacho,Llamada De Mi Ex,0.999898,52,2007
