In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [34]:
# Read the full song table, then keep a fixed-seed sample of 10,000 rows
# with a fresh 0-based index; the preview below shows the first five rows.
data = pd.read_csv('spotify_millsongdata.csv')
small_data = (
    data
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)
print(small_data.head())

         artist                       song  \
0  Wishbone Ash             Right Or Wrong   
1     Aerosmith  This Little Light Of Mine   
2  Fall Out Boy               Dance, Dance   
3  Janis Joplin                 Easy Rider   
4   Moody Blues                  Peak Hour   

                                                link  \
0       /w/wishbone+ash/right+or+wrong_20147150.html   
1  /a/aerosmith/this+little+light+of+mine_2064448...   
2          /f/fall+out+boy/dance+dance_10113666.html   
3           /j/janis+joplin/easy+rider_10147381.html   
4             /m/moody+blues/peak+hour_20291295.html   

                                                text  
0  Like to have you 'round  \r\nWith all the lies...  
1  This Little Light of Mine (Light of Mine),  \r...  
2  She says she's no good with words but I'm wors...  
3  Hey mama, mama, come a look at sister,  \r\nSh...  
4  I see it all through my window it seems.  \r\n...  


In [35]:
# Encode the 'text' column of every sampled song as a TF-IDF row vector,
# using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(small_data.loc[:, 'text'])
# One row per song, one column per vocabulary term.
print(tfidf_matrix.shape)

(10000, 34172)


In [36]:
# Dot product between every pair of song vectors; with the second argument
# omitted, linear_kernel compares the matrix against itself.
# NOTE(review): the result is an n_songs x n_songs array, and none of the
# cells visible here read `cosine_sim` afterwards (get_recommendations
# recomputes one row per query) — confirm it is still needed.
cosine_sim = linear_kernel(tfidf_matrix)
print(cosine_sim.shape)

(10000, 10000)


In [37]:
def get_recommendations(song_title, tfidf_matrix, small_data, top_n=5):
    """Return the songs whose feature vectors are closest to ``song_title``.

    Similarity is the ``linear_kernel`` (dot product) between the query
    song's row of ``tfidf_matrix`` and every row of that matrix.

    Parameters
    ----------
    song_title : str
        Exact value to look up in ``small_data['song']``. If several rows
        share the title, the first one (by position) is used as the query.
    tfidf_matrix : 2-D matrix
        Feature matrix whose i-th row corresponds to the i-th row (by
        position, not by index label) of ``small_data``.
    small_data : pandas.DataFrame
        Frame with a ``'song'`` column, row-aligned with ``tfidf_matrix``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The ``'song'`` values of the ``top_n`` highest-scoring rows, best
        match first, never including the query row itself. When
        ``song_title`` is absent, a "not found" message string is returned
        instead (kept as-is for backward compatibility with existing callers).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Look the song up by *position*: tfidf_matrix rows and .iloc are both
    # positional, so an index label would be wrong for any frame that does
    # not carry a default 0..n-1 index.
    matches = (small_data['song'] == song_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return f"'{song_title}' not found in the dataset."
    idx = int(matches[0])

    scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix)[0]

    # sorted() is stable, so rows with equal scores keep their original order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Remove the query row explicitly rather than assuming it is ranked first:
    # an earlier row with identical features (or an all-zero query vector)
    # can outrank it, which would otherwise recommend the song to itself.
    song_indices = [pos for pos in ranked if pos != idx][:top_n]
    return small_data['song'].iloc[song_indices]

In [39]:
# Example query: look up the songs most similar to 'Right Or Wrong'.
recommended_songs = get_recommendations(
    song_title='Right Or Wrong',
    tfidf_matrix=tfidf_matrix,
    small_data=small_data,
)
print(recommended_songs)

6008         Wrong Right Wrong
4374            Nobody's Wrong
3708    I'm A Fool To Want You
8573     Where Did We Go Wrong
5037                 Wrong Way
Name: song, dtype: object
