#### Music Recommendation System

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('spotify_millsongdata.csv')
df.head()


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.shape

(57650, 4)

In [4]:
df1 = df.drop(columns=['link'])
df1.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [5]:
df1 = df1.drop_duplicates('song')
df1.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [6]:
# Checking Null values

df1.isnull().sum()

artist    0
song      0
text      0
dtype: int64

In [7]:
import re
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")



In [8]:
# Removing \n\r and stop words
def cleaning(text):
    
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)      

In [9]:
# Making a single feature

df1['Combined_features'] = df1['artist']+" "+df1['song']+" "+df1['text']

In [10]:
df1['song'] = df1['song'].str.lower()
df1['text'] = df1['text'].str.lower()

In [11]:
df1.head()

Unnamed: 0,artist,song,text,Combined_features
0,ABBA,ahe's my kind of girl,"look at her face, it's a wonderful face \r\na...","ABBA Ahe's My Kind Of Girl Look at her face, i..."
1,ABBA,"andante, andante","take it easy with me, please \r\ntouch me gen...","ABBA Andante, Andante Take it easy with me, pl..."
2,ABBA,as good as new,i'll never know why i had to go \r\nwhy i had...,ABBA As Good As New I'll never know why I had ...
3,ABBA,bang,making somebody happy is a question of give an...,ABBA Bang Making somebody happy is a question ...
4,ABBA,bang-a-boomerang,making somebody happy is a question of give an...,ABBA Bang-A-Boomerang Making somebody happy is...


In [12]:
# It takes lot of time to get feature vector so, selecting first 25k rows
df1 = df1[0:25000]

In [13]:
# It takes times atleast 20min

df1['Combined_features'] = df1['Combined_features'].apply(func = cleaning)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# creating a Tfidvectorizer object

tfv = TfidfVectorizer(max_features = 2000)

In [16]:
tfv_matrix = tfv.fit_transform(df1["Combined_features"])

In [17]:
# Cosine similarity matrix

cosine_sim = cosine_similarity(tfv_matrix)
cosine_sim

array([[1.        , 0.        , 0.00640513, ..., 0.00447332, 0.        ,
        0.00456683],
       [0.        , 1.        , 0.00607012, ..., 0.01240764, 0.0229859 ,
        0.04835664],
       [0.00640513, 0.00607012, 1.        , ..., 0.00112061, 0.06231748,
        0.01289604],
       ...,
       [0.00447332, 0.01240764, 0.00112061, ..., 1.        , 0.00818522,
        0.18233042],
       [0.        , 0.0229859 , 0.06231748, ..., 0.00818522, 1.        ,
        0.14831737],
       [0.00456683, 0.04835664, 0.01289604, ..., 0.18233042, 0.14831737,
        1.        ]])

In [18]:
song = "Hope"
song_index = df[df.song == song].index[0]
song_index

664

In [19]:
# cosine similarity

similar_songs = list(enumerate(cosine_sim[song_index]))
similar_songs

[(0, 0.0061772033625136745),
 (1, 0.13209406304505925),
 (2, 0.0126475266040242),
 (3, 0.014718166522160046),
 (4, 0.017776184199400403),
 (5, 0.00359104542489969),
 (6, 0.0),
 (7, 0.052988465590653726),
 (8, 0.012094383921023579),
 (9, 0.006408582553679823),
 (10, 0.04997333043669604),
 (11, 0.007538554286428268),
 (12, 0.0076670082456536375),
 (13, 0.0029452781427652874),
 (14, 0.028812939528617132),
 (15, 0.004063090635082504),
 (16, 0.007295173214335348),
 (17, 0.025029649559708073),
 (18, 0.015208224310062222),
 (19, 0.0),
 (20, 0.017309819071750485),
 (21, 0.04406963239712174),
 (22, 0.002521384729083265),
 (23, 0.002492223746745387),
 (24, 0.020074074871303714),
 (25, 0.004097556820904071),
 (26, 0.02420832034280515),
 (27, 0.004667409859022804),
 (28, 0.030051501297553608),
 (29, 0.012753867561399668),
 (30, 0.03920514283710164),
 (31, 0.0008169745598410016),
 (32, 0.015524831859731098),
 (33, 0.020765395523129344),
 (34, 0.012436350708054985),
 (35, 0.06100705279037718),
 (36,

In [20]:
# sorting based on cosine similarity

similar_song_sorted = sorted(similar_songs,key = lambda x:x[1],reverse = True)
similar_song_sorted

[(664, 1.0),
 (1426, 0.9834277834436801),
 (2130, 0.9802790252166576),
 (24442, 0.9560172911057757),
 (20813, 0.7270862259473571),
 (20135, 0.5118575680403202),
 (2719, 0.5001050226596988),
 (1169, 0.4806190249447993),
 (9756, 0.45814243529484366),
 (17286, 0.4359868338385177),
 (10079, 0.43081993792291373),
 (3185, 0.4132998891197683),
 (6589, 0.40796889461353275),
 (20348, 0.39792611757348023),
 (22741, 0.38927919295409996),
 (11863, 0.38792927300569185),
 (4538, 0.38025419941423877),
 (2862, 0.3767088354149266),
 (23421, 0.3709721858144592),
 (6426, 0.36249922526610767),
 (5548, 0.36100912116310496),
 (6600, 0.3570447649644668),
 (13801, 0.3496161898822223),
 (7273, 0.3457063703626374),
 (5240, 0.342999427076873),
 (6737, 0.34225377566101356),
 (4023, 0.3336404489955534),
 (19026, 0.333470198607974),
 (1838, 0.3326978793626119),
 (3409, 0.3283072225401218),
 (6969, 0.32731901435805727),
 (21996, 0.3239039208473887),
 (8672, 0.32168634681259284),
 (7724, 0.3202645753631281),
 (18098,

In [21]:
# Recommending 10 similar songs

def recommended_songs(song_user_likes):
    song_index = df[df.song == song_user_likes].index[0]
    similar_songs = list(enumerate(cosine_sim[song_index]))
    similar_song_sorted = sorted(similar_songs, key=lambda x:x[1], reverse=True)
    
    for song in similar_song_sorted[1:11]:
        similar_songs = df[df.index == song[0]]["song"].values[0]
        print(similar_songs)

In [22]:
recommended_songs("Cool")

The Time Of Day
The World Turned Upside Down
Again I Say Rejoice
Ho, Ho, Ho, And A Bottle Of Rum
Fine And Dandy
It'll Be A Long Time
Get Them Boyz
The Swan Song
Danny's Song
Slave To The Rhythm
