In [42]:
import pandas as pd

In [43]:
df = pd.read_csv("spotify_millsongdata.csv")

In [44]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [45]:
df.shape

(57650, 4)

In [46]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [49]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [50]:
df.head(5)

Unnamed: 0,artist,song,text
0,Adele,Send My Love (To Your New Lover),"[Intro] \r\nJust the guitar? \r\nOkay, cool ..."
1,Everclear,A Beautiful Life,You seem to make the same mistakes everyday \...
2,Who,Goin' Mobile,I'm goin' home \r\nAnd when I want to go home...
3,Metallica,No Leaf Clover,And it feels right this time \r\nOn this cras...
4,Keith Urban,Where The Blacktop Ends,Gonna kick off my shoes \r\nAnd run in bare f...


In [51]:
df['text'][0]

"[Intro]  \r\nJust the guitar?  \r\nOkay, cool  \r\n  \r\n[Verse 1]  \r\nThis was all you, none of it me  \r\nYou put your hands on on my body and told me  \r\nTold me you were ready  \r\nFor the big one, for the big jump  \r\nI'd be your last love everlasting, you and me  \r\nMmm, that was what you told me  \r\n  \r\n[Pre-Chorus]  \r\nI'm giving you up  \r\nI've forgiven it all  \r\nYou set me free, oh  \r\n  \r\n[Chorus]  \r\nSend my love to your new lover  \r\nTreat her better  \r\nWe've gotta let go of all of our ghosts  \r\nWe both know we ain't kids no more  \r\nSend my love to your new lover  \r\nTreat her better  \r\nWe've gotta let go of all of our ghosts  \r\nWe both know we ain't kids no more  \r\n  \r\n[Verse 2]  \r\nI was too strong, you were trembling  \r\nYou couldn't handle the hot heat rising (rising)  \r\nMmm, baby I'm so rising  \r\nI was running, you were walking  \r\nYou couldn't keep up, you were falling down (down)  \r\nMmm, there's only one way down  \r\n  \r\n[

In [52]:
df.tail(5)

Unnamed: 0,artist,song,text
4995,America,Garden Of Peace,I look inside at the scarlet room \r\nPlaces ...
4996,W.A.S.P.,Chainsaw Charlie (Murders In The New Morgue),O.K. boy now here's your deal \r\nWill you ga...
4997,Rihanna,Happy,Just as long it makes you happy \r\nIf it mak...
4998,Electric Light Orchestra,Is It Alright,Dear Jo: \r\nI don't know where to begin so I...
4999,Vanessa Williams,Everlasting Love,When I feel you dreaming \r\nI think of sunse...


# text cleaning

In [53]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [54]:
df.tail(5)

Unnamed: 0,artist,song,text
4995,America,Garden Of Peace,i look inside at the scarlet room \r places t...
4996,W.A.S.P.,Chainsaw Charlie (Murders In The New Morgue),o.k. boy now here's your deal \r will you gam...
4997,Rihanna,Happy,just as long it makes you happy \r if it make...
4998,Electric Light Orchestra,Is It Alright,dear jo: \r i don't know where to begin so i'...
4999,Vanessa Williams,Everlasting Love,when i feel you dreaming \r i think of sunset...


In [55]:
import nltk
from nltk.stem.porter import PorterStemmer

In [56]:
stemmer=PorterStemmer()

In [57]:
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [58]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [61]:
matrix

<5000x17224 sparse matrix of type '<class 'numpy.float64'>'
	with 274763 stored elements in Compressed Sparse Row format>

In [62]:
similarity[0]

array([1.        , 0.01203163, 0.00269952, ..., 0.01290174, 0.04753666,
       0.07146171])

In [63]:

df[df['song'] == 'Harvest'].index[0]

3758

In [64]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [67]:
recommendation('Is It Alright')

['Alright',
 'Refuge',
 'Any Way You Want It',
 'Wreck Of The Hesperus',
 "I'll Be Alright",
 'Above The Clouds',
 "I'm All Right",
 'I Got A Woman',
 'Get Back',
 'Come With Me',
 'Let It Die',
 "She Don't Want Nobody Near",
 'Going To A Go-Go',
 "Baby It's Allright",
 'Cailin',
 'Here Comes The Night',
 'Lady Stardust',
 'Three Little Birds',
 'I Wanna Be With You',
 'Hold On']

In [68]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))