In [1]:
import pandas as pd

In [13]:
dataset = pd.read_csv("spotify_millsongdata.csv")

In [14]:
dataset.head(10)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \r\nI'm sitting all ...


In [15]:
dataset.shape

(57650, 4)

In [16]:
dataset.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [27]:
dataset = dataset.drop('link', axis=1).reset_index(drop=True)

In [28]:
dataset['text'][0]

"I heard the stomping of feet dancing  \r\nOn the wooden floor upstairs  \r\nI wasn't in the mood for laughing  \r\nSo I sat silent in my chair  \r\nThere was someone missing I knew  \r\nOutside there fell the rain  \r\nWhere had she gone, what could I do  \r\nI played the waiting game  \r\nThe cigarette smoke was annoying  \r\nMy mood was fit for a bath  \r\nA drink couldn't oil my expression  \r\nNothing could've made me laugh  \r\nI was worried out of my head  \r\nI was in such a state  \r\nWhat's keeping her, where has she gone  \r\nI played the waiting game  \r\nWhen you love someone  \r\nYou worry when they're late  \r\nWhen you trust someone  \r\nYou know the time it takes  \r\nTo play the waiting game  \r\nThe music got louder and louder  \r\nFrom the wooden floor upstairs  \r\nI played with a handful of peanuts  \r\nThen I saw her standing there  \r\nMy mood leapt right out of the bath  \r\nShe had got stuck in the rain  \r\nHer coat dripped on a hanger  \r\nPlaying the waitin

In [32]:
dataset = dataset.sample(5000)

In [33]:
dataset.shape

(5000, 3)

In [34]:
dataset

Unnamed: 0,artist,song,text
1326,Louis Armstrong,Do You Know What Means To Miss New Orleans,Do you know what it means to miss New Orleans ...
2266,NOFX,Lazy,I wish I could go \r\nOff far away \r\nWhere...
2712,Cyndi Lauper,Hymn To Love,If the sky should fall into the sea \r\nAnd t...
3961,Guns N' Roses,Heartbreak Hotel,"Well, since my baby left me, \r\nI found a ne..."
2539,Otis Redding,Hey Hey Baby,"Hey, hey pretty baby \r\nBaby, you sure is fi..."
...,...,...,...
3953,Westlife,Have You Ever,[Chorus] \r\nHave you ever loved somebody so ...
2573,W.A.S.P.,Chainsaw Charlie (Murders In The New Morgue),O.K. boy now here's your deal \r\nWill you ga...
4418,Black Sabbath,Anno Mundi,"Can you see me, are you near me? \r\nCan you ..."
591,Hillsong,Better Than Life,Better than the riches of this world \r\nBett...


Text Cleaning

In [35]:
dataset['text'] = dataset['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [46]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [48]:
dataset['text'].head(10)

1326    do you know what it means to miss new orleans ...
2266    i wish i could go  \r off far away  \r where t...
2712    if the sky should fall into the sea  \r and th...
3961    well, since my baby left me,  \r i found a new...
2539    hey, hey pretty baby  \r baby, you sure is fin...
2743    you look at me and see the girl  \r who lives ...
145     rising with the sun, the work has been done  \...
3904    remember the days of the old schoolyard  \r   ...
2578    hi hello wake from thy sleep  \r god has given...
1086    i met a little girl sitting in the front  \r p...
Name: text, dtype: object

In [49]:
dataset['text'] = dataset['text'].apply(lambda x: tokenization(x))

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')

matrix = tfidvector.fit_transform(dataset['text'])

similarity = cosine_similarity(matrix)

In [52]:
similarity[0]

array([1.        , 0.01378021, 0.0404279 , ..., 0.03803842, 0.05677566,
       0.01162198])

In [53]:
dataset[dataset['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
4782,UB40,Crying Over You,cri over you in the morn cri over you in the e...


In [54]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [55]:
recommendation('Crying Over You')

['Bang A Drum',
 'Roxie',
 'Freaky In The Club',
 'I Feel It',
 'Horseshoes and Handgrenades',
 "Ain't No Cure For Love (Crush Demo)",
 'Green Is The Colour',
 "Let's Keep It Between Us",
 'Shirtsleeves',
 'Getting Over You',
 'Something Beautiful',
 'The Trouble With Lovers',
 'Reaching For You',
 'Good Morning',
 'Kaddish',
 'Love Song',
 'Break The Spell',
 'Astonishing Panorama Of The Endtimes',
 'Same Direction',
 'Verge']

In [56]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(dataset,open('dataset.pkl','wb'))