In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('spotify_millsongdata.csv')
df.head()


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [4]:
print("Shape: " , df.shape)

Shape:  (57650, 4)


In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,artist,song,text
0,Journey,Out Of Harms Way,He grew up in some forgotten midwest town \r\...
1,Linda Ronstadt,I Keep It Hid,Deep down inside \r\nI know I still love him ...
2,Chris Rea,Sweet Summer Day,I want to see the blue sky \r\nI want to feel...
3,Beach Boys,Be True To Your School,When some loud braggart tries to put me down ...
4,Beautiful South,What You See Is What You Get,(The Dramatics) \r\n \r\nSome people are mad...


## Text Preprocessing

In [26]:
df['text']=df['text'].str.lower().replace(r'^\w\s', '').replace(r'\n', '',regex=True)

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\omsin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
stemmer = PorterStemmer()

In [11]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(word) for word in token]
    return " ".join(a)

# # example
# token("you are beautiful, beauty")

In [12]:
df['text'].apply(lambda x: token(x))

0       he grew up in some forgotten midwest town hi m...
1       deep down insid i know i still love him but he...
2       i want to see the blue sky i want to feel the ...
3       when some loud braggart tri to put me down and...
4       ( the dramat ) some peopl are made of plastic ...
                              ...                        
4995    heal myself , a feather on my heart look insid...
4996    it 's over your head and you do n't seem to un...
4997    rememb when never need each other the best of ...
4998    oh , he 's a rebel and you may not like hi loo...
4999    how are you ? you can start anytim got an appo...
Name: text, Length: 5000, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [15]:
matrix = tfid.fit_transform(df['text'])

In [16]:
similar = cosine_similarity(matrix)

In [17]:
similar[0]

array([1.        , 0.04414251, 0.03158656, ..., 0.01161729, 0.02037119,
       0.00510007], shape=(5000,))

## Recommender Function

In [21]:
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), key=lambda x: x[1], reverse=True)
    song=[]
    for s_id in distance[1:6]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [22]:
recommender('Out Of Harms Way')

['Stranger In A Strange Land',
 "Love Ain't No Stranger",
 'Brave New World',
 '2nd Thought',
 'Coping']

In [23]:
import pickle

In [25]:
pickle.dump(similar, open('similarity', 'wb'))
pickle.dump(df, open('df', 'wb'))