In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Wishbone Ash,Say Goodbye,I can tell that you don't feel good \r\nI gue...
1,Michael Jackson,Say Say Say,"Say, say, say what you want but don't play gam..."
2,Bob Seger,Ramblin' Gamblin' Man,"Yeah, gonna tell my tale \r\nCome on, \r\nCo..."
3,Hollies,Baby That's All,The night turns to dawn \r\nYou're there by m...
4,Fabolous,The Get Back,[Intro:] \r\nWho wanna bet us that they can't...
5,Cliff Richard,High Class Baby,"Well you can't be my lovin' baby, you ain't go..."
6,Everclear,Local God,"You do that Romeo, \r\nBe what you want to be..."
7,Cher,Holdin' Out For Love,Had my share of disco nights \r\nDid some tal...
8,Emmylou Harris,Just Someone I Used To Know,There's a picture that I carry \r\nOne we mad...
9,Oingo Boingo,Change,"Don't you ever wonder why, nothing ever seems ..."


In [9]:
df['text'][0]

"I can tell that you don't feel good  \r\nI guess you're not alone.  \r\nNow that you packed your things  \r\nAnd you're ready to go  \r\nLet's do something tonight I don't know.  \r\nWe can take a ride across town  \r\nCome on, baby, get your clothes on.  \r\nAll night long, we'll be so high  \r\nThat's the way we'll say goodbye.  \r\nWhat's the name of the old cafe  \r\nWhere we used to go?  \r\nIt could be good to go there now  \r\nBut I don't want to be unhappy today.  \r\nWe can take a ride across town  \r\nCome on, baby, get your clothes on.  \r\nAll night long, we'll be so high  \r\nThat's the way we'll say goodbye.  \r\n\r\n"

In [10]:
# df = df.sample(5000)

In [11]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.06929294, 0.0245787 , ..., 0.06585866, 0.03682895,
       0.0122349 ])

In [18]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text


In [19]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [21]:
recommendation('Say Goodbye')

["I Don't Know How To Say Goodbye",
 'Goodbye Highway',
 'She Called Me Baby',
 "Baby Don't You Know?",
 'Men Are All The Same',
 'Another Way To Say Goodbye',
 "I'm Your Man",
 'Come On In',
 'High On You Mama',
 'This Old Road',
 'Make It Right',
 'I Wanna Come Over',
 'Slow Ride',
 "It's Just The Way",
 'All The Way',
 'You And Me And One Spotlight',
 'Change My Mind',
 "I'm Your Baby Tonight",
 "It's Alright",
 'The Boys Are Back In Town']

In [22]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))