In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Halloween,Trick Or Treat,By Geof Johnson \r\n \r\nThe very best part ...
1,Bonnie Raitt,Can't Get Enough,Come on so tough \r\nWhen you think you got a...
2,Elton John,Cottonfields,When I was a little bitty baby \r\nMy Momma d...
3,Culture Club,Stormkeeper,Ooh I have love sweeter than lies \r\nGave yo...
4,Alice Cooper,Halo Of Flies,I've got the answers \r\nTo all of your quest...
5,Billie Holiday,Body And Soul,My days have grown so lonely \r\nFor you I cr...
6,Kenny Chesney,Keg In The Closet,We had a dog named Bocephus living in the fron...
7,Bread,Fly Away,Now is the day \r\nWe're on our way \r\nLeav...
8,Horrible Histories,Blue Bloody Blues,I bet you think the Stuarts were an English dy...
9,Who,Real Good Looking Boy,When I think back to the \r\nfirst time in my...


In [9]:
df['text'][0]

"By Geof Johnson  \r\n  \r\nThe very best part of Halloween  \r\nIs getting more candy than you've ever seen.  \r\nI've got my scariest costume on.  \r\nI scare myself when I'm all alone.  \r\n  \r\nSun goes down, I'm all dressed up,  \r\nI set out to try my luck.  \r\nWalkin' up and down the street,  \r\nI knock on the door and say...  \r\n  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\n  \r\nThe very first house is big and dark,...  \r\n  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\n  \r\nThe very next house is big and white,...  \r\n  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\nTrick or treat, trick or treat, Give me something good to eat.  \r\n  \r\nIf you go out Halloween night,  \r\nI'll give you a tip that'll make it all right.  \r\nSmile at everyone that you meet. 

In [10]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [11]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [12]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [13]:
print(tokenization('you are beautiful, beauty'))

you are beauti , beauti


In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.02532215, 0.01452606, ..., 0.01360091, 0.00181694,
       0.00343312])

In [18]:
matching_rows = df[df['song'] == 'Fly Away']

if not matching_rows.empty:
    idx = matching_rows.index[0]
    print(f"Index of the song: {idx}")
else:
    print("Song not found in the dataset")

Index of the song: 7


In [21]:
def recommendation(song_name):
    idx = df[df['song'] == song_name].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for s_id in distances[1:21]:
        songs.append(df.iloc[s_id[0]].song)
        
    return songs

In [22]:
recommendation('Fly Away')

["I'll Fly Away",
 "Fly Don't Fly On Me",
 'Time Flies',
 'Fly',
 'I Believe I Can Fly',
 'My! My! Time Flies!',
 'We Were Born To Fly',
 'I Believe I Can Fly',
 'Flying To My Home',
 'Ready To Fly',
 "She's Got Her Ticket",
 'Aces High',
 'Fly solo',
 'Wind Beneath My Wings',
 'All Over Again',
 'Never Got Off The Ground',
 'I Could Fly',
 'Little Bird',
 'One Day',
 'Fit To Fly']

In [23]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))