In [22]:
# Importing Libraries
import numpy as np
import pandas as pd

In [23]:
# Imported dataset
df= pd.read_csv("spotify_millsongdata.csv")

In [24]:
df

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


In [25]:
df.shape

(57650, 4)

In [26]:
# Checking for Duplicacy
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [27]:
df= df.sample(5000).drop("link", axis=1).reset_index(drop=True)

# axis-1 means column and returns new dataframe
# reset_index(drop=True) : reset indexes and not to keep the old index

In [28]:
df.head()

Unnamed: 0,artist,song,text
0,Yngwie Malmsteen,End Of My Rope,Don't make a move \r\nDon't make a sound \r\...
1,John Mellencamp,Cheap Shot,Well the record company's goin' out of busines...
2,Jimmy Buffett,Door Number Three,"Oh I took a wrong turn, it was the right turn ..."
3,Helloween,Revelation,Now we chase the end of time \r\nAfraid what ...
4,Janis Joplin,One Night Stand (Alternate Take),When I'm on the road playin' in a town without...


In [29]:
df.text[0]

"Don't make a move  \r\nDon't make a sound  \r\nI've got my eyes trained on you  \r\nThe target is found  \r\n  \r\nYou waited too long  \r\nSoon you'll be gone  \r\nI'd rather stand on my own  \r\nAin't no more blanks left to fill  \r\nIn my list of sins  \r\nI'm at the end of my rope  \r\n  \r\nThe louder you speak  \r\nThe less I hear  \r\nI gotta let off some steam  \r\nand driven by fear  \r\n  \r\nTell me I'm wrong  \r\n'cause this can't go on  \r\nI'd rather stand on my own  \r\nAin't no more blanks left to fill  \r\nIn my list of sins  \r\nI'm at the end of my rope  \r\n  \r\nLeave me alone  \r\nThis can't go on  \r\nNot anymore, no  \r\n  \r\nYou waited too long  \r\nListen and soon you'll be gone  \r\nTell me I'm wrong  \r\n'Cause this can't go on\r\n\r\n"

Text Preprocessing

In [30]:
# df['text'].str.lower() - converts lowercase string
# replace [] : 
# df['text'].str.lower().replace(r'^\w\s' , ' ').replace(r'\n', ' ', regex= True )

In [31]:
df['text'] = (
    df['text']
      .str.lower()
      .str.replace(r'\n', ' ', regex=True)              # newlines → space
      .str.replace(r'[^\w\s]', '', regex=True)          # remove punctuation/non-word
      .str.replace(r'\s+', ' ', regex=True)             # collapse multiple spaces
      .str.strip()                                      # trim leading/trailing spaces
)
df.text

0       dont make a move dont make a sound ive got my ...
1       well the record companys goin out of business ...
2       oh i took a wrong turn it was the right turn m...
3       now we chase the end of time afraid what we wi...
4       when im on the road playin in a town without a...
                              ...                        
4995    pogues lorelei you told me tales of love and g...
4996    things have been goin wrong long enough to kno...
4997    its very clear our love is here to stay not fo...
4998    i tied my bandanatook my pack from the floor y...
4999    sleepless nights dont bother me at all if dawn...
Name: text, Length: 5000, dtype: object

TEXT VECTORIZATION

In [32]:
import nltk
from nltk.stem.porter import PorterStemmer

In [33]:
# Object
stemmer = PorterStemmer()

In [34]:
def token(txt):
    token = nltk.word_tokenize(txt)     #splits each word into token from text
    a= [stemmer.stem(w) for w in token]  #applies stem function to each token (running- run)
    return " ".join(a)                   #returns stemmed token and pass it in string again

In [35]:
token("You are Running")

'you are run'

In [36]:
# Applys tokenization on every text
df['text'].apply(lambda x: token(x))

0       dont make a move dont make a sound ive got my ...
1       well the record compani goin out of busi they ...
2       oh i took a wrong turn it wa the right turn my...
3       now we chase the end of time afraid what we wi...
4       when im on the road playin in a town without a...
                              ...                        
4995    pogu lorelei you told me tale of love and glor...
4996    thing have been goin wrong long enough to know...
4997    it veri clear our love is here to stay not for...
4998    i tie my bandanatook my pack from the floor yo...
4999    sleepless night dont bother me at all if dawn ...
Name: text, Length: 5000, dtype: object

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
tfid = TfidfVectorizer(analyzer ='word',stop_words='english')
# TfidfVectorizer: builds a matrix of TF-IDF features from text documents.
# TF-IDF = term frequency × inverse document frequency,  distinctive words get higher weight.
# analyzer='word': tokenizes on word-level (the default); each feature is a word (or n-gram if you change ngram_range).
# stop_words='english': removes common English stopwords like “the”, “and”, etc., before computing weights.

In [39]:
matrix=tfid.fit_transform(df['text'])


# fit: learns the vocabulary and IDF (inverse document frequency) weights from all strings in df['text'].

# transform: converts each document (each entry in df['text']) into a TF-IDF vector based on that learned vocabulary.

# The result, matrix, is a sparse matrix  (mostly entities are zero)

In [40]:
similar=cosine_similarity(matrix)
# it computes the pairwise cosine similarity between every pair of documents.

In [41]:
similar[0]

array([1.        , 0.02185349, 0.04141176, ..., 0.02719419, 0.00835448,
       0.03106356])

In [42]:
df[df["song"]=="Pumping Blood"].index[0]


4771

**Recommender Function**

In [43]:
def recommender(song_name):
    idx= df[df["song"]==song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True , key= lambda x:x[1])
    song= [ ]
    for s_id in distance[1:21]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [44]:
recommender("Pumping Blood")

['Blood',
 "Oh My Son (I'm Sorry)",
 'Pumping',
 'Do It Again',
 'Ruination Of The Lost',
 'Little Jack Frost, Get Lost',
 'System',
 'From A Jack To A King',
 'Back In Your Face',
 'Coo-Coo',
 'My Blood',
 'Ordinary Day',
 'Freedom Song',
 'The Six Strings That Drew Blood',
 'Insect Pins',
 'Oceans',
 'Mr. Freeze',
 '1Stp Klosr',
 'Vicarious',
 'The Longest Day']

In [45]:
import pickle

In [46]:
pickle.dump(similar , open("similarity", "wb"))

In [47]:
pickle.dump(df, open("df", "wb"))