In [None]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer #text->token->vector=>tokenization
import pickle

In [None]:
# Load the raw song/lyrics table from the CSV file next to the notebook.
DATA_PATH = "spotify_millsongdata.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [None]:
# (rows, columns) of the frame as currently loaded.
df.shape

(5000, 4)

In [None]:
# Count missing values per column.
df.isna().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [None]:
# Work on a random subset of at most 5000 songs to keep the dense similarity
# matrix small. The fixed seed makes the sample -- and therefore the pickled
# artifacts and the recommendations -- reproducible across kernel restarts.
# min() avoids a ValueError on files with fewer than 5000 rows, and
# errors='ignore' lets the cell be re-run after 'link' has already been dropped.
df = (
    df.sample(n=min(5000, len(df)), random_state=42)
    .drop(columns='link', errors='ignore')
    .reset_index(drop=True)
)

text cleaning/preprocessing

In [None]:
# Normalise the lyrics: lower-case, replace punctuation with spaces, then
# collapse every run of whitespace (including the "\r\n" line breaks present
# in the raw data) into a single space.
# Note: Series.replace() without regex=True only matches whole cell values, so
# the substring substitutions must go through the .str accessor with regex=True.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Porter stemmer: reduces inflected forms of a word to a common stem
# (e.g. "beauty" -> "beauti"), so that variants of the same word share one token.
stemmer=PorterStemmer()

In [None]:
def token(txt):
    """Tokenize ``txt`` with NLTK, stem every token, and return the stems
    joined by single spaces."""
    stems = []
    for word in nltk.word_tokenize(txt):
        stems.append(stemmer.stem(word))
    return " ".join(stems)



In [None]:
# Sanity check: "beauty" is stemmed to "beauti", and punctuation is split off
# into separate tokens.
token("u r beauti ,beatuy,beauty")

'u r beauti , beatuy , beauti'

In [None]:
# Stem every lyric and store the result back on the frame. Without the
# assignment the stemmed text is only displayed and then discarded, so the
# TF-IDF step below would run on the unstemmed lyrics.
df['text'] = df['text'].apply(token)
df['text']

5116     i hear a call now will answer forsak my all to...
16715    babi now i realiz all of those time i told you...
2580     you 're struttin ' into town like you 're slin...
2128     well your cd collect look shini and costli . h...
54481    i heard you were concern with my life i heard ...
                               ...                        
37489    oh , the last time that i saw you you know you...
36934    3 ring `` is thi on ? '' `` gather round my we...
56328    went to the fortun teller to have my fortun re...
24121    god bless the day i found you i want to stay a...
24951    gaston you 've been dream , just one dream nea...
Name: text, Length: 5000, dtype: object

vectorization


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# TF-IDF vectorizer over word tokens, dropping scikit-learn's built-in
# English stop-word list.
tfid = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
)

In [None]:
# Learn the vocabulary/IDF weights from the lyrics and encode each song as one
# row of a sparse TF-IDF matrix.
matrix = tfid.fit_transform(df['text'])

In [None]:
# Pairwise cosine similarity between every pair of songs (rows of `matrix`).
# These are similarities, not distances: larger values mean more similar lyrics.
similar = cosine_similarity(matrix)

In [None]:
# Similarity of the first song to every song; entry 0 is its similarity to itself.
similar[0]

array([1.        , 0.01929735, 0.00121253, ..., 0.01322721, 0.03279443,
       0.00257457])

Recommender function

This function takes a song name, looks up the index of that song in the DataFrame (the desired data point), and then uses that song's row of the similarity matrix to return the names of the songs whose lyrics are most similar to it.

In [None]:
def recommend(song_name, top_n=20):
    """Return the titles of the songs whose lyrics are most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Exact title to look up in ``df['song']``. If several rows share the
        title, the first one is used.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` song titles, ordered from most to least similar. The
        query song itself is never included.

    Raises
    ------
    IndexError
        If ``song_name`` does not occur in ``df['song']``.
    """
    matches = df.index[(df['song'] == song_name).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"song {song_name!r} not found in df['song']")
    # Relies on df having a default 0..n-1 index so that the label doubles as
    # the row position in `similar`.
    idx = matches[0]

    scores = similar[idx]
    # Descending by similarity; the stable sort keeps ties in row order.
    ranked = (-scores).argsort(kind='stable')
    # Drop the query row explicitly rather than assuming it sorts first: a
    # duplicate or tied song can otherwise push the query into its own results.
    picks = [int(i) for i in ranked if i != idx][:top_n]
    return df['song'].iloc[picks].tolist()
    

In [None]:
# Example query: songs whose lyrics are closest to "Farewell Song".
recommend("Farewell Song")

["A Winter's Tale",
 'First Train Home',
 "It's Not Unusual",
 'Houston Is Hot Tonight',
 'Up On The Roof',
 'Rubber Lucy',
 'My Love Belongs To You',
 'Killer',
 'J-Bieber Rap',
 'Heart With No Companion',
 'Prayer Of The Children',
 'Mona Lisa',
 'Freewheel Burning',
 "I've Got A Date With A Dream",
 'Singer Man',
 'Dance Ballerina Dance',
 'Nothingtown',
 'Whenever I Speak His Name',
 'John Wayne',
 'Daddy']

In [None]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("similarity", "wb") as fh:
    pickle.dump(similar, fh)

In [None]:
# Persist the song table that the similarity matrix rows refer to. The context
# manager guarantees the file is flushed and closed even if pickling fails.
with open("Df", "wb") as fh:
    pickle.dump(df, fh)