In [1]:
import pandas as pd
df = pd.read_csv("spotify_millsongdata.csv")

In [2]:
#Checking for null values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [3]:
#Dropping the link cloumn
df = df.drop('link',axis=1).reset_index(drop=True)

Text Cleaning

In [4]:
#Translating the lyrics column(text) to lower case
df['text'] = df['text'].str.lower().replace(r'a-ZA-Z0-9',' ').replace(r'\n',' ',regex = True).replace(r'\r',' ',regex = True)

In [5]:
# Importing nltk for stemming
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def token(txt):
    tokens = nltk.word_tokenize(txt)
    stemmed_tokens = [stemmer.stem(s) for s in tokens]
    return " ".join(stemmed_tokens)

In [6]:
#Evaluating the function with an example
token("you are beautiful my beauty")

'you are beauti my beauti'

In [7]:
#Applying the function to df
df['text'].apply(lambda x :token(x))

0        look at her face , it 's a wonder face and it ...
1        take it easi with me , pleas touch me gentli l...
2        i 'll never know whi i had to go whi i had to ...
3        make somebodi happi is a question of give and ...
4        make somebodi happi is a question of give and ...
                               ...                        
57645    iri day come on play let the angel fli let the...
57646    power to the worker more power power to the wo...
57647    all you need is someth i 'll believ flashlight...
57648    northern star am i frighten where can i go to ...
57649    come in make yourself at home i 'm a bit late ...
Name: text, Length: 57650, dtype: object

In [8]:
#Vectorizing and finding Similarity using Cosine_similarity

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(analyzer = 'word', stop_words = 'english')

matrix = tfid.fit_transform(df['text'].head(30000))
array = cosine_similarity(matrix)

print(array)

[[1.         0.0014535  0.00991734 ... 0.0209551  0.05519017 0.01556152]
 [0.0014535  1.         0.00398359 ... 0.00253139 0.00882854 0.01808771]
 [0.00991734 0.00398359 1.         ... 0.07264914 0.00582049 0.01540913]
 ...
 [0.0209551  0.00253139 0.07264914 ... 1.         0.0022271  0.0089489 ]
 [0.05519017 0.00882854 0.00582049 ... 0.0022271  1.         0.06130133]
 [0.01556152 0.01808771 0.01540913 ... 0.0089489  0.06130133 1.        ]]


Recommender Functions

In [9]:
#Song Recommender function
def song_recommender(song_name):
    song_id = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(array[song_id])), reverse=True, key = lambda x:x[1])
    recommended_song = []
    for s_id in distance [1:6]:
        recommended_song.append(df.iloc[s_id[0]].song)
    return recommended_song

In [10]:
#Evaluating the Song Recommender function
song_recommender("Bang")

['Sea Of Dreams',
 'The Prime Of Your Love',
 'Bang-A-Boomerang',
 'Dum Dum',
 'Give Me A Bit']

In [11]:
#Songs from the same artist recommender
def artist_recommender(artist_name):
    artist_df = df[df['artist']== artist_name].head(5)
    songs_by_artist = artist_df['song'].tolist()
    return songs_by_artist

In [12]:
#Evaluating the Artist Recommender function
artist_recommender("Bryan White")

['A Hundred And One',
 'Between Now And Forever',
 'Everywhere I Turn',
 'How Lucky I Am',
 'I Stand All Alone']

Model Saving

In [13]:
#Saving the model
import joblib
joblib.dump(array, "similarity.joblib")
joblib.dump(df, "df.joblib")

['df.joblib']