In [1]:
# Importing python library
import pandas as pd
import numpy as np

In [2]:
# Importing Spotify dataset
df = pd.read_csv("../datasets/spotify_millsongdata.csv")

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df = df.sample(5000)

In [5]:
# Printing the Rows and Column
df.shape

(5000, 4)

In [6]:
# Printing all the null Value in each column
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
# Drop the link column from dataset
df = df.drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,artist,song,text
0,Lucky Dube,Man In The City,"Standing alone, in the middle of the city \r\..."
1,Lana Del Rey,Jump,Palm trees in black and white \r\nLast thing ...
2,Justin Bieber,Only Thing I Ever Get For Christmas,If you're the only thing I ever get for Christ...
3,Tim McGraw,I Know How To Love You Well,It's been a long time \r\nSince we walked dow...
4,Norah Jones,Flipside,"[Verse 1] \r\nI tried to get high, but you wa..."


In [9]:
# Printing Lyrics of first song
df['text'][0]

"Standing alone, in the middle of the city  \r\nMan looks around him  \r\nHe hardly recognizes what he sees  \r\nCause he' s a man from the past  \r\nTwo thousand years ago  \r\nThis was his home  \r\nRivers used to run here  \r\nBirds used to fly around here  \r\nBut now it is a different jungle  \r\nA concrete jungle  \r\n  \r\n[Chorus:]  \r\nHe' s a lonely man  \r\nIn the middle of the city  \r\nMissing home so badly  \r\n  \r\nMan' s own creation  \r\nHas become his worst enemy  \r\nWeapons of destruction  \r\nHave brought about  \r\nHuman extinction...wo...ho  \r\nTwo thousand years ago  \r\nThis was his home  \r\nBut now it is  \r\nA different jungle  \r\nA concrete jungle  \r\n  \r\n[Chorus:]  \r\nHe' s a lonely man  \r\nIn the middle of the city  \r\nMissing home so badly\r\n\r\n"

Data Preprocessing / Data Cleaning 

In [10]:
# df['text'].str.lower().replace(r'^a-ZA-Z0-9', '')
# df['text'] = df['text'].str.lower().replace(r'[^\w\s]', ' ').replace(r'\n', ' ', regex = True)
# df['text'] = df['text'].str.lower().replace(r'[^\w\s\n]|\\n', ' ', regex=True)

In [11]:
df['text'] = (
    df['text']
    .str.lower()
    .replace(r'[^\w\s]', ' ', regex=True)  # Remove non-alphanumeric characters except spaces
    .replace(r'\\[rnt]', ' ', regex=True) # Remove escape sequences like \r, \n, \t
    .replace(r'\s+', ' ', regex=True)     # Replace multiple spaces with a single space
    .str.strip()                          # Remove leading and trailing spaces
)

In [12]:
df['text']

0       standing alone in the middle of the city man l...
1       palm trees in black and white last thing i saw...
2       if you re the only thing i ever get for christ...
3       it s been a long time since we walked down tha...
4       verse 1 i tried to get high but you wanted me ...
                              ...                        
4995    your the devil in me that i brought in from th...
4996    the wind was howling dogs were sleeping i had ...
4997    ease up a little bit cause i want to want you ...
4998    here we come walkin down the street we get the...
4999    i was seventeen ou were working for matthew an...
Name: text, Length: 5000, dtype: object

In [13]:
!pip install nltk



In [14]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def token(text):
    token = nltk.word_tokenize(text)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

token("you are beautiful, beauty")

'you are beauti , beauti'

In [15]:
df['text'].apply(lambda x : token(x))

0       stand alon in the middl of the citi man look a...
1       palm tree in black and white last thing i saw ...
2       if you re the onli thing i ever get for christ...
3       it s been a long time sinc we walk down that a...
4       vers 1 i tri to get high but you want me low g...
                              ...                        
4995    your the devil in me that i brought in from th...
4996    the wind wa howl dog were sleep i had to bite ...
4997    eas up a littl bit caus i want to want you if ...
4998    here we come walkin down the street we get the...
4999    i wa seventeen ou were work for matthew and so...
Name: text, Length: 5000, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
tfid = TfidfVectorizer(analyzer='word', stop_words = 'english')

In [18]:
matrix = tfid.fit_transform(df['text'])

In [19]:
matrix.shape

(5000, 23038)

In [20]:
similarity = cosine_similarity(matrix)

In [21]:
similarity[0]

array([1.00000000e+00, 8.01874156e-03, 9.26375763e-04, ...,
       6.63447805e-03, 6.85122634e-03, 8.40570381e-03])

In [22]:
df.tail()

Unnamed: 0,artist,song,text
4995,Oasis,Setting Sun,your the devil in me that i brought in from th...
4996,Wishbone Ash,Kicks On The Street,the wind was howling dogs were sleeping i had ...
4997,Unwritten Law,The Celebration Song,ease up a little bit cause i want to want you ...
4998,The Monkees,Theme From The Monkees,here we come walkin down the street we get the...
4999,Cat Stevens,To Be A Star,i was seventeen ou were working for matthew an...


Recommendar Function

In [23]:
def recommendation(song_name):
    idx = df[df['song'] == song_name].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:11]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [25]:
recommendation("Setting Sun")

['Keep It Coming',
 'Unglued',
 'Believe (Club 69 Phunk Dub)',
 'Quiet Desperation',
 "I Can't Go On That Way",
 "Let's Go Up",
 "Don't Wait For Me",
 'Heaven',
 "Long Song Comin'",
 "One Time Comin'"]

In [26]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))