# Data Formatting Tasks for Database Creation

In [117]:
import pandas as pd

# Source data: Spotify Million Song Dataset
# https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
DATA_PATH = "spotify_millsongdata.csv"

# Read the whole CSV into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,She's My Kind Of Girl,/a/abba/shes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [118]:
# (number of rows, number of columns) of the loaded frame
df.shape

(57650, 4)

In [119]:
# count of missing values in each column
df.isna().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [120]:
# Remove the unused `link` column.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=['link'], errors='ignore')

# Optional: work on a random subset of songs for faster iteration.
# None (the default) keeps the full dataset.
SAMPLE_SIZE = None
if SAMPLE_SIZE is not None:
    # random_state makes the subset reproducible; reset_index(drop=True) keeps
    # index labels equal to row positions, which later cells rely on when they
    # use df's index to address rows of the TF-IDF matrix.
    df = df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,artist,song,text
0,ABBA,She's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [121]:
# Normalise the lyrics in three chained steps:
#   1. lowercase everything
#   2. delete carriage-return (\r) and newline (\n) characters
#   3. delete punctuation (anything that is neither a word char nor whitespace)
# NOTE(review): line breaks are deleted, not replaced by a space, so two lines
# stay separate words only if the lyric has whitespace next to the break --
# confirm this holds for the whole dataset.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'\r|\n', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Show the first cleaned lyric as a sanity check.
df['text'].iloc[0]

'look at her face its a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand  well go on walking for hours and talking  about all the things that we plan    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do'

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Upper bound on the number of vocabulary terms kept by the vectorizer.
MAX_FEATURES = 10000

# TF-IDF over the cleaned lyrics, with English stop words removed.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')

# One matrix row per song, in the same order as the rows of df.
lyrics = df['text']
tfidf_matrix = vectorizer.fit_transform(lyrics)

# The vectors are kept in tfidf_matrix; they are not copied into a df column.

In [123]:
# preview the frame again; `text` now holds the cleaned lyrics
df.head()

Unnamed: 0,artist,song,text
0,ABBA,She's My Kind Of Girl,look at her face its a wonderful face and it ...
1,ABBA,"Andante, Andante",take it easy with me please touch me gently l...
2,ABBA,As Good As New,ill never know why i had to go why i had to p...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...


In [124]:
# Preview songs whose lyrics contain "love" (case-insensitive match).
mentions_love = df['text'].str.contains('love', case=False)
df[mentions_love].head()

Unnamed: 0,artist,song,text
2,ABBA,As Good As New,ill never know why i had to go why i had to p...
3,ABBA,Bang,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...
7,ABBA,Chiquitita,chiquitita tell me whats wrong youre enchaine...
10,ABBA,Dance,oh my love it makes me sad why did things tur...


In [125]:
# Song used as the query for the similarity search.
QUERY_ARTIST = 'Metallica'
QUERY_SONG = 'Master Of Puppets'

# Exact match on artist and title.
inp_song = df[(df['artist'] == QUERY_ARTIST) & (df['song'] == QUERY_SONG)]

# Fail here with a clear message instead of carrying an empty selection into
# the cells that follow.
if inp_song.empty:
    raise LookupError(f"No song {QUERY_SONG!r} by {QUERY_ARTIST!r} found in df")

# Index *labels* of the matching row(s).
query_index = inp_song.index

# Labels must be resolved with .loc. The previous .iloc lookup is positional
# and only returned the right row while df kept its default 0..n-1 index.
df.loc[query_index]

Unnamed: 0,artist,song,text
12818,Metallica,Master Of Puppets,end of passion play crumbling away im your so...


In [126]:
# find similar songs
from sklearn.metrics.pairwise import cosine_similarity

# Number of similar songs to show (the previous slice [1:10] returned 9 rows).
TOP_K = 9

# `query_index` holds index *labels*, while the TF-IDF matrix is addressed by
# row *position*. Translate explicitly so the lookup stays correct even when
# df does not carry the default 0..n-1 index.
query_positions = df.index.get_indexer(query_index)

query_vec = tfidf_matrix[query_positions]

# cosine_similarity returns one row per query song. Keep, for every candidate,
# its best score against any of them; with a single query row this is the same
# vector that .flatten() used to produce.
similarities = cosine_similarity(query_vec, tfidf_matrix).max(axis=0)

# Rank best-first, then remove the query song(s) explicitly. Skipping the
# first element only works if the query is guaranteed to rank first, which
# ties (e.g. duplicate lyrics) can break.
ranked = similarities.argsort()[::-1]
top_indices = ranked[~np.isin(ranked, query_positions)][:TOP_K]

df.iloc[top_indices]

Unnamed: 0,artist,song,text
12817,Metallica,Master Of Disaster,you should run away when you see me play im t...
45821,Nightwish,Wishmaster,master apprentice heartborne 7th seeker war...
30538,Dream Theater,In Presence Of Enemies Pt. 2,welcome tired pilgrim into the circle we hav...
29546,Depeche Mode,Master And Servant,theres a new game we like to play you see a ...
46024,Nirvana,Downer,portray sincerity act out of loyalty defend ...
30298,Doobie Brothers,The Master,just dont know why i keep on tryin must be a ...
29305,Def Leppard,Answer To The Master,when the night time unfolds and the memories t...
25613,Black Sabbath,Master Of Insanity,look all around cant you open your eyes voic...
15079,Oscar Hammerstein,My Lord And Master,spoken the king is pleased he is pleased with...
