# Data Formatting Tasks for Database Creation

In [120]:
import pandas as pd

# Spotify Million Song Dataset - https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
df = pd.read_csv("spotify_millsongdata.csv")

df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [121]:
df.shape

(57650, 4)

In [122]:
df.isna().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [123]:
df.drop(columns=['link'], inplace=True)

df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [124]:
# clean the text

df['text'] = df['text'].str.lower()
# remove carriage return (\r) and newline (\n) characters
df['text'] = df['text'].replace(r'\r|\n', '', regex=True)
# remove punctuation
df['text'] = df['text'].replace(r'[^\w\s]', '', regex=True) 

df['text'][0]

'look at her face its a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand  well go on walking for hours and talking  about all the things that we plan    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do'

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = vectorizer.fit_transform(df['text'])

df['tfidf'] = [vec.toarray().flatten() for vec in tfidf_matrix]

In [126]:
df.head()

Unnamed: 0,artist,song,text,tfidf
0,ABBA,Ahe's My Kind Of Girl,look at her face its a wonderful face and it ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ABBA,"Andante, Andante",take it easy with me please touch me gently l...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ABBA,As Good As New,ill never know why i had to go why i had to p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,ABBA,Bang,making somebody happy is a question of give an...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ABBA,Bang-A-Boomerang,making somebody happy is a question of give an...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
