In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [10]:
df.head(10)

Unnamed: 0,artist,song,text
0,Europe,Rainbow Warrior,"Lock the door, thin the light \r\nHere's the ..."
1,New Order,The Village,When a new life turns towards you \r\nAnd the...
2,Ian Hunter,Still The Same,They say that some people never grow up \r\nL...
3,John Denver,Darcy Farrow,Where the walker runs down to the Carson Valle...
4,America,It's Beginning To Look A Lot Like Christmas,It's beginning to look a lot like Christmas \...
5,Backstreet Boys,Crawling Back To You,Everybody knows that I was such a fool to ever...
6,Guns N' Roses,Live And Let Die,When you were young \r\nAnd your heart was an...
7,Opeth,Harvest,Stay with me a while \r\nRise above the vile ...
8,Jennifer Lopez,Love Don't Cost A Thing,You think you gotta keep me iced \r\nYou don'...
9,Conway Twitty,Born To Sing The Blues,"Born to sing the blues \r\nSo tired an' worn,..."


In [11]:
df['text'][0]

"Lock the door, thin the light  \r\nHere's the tale for you tonight  \r\nLet me take you away  \r\nWell, take my hand and follow me  \r\nMother Mary's child to be  \r\nWell, I got something to say  \r\nRainbow warrior  \r\nShine on  \r\nRainbow warrior  \r\nSign your will across the sky  \r\nFollow me 'til the day you die  \r\nOne for all and all for one  \r\nWalking through the gates of dawn  \r\nWe've been waiting so long  \r\nChildren sent from near and far  \r\nCome and see their newborn star  \r\nSee it shining so strong\r\n\r\n"

In [12]:
df.shape

(5000, 3)

In [None]:
# Text Cleaning and preprocessing


In [13]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [14]:
df['text'][0]

"lock the door, thin the light  \r here's the tale for you tonight  \r let me take you away  \r well, take my hand and follow me  \r mother mary's child to be  \r well, i got something to say  \r rainbow warrior  \r shine on  \r rainbow warrior  \r sign your will across the sky  \r follow me 'til the day you die  \r one for all and all for one  \r walking through the gates of dawn  \r we've been waiting so long  \r children sent from near and far  \r come and see their newborn star  \r see it shining so strong\r \r "

In [15]:
import nltk
# nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    try:
        tokens = nltk.word_tokenize(txt)
        stemming = [stemmer.stem(w) for w in tokens]
        return " ".join(stemming)
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        return None


In [16]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [19]:
similarity[0]

array([1.        , 0.02796478, 0.01899238, ..., 0.        , 0.00718307,
       0.01203696])

In [20]:
df[df['song'] == 'Green Shirt']

Unnamed: 0,artist,song,text
4469,Elvis Costello,Green Shirt,there 's a smart young woman on a light blue s...


In [21]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs


In [22]:
recommendation('Green Shirt')

['Gonna Be Somebody',
 'Journey 2 The Center Of Your Heart',
 'Somebody Out There',
 'Clock Strikes Ten',
 'I Wanna Dance With Somebody',
 'Somebody Like That',
 'Somebody Loves You',
 "Ain't Gonna Look The Other Way",
 'Way Down Here',
 'Born To Be Somebody',
 "Tomorrow's Gonna Be A Brighter Day",
 'Drive',
 "When You're Gone",
 'Lover Please',
 'Stand By My Woman',
 'Somebody Someone',
 'Mean Sleep',
 'Peace In Our Time',
 'Please',
 'Try And Love Again']

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))