## Importing Libraries

In [None]:
import pickle

import nltk
import pandas as pd
from langdetect import detect
from langdetect import DetectorFactory, LangDetectException
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the two source CSV files used by the rest of the notebook.
df = pd.read_csv('spotify_millsongdata.csv')
df1 = pd.read_csv('playlist_songs_with_lyrics.csv')

# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell was silently discarded. display() (a builtin inside
# IPython/Jupyter kernels) makes the first preview visible as well.
display(df.head())
df1.head()


Unnamed: 0,artist,song,lyrics
0,Diljit Dosanjh,"Naina (From ""Crew"")","Verse 1: Diljit Dosanjh Ho, ruthiya khuda-khud..."
1,Aashir Wajahat,Sadqay,Verse 1: NAYEL تھوڑا سا تو فاصلہ رکھ آس نہ رکھ...
2,Charly Black,Gyal You a Party Animal,"Intro Its Charly My girl, come flip it like a ..."
3,Shaan,You're My Love,"Verse 1: Lizzy, Nana I think I’ve fallen, I ..."
4,Sushin Shyam,"Illuminati - From ""Aavesham""",Lyrics not found.


In [None]:
# A bare `df.tail()` that is not the cell's last expression is never rendered;
# display() (a builtin inside IPython/Jupyter kernels) shows it explicitly.
display(df.tail())
df1.tail()

Unnamed: 0,artist,song,lyrics
249,AP Dhillon,Wo Noor,Verse 1 Gahiri jihi avaaz koi Mere dil chon ma...
250,Hasan Raheem,Wishes,"Verse 1: Hasan Raheem Too many wishes, too man..."
251,Tiësto,Pump It Louder,"Intro Ha, ha, ha Pump it Ha, ha, ha And pump i..."
252,Seedhe Maut,Naksha,"Seedhe Maut Naksha के बोल Intro हा-हा, हा-हा, ..."
253,Travis Scott,CAN'T SAY,"Chorus: Travis Scott & Don Toliver No, you c..."


In [None]:
# Report the (rows, columns) dimensions of the playlist frame.
print("Shape: ", (len(df1.index), len(df1.columns)))

Shape:  (254, 3)


In [None]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df1.isna().sum()

artist    0
song      0
lyrics    0
dtype: int64

In [None]:
# Down-sample `df` to at most 5000 rows and drop the 'link' column.
# - random_state pins the sample so Restart & Run All reproduces the same rows.
# - min(...) avoids a ValueError when the frame holds fewer than 5000 rows.
# - errors='ignore' lets the cell be re-run after 'link' was already dropped.
df = (
    df.sample(n=min(5000, len(df)), random_state=42)
      .drop(columns='link', errors='ignore')
      .reset_index(drop=True)
)


In [None]:
# Expose the playlist lyrics under 'text', the same column name `df` uses.
# rename() replaces the copy-then-drop(inplace=True) pair: it keeps the column
# position and is a no-op (instead of a KeyError) when the cell is re-run.
df1 = df1.rename(columns={'lyrics': 'text'})

In [None]:
# Preview the first five rows after the column rename.
df1.head(n=5)


Unnamed: 0,artist,song,text
0,Charly Black,Gyal You a Party Animal,"Intro Its Charly My girl, come flip it like a ..."
1,Shaan,You're My Love,"Verse 1: Lizzy, Nana I think I’ve fallen, I ..."
2,Amit Trivedi,Chokra Jawaan,"Verse Uh, Im sorry for your loss Its a body de..."
3,Yasser Desai,"Makhna - From ""Drive""","Verse 1: Royce da 59 Soon as I was born, I kne..."
4,Yashita Sharma,Gallan Goodiyaan,Main daalun taal pe bhangra tu bhi gidda paa l...


In [None]:
# langdetect is probabilistic; fixing the seed makes the language filter
# return the same rows on every run.
DetectorFactory.seed = 0


def _is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Text that langdetect cannot analyse (it raises LangDetectException when no
    usable features are found, or TypeError for non-string values such as NaN)
    is treated as not English instead of aborting the whole cell.
    """
    try:
        return detect(text) == 'en'
    except (LangDetectException, TypeError):
        return False


# Drop the placeholder rows whose lyrics could not be fetched.
df1 = df1[df1['text'] != "Lyrics not found."]

# Keep English lyrics only; astype(bool) keeps the mask boolean even when the
# frame is empty (an empty object-dtype mask is not treated as a row filter).
english_mask = df1['text'].apply(_is_english).astype(bool)
df1 = df1[english_mask].reset_index(drop=True)

# Persist the cleaned playlist for reuse outside the notebook.
df1.to_csv('cleaned_playlist_songs_with_lyrics.csv', index=False, encoding='utf-8')

df1.head()


Unnamed: 0,artist,song,text
0,Charly Black,Gyal You a Party Animal,"Intro Its Charly My girl, come flip it like a ..."
1,Shaan,You're My Love,"Verse 1: Lizzy, Nana I think I’ve fallen, I ..."
2,Amit Trivedi,Chokra Jawaan,"Verse Uh, Im sorry for your loss Its a body de..."
3,Yasser Desai,"Makhna - From ""Drive""","Verse 1: Royce da 59 Soon as I was born, I kne..."
4,Yashita Sharma,Gallan Goodiyaan,Main daalun taal pe bhangra tu bhi gidda paa l...


## Text Preprocessing

In [None]:
def _normalize_lyrics(text_col):
    """Lower-case a Series of lyrics, strip punctuation and flatten newlines.

    The previous code called ``Series.replace(r'^\\w\\s', '')`` without
    ``regex=True``, which only matches cells whose entire value equals that
    literal string, so it never changed anything. The pattern was also missing
    the character-class brackets. ``.str.replace`` with ``[^\\w\\s]`` removes
    every character that is neither a word character nor whitespace.
    Newlines become a space rather than an empty string so that the last word
    of one line is not fused with the first word of the next.
    """
    return (
        text_col.str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\n', ' ', regex=True)
    )


# Apply the same normalisation to both frames.
df['text'] = _normalize_lyrics(df['text'])

df1['text'] = _normalize_lyrics(df1['text'])

In [None]:
# Preview the first five rows after text normalisation.
df1.iloc[:5]

Unnamed: 0,artist,song,text
0,Charly Black,Gyal You a Party Animal,"intro its charly my girl, come flip it like a ..."
1,Shaan,You're My Love,"verse 1: lizzy, nana i think i’ve fallen, i ..."
2,Amit Trivedi,Chokra Jawaan,"verse uh, im sorry for your loss its a body de..."
3,Yasser Desai,"Makhna - From ""Drive""","verse 1: royce da 59 soon as i was born, i kne..."
4,Yashita Sharma,Gallan Goodiyaan,main daalun taal pe bhangra tu bhi gidda paa l...


## Tokenization and Stemming

In [None]:
# Fetch NLTK's 'punkt_tab' data package (a no-op when it is already up to date).
# NOTE(review): presumably required by nltk.word_tokenize used in token() — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\omsin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Single shared Porter stemmer instance; token() calls stemmer.stem on each word.
stemmer = PorterStemmer()

In [None]:
def token(txt):
    """Tokenize ``txt`` and return its stemmed tokens joined by single spaces.

    The text is split with ``nltk.word_tokenize`` and every resulting token is
    passed through the module-level ``stemmer``.

    Example call: ``token("you are beautiful, beauty")``
    """
    stems = map(stemmer.stem, nltk.word_tokenize(txt))
    return " ".join(stems)

In [None]:
# Store the stemmed lyrics back on the frame. Previously the result of apply()
# was only displayed and then discarded, so the TF-IDF step below was fitted
# on the unstemmed text.
df1['text'] = df1['text'].apply(token)
df1['text']

0     intro it charli my girl , come flip it like a ...
1     vers 1 : lizzi , nana i think i ’ ve fallen , ...
2     vers uh , im sorri for your loss it a bodi dea...
3     vers 1 : royc da 59 soon as i wa born , i knew...
4     main daalun taal pe bhangra tu bhi gidda paa l...
                            ...                        
58    1. playboi carti - sky 2. drake - idgaf ( feat...
59    intro : travi scott yeah 7:30 in the night , y...
60    part i intro : drake astro , yeah vers : drake...
61    intro ha , ha , ha pump it ha , ha , ha and pu...
62    choru : travi scott & don toliv no , you cant ...
Name: text, Length: 63, dtype: object

## Vectorization and Calculating Cosine Similarity

In [None]:
# Word-level TF-IDF vectorizer that ignores scikit-learn's English stop words.
tfid = TfidfVectorizer(stop_words='english', analyzer='word')

In [None]:
# Learn the vocabulary from the playlist lyrics and encode each song as a
# TF-IDF row vector (one row per row of df1).
matrix = tfid.fit_transform(df1['text'])

In [None]:
# Pairwise cosine similarity between every pair of rows in `matrix`;
# similar[i][j] is the similarity between songs i and j of df1.
similar = cosine_similarity(matrix)

In [None]:
# Similarity of the first song to every song in df1 (itself included).
similar[0, :]

array([1.        , 0.01356447, 0.04119696, 0.02611811, 0.00821515,
       0.00522939, 0.02998058, 0.03317055, 0.01386294, 0.02277433,
       0.01639878, 0.04513396, 0.02558494, 0.04624154, 0.03848879,
       0.01428135, 0.04128622, 0.02083569, 0.0201196 , 0.0390649 ,
       0.01161515, 0.01246002, 0.01117525, 0.02073865, 0.01562723,
       0.00506569, 0.03187806, 0.01007706, 0.01138477, 0.01318946,
       0.03474786, 0.00427473, 0.00602534, 0.01070103, 0.00675479,
       0.01422026, 0.04408317, 0.02153205, 0.01516914, 0.01707134,
       0.01095312, 0.06100638, 0.03373349, 0.00409023, 0.03450407,
       0.04381564, 0.02917534, 0.02185619, 0.02655975, 0.00943678,
       0.00298968, 0.01365851, 0.02558898, 0.02047273, 0.00683095,
       0.00241096, 0.01601782, 0.03188972, 0.        , 0.02060198,
       0.0266025 , 0.0072275 , 0.01098103])

## Recommender Function

In [None]:
def recommender(song_name, top_n=5):
    """Return the titles of the songs most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Exact value to look up in ``df1['song']``. When several rows match,
        the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str or str
        Song titles ordered by decreasing cosine similarity, or an error
        message string when ``song_name`` is not present in ``df1``.
    """
    # Row *positions* of the matches: `similar` is positional, so this stays
    # correct even if df1 carries a non-default index.
    positions = (df1['song'] == song_name).to_numpy().nonzero()[0]
    if len(positions) == 0:
        return f"Error: '{song_name}' not found in the dataset."

    idx = positions[0]
    ranked = sorted(enumerate(similar[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query song explicitly instead of assuming it sorts first:
    # with tied scores (duplicate lyrics) or an all-zero vector it may not be
    # at position 0, and a blind [1:6] slice could recommend the song itself.
    neighbours = [pos for pos, _ in ranked if pos != idx][:top_n]

    return df1['song'].iloc[neighbours].tolist()

## Pickle Dump for our Application


In [None]:
# Persist the artefacts for the application.
# `similar` is computed from df1, so its rows/columns line up with df1's rows;
# the matching frame to save next to it is df1, not the down-sampled `df`.
# The `with` blocks make sure each file handle is flushed and closed.
with open('similarityplay.pkl', 'wb') as fh:
    pickle.dump(similar, fh)
with open('dfplay.pkl', 'wb') as fh:
    pickle.dump(df1, fh)