### Importing & Loading what we need

In [1]:
import pandas as pd
import nltk
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading The DataSet
data = pd.read_csv("Music Recomendation System/spotify_millsongdata.csv")

In [2]:
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
data.shape

(57650, 4)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


### Checking For Missing Values

In [5]:
data.isna().sum()

artist    0
song      0
link      0
text      0
dtype: int64

### Droping Unwanted Columns

In [6]:
# Take a sample of 5000 rows, drop 'link' column, and reset index
data = data.sample(5000).drop(columns=['link']).reset_index(drop=True)  

In [7]:
data.head()

Unnamed: 0,artist,song,text
0,Jason Mraz,Outdoors,All day I've been inside \r\nAnd I've got the...
1,Incognito,Smile,There's no song like a love song \r\n(Love so...
2,John Denver,Old Train,"Old Train, I can hear your whistle blow \r\nA..."
3,Dan Fogelberg,Lovers In A Dangerous Time,The hours grow shorter as the days go by \r\n...
4,John Martyn,Go Easy,"Looking at me, never find out what a working m..."


In [8]:
data['text'][0]  # Display the lyrics of the first song in the dataset

"All day I've been inside  \r\nAnd I've got the feeling  \r\nI'm trapped between the walls  \r\nAnd underneath the ceiling  \r\nI feel a bit off track  \r\nAnd now I'm trying to get back  \r\n  \r\nBack in the cool cool air  \r\nWhere the sun in shining  \r\nNothing's gonna stop me  \r\nIt's all in the timing  \r\nIt's finally again my turn  \r\nIt's time to return  \r\n  \r\nCause I won't stay inside  \r\nNo more no more  \r\nI cannot wait to go outdoors  \r\nMmmhmmhmmhmm  \r\nHey hey hey (Elmo)  \r\n  \r\nWell open up your door  \r\nAnd be like me  \r\nOpen up your door  \r\nAnd then breathe free  \r\nLook at all the beauty you'll feel  \r\nLove love love love  \r\n  \r\nListen to the music of the wind  \r\nAnd the birdies sing  \r\nWe're just one big family  \r\nAnd all of nature deserves to be  \r\nLoved loved loved loved loved  \r\n  \r\nSo I won't stay inside  \r\nNo more no more  \r\nIt cannot wait  \r\nI'm sure  \r\nThere's no need to run and hide  \r\nLet's go explore  \r\nIt 

### Convert text to lowercase & Replacing unwanted characters like '\n' with a space

In [9]:
data['text'] = data['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex=True)

In [10]:
data.tail(5)

Unnamed: 0,artist,song,text
4995,Backstreet Boys,PDA,intimacy's \r fresh from my dreams \r over a...
4996,Human League,Never Let Me Go,go go go \r go go go \r \r let me tell you...
4997,Eminem,It's Murda,eminem: \r \r i'll shove a gun in your gril...
4998,Lloyd Cole,Mannish Girl,i used to be content to frown \r but anything...
4999,Michael W. Smith,Angels Unaware,maybe there's a light in my soul \r maybe it ...


### Initialize the PorterStemmer for stemming

In [11]:
port_stem = PorterStemmer()

### Define a function to tokenize and stem the lyrics

In [12]:
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)  # Tokenize the text
    stemming = [port_stem.stem(w) for w in tokens]  # Stem each token
    return " ".join(stemming)  # Join the stemmed tokens into a single string

### Applying the tokenization function to the 'text' column

In [13]:
data['text'] = data['text'].apply(lambda x: tokenization(x))

### Create a TF-IDF matrix for the lyrics

In [14]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')  # Initialize TF-IDF Vectorizer with English stopwords
matrix = vectorizer.fit_transform(data['text'])  # Fit and transform the 'text' column to create the TF-IDF matrix

### Compute the cosine similarity between all songs

In [17]:
similarity = cosine_similarity(matrix)
similarity[0]  # Check similarity scores for the first song

array([1.        , 0.09601823, 0.04683935, ..., 0.04449815, 0.17035017,
       0.04702352])

In [18]:
# Check the details of the song "Crying Over You"
data[data['song'] == 'Never Let Me Go'].index[0]

4996

### Saving the Similarity Matrix and Dataframe Using pickle for later use


In [19]:
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(data,open('data.pkl','wb'))

### Define a recommendation function

In [20]:
def recommendation(song_df):
    idx = data[data['song'] == song_df].index[0]  # Find the index of the given song
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])  # Sort by similarity scores
    
    # Get the top 20 recommended songs
    song = []
    for s_id in distances[1:21]:  # Skip the first item (itself)
        song.append(data.iloc[s_id[0]].song)
        
    return song

In [21]:
recommendation('Never Let Me Go')

['Let Me Let Go',
 'Let It Down',
 'Let It Go',
 "I'd Have You Anytime",
 'Let Go',
 'Dreamland Express',
 'I Let Love In',
 'Let Her Go',
 "I Can't Let Go",
 'Get Back',
 'I Will Be There',
 "Let's Take A Ride",
 'Let Me Tell You My Mind',
 'Let Me Know',
 'Let Yourself Go',
 'Let Me Love You',
 "Don't Let Me In",
 'Light Up The Sky',
 'Never Let Her Go',
 "Let's Do Something"]