# Importing The Libraries

In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Dataset Import 

In [6]:
raw_data = pd.read_csv('D:\\VSCODE\\ML_PROJECT\\Dataset\\spotify_millsongdata.csv')


In [7]:
raw_data

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


In [8]:
# Is NUll
raw_data.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [9]:
# Taking Sample of dataset because of insufficent memory space
df = raw_data.sample(12323).drop(columns=['link'], axis= 1).reset_index(drop=True)

In [10]:
df

Unnamed: 0,artist,song,text
0,The Monkees,Magnolia Simms,By Michael Nesmith \r\nLove to me is blue-eye...
1,Kenny Loggins,Beauty And The Beast,Tale as old as time \r\nTrue as it can be \r...
2,Great Big Sea,Sally Ann,All eyes on Her from dawn till dusk \r\nHangi...
3,Marillion,This Town,The cars leave their trails of hot and cold li...
4,Culture Club,Stormkeeper,Ooh I have love sweeter than lies \r\nGave yo...
...,...,...,...
12318,Marillion,Hope For The Future,I've been feeling kind of down and loose \r\n...
12319,George Harrison,Devil's Radio,"Gossip, gossip \r\nGossip, gossip \r\n \r\n..."
12320,Nat King Cole,All For You,"When you raise your eyes and make believe, \r..."
12321,O.A.R.,Ladanday,Now Peter sat on the mountaintop \r\nNorthern...


In [11]:
df['text'][0]

"By Michael Nesmith  \r\nLove to me is blue-eyed and blonde.  \r\nOh, that's sweet Magnolia.  \r\nApple pie on the window still warm.  \r\nThat's my sweet Magnolia  \r\n  \r\nWalking under a sky that's so blue  \r\nAfter rain has fallen.  \r\nWhen she's walking so close by my side  \r\nMy troubles seem to just run and hide.  \r\n  \r\n[la-dee, da-dumb, etc.]  \r\nWell, walking under a sky that's so blue  \r\nAfter rain has fallen.  \r\nWhen she's walking so close by my side  \r\nMy troubles seem to just run and hide.  \r\n  \r\nMagnolia Simms is my little doll.  \r\nI can't live without her.  \r\nFor if she goes my world will just fall.  \r\nStay with me, Magnolia.  \r\n[Sound of record skipping]  \r\nStay with me, Magnolia.\r\n\r\n"

In [12]:
# data processing 
df['text'] = df['text'].str.lower().replace(r"^\w\s", '',regex=True).replace(r"\r\n",', ',regex=True)

In [13]:
df

Unnamed: 0,artist,song,text
0,The Monkees,Magnolia Simms,"by michael nesmith , love to me is blue-eyed ..."
1,Kenny Loggins,Beauty And The Beast,"tale as old as time , true as it can be , ba..."
2,Great Big Sea,Sally Ann,"all eyes on her from dawn till dusk , hanging..."
3,Marillion,This Town,the cars leave their trails of hot and cold li...
4,Culture Club,Stormkeeper,"ooh i have love sweeter than lies , gave you ..."
...,...,...,...
12318,Marillion,Hope For The Future,"i've been feeling kind of down and loose , li..."
12319,George Harrison,Devil's Radio,"gossip, gossip , gossip, gossip , , i hear..."
12320,Nat King Cole,All For You,"when you raise your eyes and make believe, , ..."
12321,O.A.R.,Ladanday,"now peter sat on the mountaintop , northern w..."


# Stemmer

In [14]:
# NLTK
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [15]:
# Define the tokenizer and stemmer function
def token(text):
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    stemmed_words = [stemmer.stem(item) for item in tokens]  # Apply stemming
    return " ".join(stemmed_words)  # Join the stemmed words into a single string

In [16]:
token("You are amazing and your work is amazing.")

'you are amaz and your work is amaz .'

# Lemmatizer 

In [17]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the lemmatization function
def lemmatize_sentence(sentence):
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    
    # Lemmatize each token with verb POS (part of speech) for best results with verbs
    lemmatized_words = [lemmatizer.lemmatize(word, pos="v") for word in tokens]
    
    return lemmatized_words

# TF-IDF (Term Frequency-Inverse Document Frequency) Vectorizer : Evaluate how important a word is to a document within a collection (or corpus) of documents 

In [18]:
# Stemming Vectorization
token_df = df['text'].apply(lambda x : token(x))

In [19]:
# # Lemmatize Vectorization
# lemmatize_df = df['text'].apply(lambda x : lemmatize_sentence(x))

In [20]:
token_df[0]

"by michael nesmith , love to me is blue-ey and blond . , oh , that 's sweet magnolia . , appl pie on the window still warm . , that 's my sweet magnolia , , walk under a sky that 's so blue , after rain ha fallen . , when she 's walk so close by my side , my troubl seem to just run and hide . , , [ la-de , da-dumb , etc . ] , well , walk under a sky that 's so blue , after rain ha fallen . , when she 's walk so close by my side , my troubl seem to just run and hide . , , magnolia simm is my littl doll . , i ca n't live without her . , for if she goe my world will just fall . , stay with me , magnolia . , [ sound of record skip ] , stay with me , magnolia. , ,"

In [21]:
# TF-IDF Vectorizer and cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [22]:
tf_idf = TfidfVectorizer(analyzer='word',stop_words='english')

In [23]:
# Model Trained 
matrix = tf_idf.fit_transform(token_df)

In [24]:
def batch_cosine_similarity(matrix, batch_size=1000):
    n = matrix.shape[0]
    similarities = np.zeros((n, n))
    for i in range(0, n, batch_size):
        end_i = min(i + batch_size, n)
        similarities[i:end_i] = cosine_similarity(matrix[i:end_i], matrix)
    return similarities

# Usage
similarities = batch_cosine_similarity(matrix)

In [25]:
similarities[0]

array([1.        , 0.02263898, 0.01709689, ..., 0.02799659, 0.01462707,
       0.02644462])

# Recommender Function

In [26]:
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0]
    # Sort the similarities in descending order based on similarity scores
    distance = sorted(list(enumerate(similarities[idx])), reverse=True, key=lambda x: x[1])
    
    song = []
    # Get the top 5 most similar songs, excluding the first one (which is the song itself)
    for s_id in distance[1:6]:
        song.append(df.iloc[s_id[0]].song)
    
    return song


In [27]:
recommender('I Love You')

['I Love You, Earth',
 'Love',
 "There's Nothing Better Than Love",
 'I Do Love You',
 'Who Do You Love']

In [28]:
import pickle
pickle.dump(similarities,open("similarities.pkl",'wb'))
pickle.dump(df,open("df.pkl","wb"))