# Training The Model For Music Recommender System

---



---



In [1]:
import os
import re

import pandas as pd

# Read the data from the Spotify Millsong dataset.
# The CSV location can be overridden through the SPOTIFY_MILLSONG_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original hard-coded path is used.
SONGS_CSV_PATH = os.environ.get(
    "SPOTIFY_MILLSONG_CSV",
    r"D:\Studies\Semester 7\AI\AI\spotify_millsongdata.csv",
)
songs_dataset = pd.read_csv(SONGS_CSV_PATH)


In [2]:
# The 'link' column is not used by the recommender: leave it out and renumber
# the rows from 0.
new_songs_dataset = songs_dataset.drop(columns=['link']).reset_index(drop=True)


In [3]:
# Normalize the lyrics: lower-case them, turn every character that is not a
# lowercase letter, a digit or whitespace into a space, then collapse each run
# of whitespace into a single space and trim both ends.
# The '\s+' pass already covers '\n' and '\r', so they need no pass of their own.
new_songs_dataset['text'] = (
    new_songs_dataset['text']
    .str.lower()
    .replace(r'[^a-z0-9\s]', ' ', regex=True)
    .replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [4]:
import nltk
from nltk.stem.porter import PorterStemmer

# Download the 'punkt' tokenizer data used by nltk.word_tokenize (the download
# is skipped when the package is already up to date).
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')
# Single Porter stemmer instance shared by every tokenization() call.
stemmer = PorterStemmer()

def tokenization(txt):
    """Split ``txt`` into NLTK word tokens and stem each one.

    Returns the Porter-stemmed tokens joined back together with single spaces.
    """
    return " ".join(stemmer.stem(word) for word in nltk.word_tokenize(txt))

# Replace each song's lyrics with their tokenized and stemmed form.
new_songs_dataset['text'] = new_songs_dataset['text'].apply(tokenization)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ahsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Number of songs (taken from the top of the dataset) used to build the model.
# NOTE(review): the variable below is still called `first_20000_songs_dataset`
# because other cells and the pickle file refer to that name, but only
# SONG_LIMIT rows are kept — presumably to keep the pairwise similarity
# matrix small; confirm the intended size.
SONG_LIMIT = 5000
first_20000_songs_dataset = new_songs_dataset.head(SONG_LIMIT)

# Turn the stemmed lyrics into TF-IDF vectors (English stop words removed) and
# compute the pairwise cosine similarity between all selected songs.
# Despite its name, `distance_similarity` holds similarities (higher means
# more alike), not distances.
vector = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = vector.fit_transform(first_20000_songs_dataset['text'])
distance_similarity = cosine_similarity(matrix)


In [6]:
def recommendation(lyrics_input, top_n=5):
    """Recommend songs whose lyrics are most similar to ``lyrics_input``.

    The query is cleaned exactly like the dataset's 'text' column (lower-cased,
    characters other than a-z, 0-9 and whitespace replaced by a space,
    whitespace runs collapsed, ends trimmed) before it is tokenized and
    stemmed. Without this step contractions and punctuation are tokenized
    differently in the query than in the corpus (e.g. "don't" yields "do" for
    the raw query but "don" for the cleaned corpus), which lowers the match
    quality.

    Args:
        lyrics_input: Raw lyrics text to search with.
        top_n: Number of song titles to return. Defaults to 5, the amount that
            was previously hard-coded.

    Returns:
        A list with up to ``top_n`` song titles, ordered from most to least
        similar. The single best-ranked song is not included (see the note in
        the body).
    """
    # Mirror the cleaning applied to the dataset lyrics.
    cleaned_input = re.sub(r'[^a-z0-9\s]', ' ', lyrics_input.lower())
    cleaned_input = re.sub(r'\s+', ' ', cleaned_input).strip()
    query_text = tokenization(cleaned_input)

    # Cosine similarity between the query and every song in the TF-IDF matrix.
    similarity_with_input = cosine_similarity(vector.transform([query_text]), matrix).flatten()

    # Song positions ordered from most to least similar.
    indices_sorted_by_similarity = similarity_with_input.argsort()[::-1]

    # Position 0 (the best match) is skipped, as the original code did.
    # NOTE(review): this looks intended to leave out the song the query lyrics
    # were taken from; for lyrics that are not in the dataset it drops the
    # closest match — confirm this is wanted.
    selected_indices = indices_sorted_by_similarity[1:1 + max(top_n, 0)]

    song_titles = first_20000_songs_dataset['song']
    return [song_titles.iloc[i] for i in selected_indices]


In [7]:
# Demo: look up recommendations for a short lyrics excerpt and print the titles.
sample_lyrics = "She's just my kind of girl, she makes me feel fine Who could ever believe that she could be mine?  She's just my kind of girl, without her I'm blue  And if she ever leaves me what could I do, what could I do?  "
recommended_songs = recommendation(sample_lyrics)
print("Recommended Songs:", recommended_songs)


Recommended Songs: ['Be Kind To Me', 'I Am Just A Girl', "That's Me", 'Kimono Girl', 'Famous Girl']


In [8]:
# Persist the artifacts for later use.
# `distance_similarity` and `first_20000_songs_dataset` are reused as computed
# in the earlier vectorization cell; fitting the vectorizer and recomputing the
# similarity matrix a second time would only repeat the same work.
# NOTE(review): recommendation() also needs `vector` and `matrix`, which are
# not saved here — confirm whether the consumer of these files requires them.
import pickle

# Context managers make sure each file is flushed and closed after writing.
with open('distance_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(distance_similarity, similarity_file)
with open('first_20000_songs_dataset.pkl', 'wb') as dataset_file:
    pickle.dump(first_20000_songs_dataset, dataset_file)
