In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Make sure the NLTK stop-word corpus is available locally.
# quiet=True suppresses the downloader's progress log, which otherwise writes
# the machine-specific nltk_data directory into the saved notebook output.
nltk.download("stopwords", quiet=True)

# A set gives constant-time membership checks when tokens are filtered later.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sankar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Load the dataset, keep only the columns used downstream and drop rows with
# missing values.
# The steps are chained so that re-running the cell always rebuilds `data`
# from the CSV; calling dropna(inplace=True) on a column subset mutates a
# derived frame and is the usual trigger for pandas' SettingWithCopyWarning.
data = (
    pd.read_csv("../data/Spotify Million Song Dataset_exported.csv")
    .loc[:, ["artist", "song", "text"]]
    .dropna()
    # Renumber rows 0..n-1 so index labels agree with the positional
    # (.iloc) lookups performed on this frame.
    .reset_index(drop=True)
)

data.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [9]:
def clean_text(text):
    """Normalise raw lyrics into a space-separated string of content words.

    The text is lower-cased, every character that is not a-z or whitespace is
    deleted, the remainder is split on whitespace, and tokens found in the
    module-level ``stop_words`` set are dropped.

    Note: punctuation is deleted before the stop-word check, so a contraction
    such as "I'll" is compared as "ill".
    """
    # Lower-case first so the a-z character class covers every letter.
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())

    # Keep tokens, in their original order, that are not stop words.
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)

    return " ".join(kept_tokens)

In [11]:
# Run the cleaning function over every lyric and store the result alongside
# the original text.
data["clean_lyrics"] = data["text"].map(clean_text)

# Preview the cleaned lyrics next to their identifying columns.
data.loc[:, ["artist", "song", "clean_lyrics"]].head()

Unnamed: 0,artist,song,clean_lyrics
0,ABBA,Ahe's My Kind Of Girl,look face wonderful face means something speci...
1,ABBA,"Andante, Andante",take easy please touch gently like summer even...
2,ABBA,As Good As New,ill never know go put lousy rotten show boy to...
3,ABBA,Bang,making somebody happy question give take learn...
4,ABBA,Bang-A-Boomerang,making somebody happy question give take learn...


In [13]:
# TF-IDF over word n-grams of length 1 to 4, ignoring terms that occur in
# fewer than 2 songs and capping the vocabulary at 80,000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 4), min_df=2, max_features=80000)

# Learn the vocabulary from the cleaned lyrics and vectorise them in one pass;
# row i of the matrix corresponds to row i of `data`.
lyrics_vectors = tfidf.fit_transform(data["clean_lyrics"])

lyrics_vectors.shape

(57650, 80000)

In [14]:
def find_song(lyrics_snippet, top_n=1):
    """Return the songs whose lyrics are most similar to a snippet.

    The snippet is normalised with ``clean_text``, projected into the fitted
    ``tfidf`` space and compared against every row of ``lyrics_vectors`` using
    cosine similarity.

    Parameters
    ----------
    lyrics_snippet : str
        Raw lyric fragment to look up.
    top_n : int, default 1
        Number of matches to return. Values less than or equal to zero yield
        an empty list.

    Returns
    -------
    list of dict
        One dict per match, best match first, with keys ``"Song"``,
        ``"Artist"`` and ``"Score"`` (cosine similarity rounded to 3 decimals).
    """
    # Slicing with [-0:] selects the whole array and a negative top_n slices
    # from the front, so non-positive requests are answered before slicing.
    if top_n <= 0:
        return []

    # preprocess input text
    cleaned_snippet = clean_text(lyrics_snippet)

    # convert snippet to TF-IDF vector
    snippet_vector = tfidf.transform([cleaned_snippet])

    # The similarity matrix has a single row (one query); keep just that row.
    scores = cosine_similarity(snippet_vector, lyrics_vectors)[0]

    # argsort is ascending: take the last top_n positions and reverse them so
    # the highest score comes first.
    best_matches = scores.argsort()[-top_n:][::-1]

    results = []
    for idx in best_matches:
        # Positional lookup: lyrics_vectors was built from data's rows in order.
        row = data.iloc[idx]
        results.append({
            "Song": row["song"],
            "Artist": row["artist"],
            # Plain Python float keeps the printed result free of NumPy wrappers.
            "Score": round(float(scores[idx]), 3)
        })

    return results

In [17]:
# Query the model with a short lyric fragment.
test_lyrics = "I'll never know why I had to go"
prediction = find_song(test_lyrics)

# Report each returned match, one field per line.
for item in prediction:
    song_title, artist_name, match_score = item["Song"], item["Artist"], item["Score"]
    print("Song   :", song_title)
    print("Artist :", artist_name)
    print("Score  :", match_score)

Song   : Coastline
Artist : America
Score  : 0.487


In [32]:
def check_accuracy(samples=100, seed=None):
    """Estimate how often ``find_song`` recovers a song from part of its lyrics.

    ``samples`` distinct rows of ``data`` are drawn at random. For each row
    with at least 40 whitespace-separated words, words 10-39 of the raw lyrics
    are used as the query and the top prediction's title is compared with the
    row's title.

    Parameters
    ----------
    samples : int, default 100
        Number of rows to draw (without replacement).
    seed : int or None, default None
        When given, the draw uses a dedicated ``RandomState(seed)`` so the
        estimate is reproducible. When ``None``, NumPy's global random state
        is used.

    Returns
    -------
    float
        Fraction of *evaluated* rows whose title was predicted correctly.
        Rows skipped for being too short are excluded from the denominator.
        Returns 0.0 when nothing could be evaluated.
    """
    # Nothing to draw: avoid a division by zero further down.
    if samples <= 0:
        return 0.0

    rng = np.random if seed is None else np.random.RandomState(seed)
    random_rows = rng.choice(len(data), samples, replace=False)

    correct = 0
    evaluated = 0

    for i in random_rows:
        row = data.iloc[i]
        words = row["text"].split()

        # Too short to cut a 30-word snippet starting at word 10.
        if len(words) < 40:
            continue

        snippet = " ".join(words[10:40])
        evaluated += 1

        matches = find_song(snippet)
        if matches and matches[0]["Song"] == row["song"]:
            correct += 1

    # Divide by the rows actually tested; dividing by `samples` would count
    # every skipped row as a miss and understate the accuracy.
    return correct / evaluated if evaluated else 0.0

In [34]:
# Evaluate on 100 randomly drawn songs and report the result as a percentage.
accuracy = check_accuracy(100)
accuracy_percent = round(accuracy * 100, 2)

print("Model Accuracy:", accuracy_percent, "%")

Model Accuracy: 74.0 %
