<a href="https://colab.research.google.com/github/Shreyanshy53/Spotify_Lyric_Search/blob/main/Spotify_Lyric_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import os
import pandas as pd

folder = "/content/drive/MyDrive/spotify_lyric_search"

csv_file = [f for f in os.listdir(folder) if f.endswith(".csv")][0]
data_path = os.path.join(folder, csv_file)

print("Using file:", data_path)

df = pd.read_csv(data_path)
print(df.shape)
df.head()


Using file: /content/drive/MyDrive/spotify_lyric_search/Spotify_Million_Song_Dataset_exported.csv
(57650, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [12]:
df = df.rename(columns={
    "artist": "artist_name",
    "song": "track_name",
    "text": "lyrics"
})

df = df[["track_name", "artist_name", "lyrics"]]
df.head()


Unnamed: 0,track_name,artist_name,lyrics
0,Ahe's My Kind Of Girl,ABBA,"Look at her face, it's a wonderful face \nAnd..."
1,"Andante, Andante",ABBA,"Take it easy with me, please \nTouch me gentl..."
2,As Good As New,ABBA,I'll never know why I had to go \nWhy I had t...
3,Bang,ABBA,Making somebody happy is a question of give an...
4,Bang-A-Boomerang,ABBA,Making somebody happy is a question of give an...


In [13]:
print(df.isna().sum())
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
print("After cleaning:", df.shape)


track_name     0
artist_name    0
lyrics         0
dtype: int64
After cleaning: (57650, 3)


In [14]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

df["cleaned_lyrics"] = df["lyrics"].apply(preprocess)
df[["track_name", "cleaned_lyrics"]].head()


Unnamed: 0,track_name,cleaned_lyrics
0,Ahe's My Kind Of Girl,look face wonderful face means something speci...
1,"Andante, Andante",take easy please touch gently like summer even...
2,As Good As New,ill never know go put lousy rotten show boy to...
3,Bang,making somebody happy question give take learn...
4,Bang-A-Boomerang,making somebody happy question give take learn...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X = vectorizer.fit_transform(df["cleaned_lyrics"])
print("TF-IDF shape:", X.shape)


TF-IDF shape: (57650, 5000)


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_song(lyric_snippet, top_k=1):
    snippet = preprocess(lyric_snippet)
    snippet_vec = vectorizer.transform([snippet])

    similarity = cosine_similarity(snippet_vec, X)[0]
    top_indices = similarity.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        results.append({
            "song": df.iloc[idx]["track_name"],
            "artist": df.iloc[idx]["artist_name"],
            "similarity": similarity[idx]
        })
    return results


In [17]:
predict_song("hello from the other side I must have called a thousand times")


[{'song': 'A Girl Like You',
  'artist': 'Tom Jones',
  'similarity': np.float64(0.540285048569458)}]

In [19]:
correct = 0
samples = 100

test_df = df.sample(samples, random_state=42)

for _, row in test_df.iterrows():
    lyrics = row["lyrics"]
    mid = len(lyrics) // 2
    snippet = lyrics[mid: mid+150]

    pred = predict_song(snippet)[0]
    if pred["song"] == row["track_name"]:
        correct += 1

accuracy = correct / samples
accuracy


0.61