In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import re


In [2]:
# Load the exported song dataset and discard its 'link' column in a single
# chained expression, so `df` is always rebuilt from the CSV when this cell runs.
df = pd.read_csv("Spotify Million Song Dataset_exported.csv").drop(columns=['link'])

# Data Preprocessing

In [3]:

def clean_lyrics(text):
    """Normalize a lyrics string: drop punctuation and collapse whitespace.

    Parameters
    ----------
    text : object
        The raw lyrics. Only ``str`` values are cleaned.

    Returns
    -------
    object
        For a string: the text with every character that is neither a word
        character (``\\w``) nor whitespace removed, every run of whitespace
        replaced by one space, and leading/trailing whitespace stripped.
        Any non-string value (e.g. ``None`` or NaN) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Strip punctuation BEFORE collapsing whitespace: deleting a symbol that sat
    # between two spaces ("a - b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run every lyrics entry through clean_lyrics, element by element, and store the
# cleaned values back in the same column.
df['text'] = df['text'].map(clean_lyrics)

In [4]:

# Fit a TF-IDF model on the lyrics column, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# clean_lyrics returns non-string values untouched, so a missing lyric (NaN)
# would make fit_transform raise. Treat missing lyrics as empty documents; this
# keeps one matrix row per row of `df`, which the positional lookup relies on.
tfidf_matrix = vectorizer.fit_transform(df['text'].fillna(''))


In [None]:
def find_song_by_snippet(snippet, df, vectorizer, tfidf_matrix, threshold=0.2):
    """Find the song whose lyrics are most similar to a snippet of text.

    Parameters
    ----------
    snippet : str
        The lyrics fragment to look up.
    df : pandas.DataFrame
        Frame with 'song' and 'artist' columns; row i is expected to
        correspond to row i of ``tfidf_matrix``.
    vectorizer : object
        Fitted vectorizer exposing ``transform`` (the one used to build
        ``tfidf_matrix``).
    tfidf_matrix : matrix
        Vectorized lyrics, one row per row of ``df``.
    threshold : float, optional
        Minimum cosine similarity the best match must exceed to be reported.
        Defaults to 0.2.

    Returns
    -------
    tuple
        ``(song, artist)`` of the best match, or
        ``("Song not found", "Artist not found")`` when the snippet is blank
        or not a string, the corpus is empty, or no row exceeds ``threshold``.
    """
    not_found = ("Song not found", "Artist not found")

    # A missing or blank snippet cannot match anything; skip the search.
    if not isinstance(snippet, str) or not snippet.strip():
        return not_found

    # Clean the query exactly like the corpus lyrics. Without this, punctuation
    # splits query tokens differently from the stored text ("don't" is stored
    # as "dont" but would be queried as "don"), so such words never match.
    query = clean_lyrics(snippet.lower())
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    if similarities.size == 0:  # empty corpus: argmax would raise
        return not_found

    best_index = similarities.argmax()
    if similarities[best_index] > threshold:  # reject weak matches
        match = df.iloc[best_index]
        return match['song'], match['artist']
    return not_found

# Output Interface

In [5]:

def identify_song(snippet):
    """Look up a lyrics snippet and format the result as display text.

    Delegates to find_song_by_snippet using the notebook-level ``df``,
    ``vectorizer`` and ``tfidf_matrix``, then returns a two-line string:
    a "Song: ..." line followed by an "Artist: ..." line.
    """
    title, performer = find_song_by_snippet(snippet, df, vectorizer, tfidf_matrix)
    report_lines = (f"Song: {title}", f"Artist: {performer}")
    return "\n".join(report_lines)


# Two-line text input where the user types the lyrics fragment.
snippet_box = gr.Textbox(lines=2, placeholder="Enter a snippet of lyrics...")

# Wire the input box to identify_song and show its return value as plain text.
iface = gr.Interface(
    fn=identify_song,
    inputs=snippet_box,
    outputs="text",
    title="Spotify Song Identifier",
    description="Enter a snippet of lyrics to identify the song and artist ",
)

# Start the Gradio app.
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


