In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the full lyrics dataset (no sampling: 100% of the rows are used).
df = pd.read_csv('spotify_millsongdata.csv')

# Drop songs with missing lyrics, then renumber the rows 0..n-1.
# Without the reset, dropna() leaves gaps in the index labels, and any later
# code that uses a label from `df.index` as a row position (for example to
# index a matrix built from df['text']) would silently point at the wrong song.
df = df.dropna(subset=['text']).reset_index(drop=True)

print(f"Dataset loaded with {len(df)} songs.")

Dataset loaded with 57650 songs.


In [8]:
# Convert all lyrics to numbers: fit a TF-IDF vocabulary on the lyrics column
# (English stop words removed) and encode each song as one row of `matrix`,
# in the same order as the rows of `df`.
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(df['text'])

# Calculate similarity (This part is heavy on memory): every row of `matrix`
# is compared against every row, so row i of `similarity` holds the scores of
# the i-th song in `df` against all songs.
# NOTE(review): with the 57650 songs reported at load time, a dense float64
# songs-by-songs result is roughly 57650**2 * 8 bytes, i.e. about 26 GB.
# Confirm the kernel has that much RAM, or consider computing one row on
# demand with cosine_similarity(matrix[i], matrix) instead.
similarity = cosine_similarity(matrix)
print("Analysis complete.")

Analysis complete.


In [5]:
def recommend(song_name, top_n=5):
    """Print the songs whose lyrics are most similar to `song_name`.

    Reads the module-level `df` (columns 'song' and 'artist') and
    `similarity` (row i holds the scores of the i-th row of `df` against
    every row).

    Parameters
    ----------
    song_name : str
        Exact title to look up in df['song']. If several rows share the
        title, the first one in `df` is used.
    top_n : int, default 5
        Maximum number of recommendations to print.

    Returns
    -------
    None
        Results (or a "not found" message) are printed.
    """
    # Work with row *positions*, not index labels: `similarity` and `df.iloc`
    # are both positional, while df.index may have gaps (e.g. after dropna).
    matches = (df['song'] == song_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Song '{song_name}' not found.")
        return

    idx = int(matches[0])
    scores = similarity[idx]

    # Stable descending sort, so ties keep dataset order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    # Exclude the query song explicitly instead of assuming it sorts first:
    # another song with identical lyrics can tie with it at the top.
    picks = [pos for pos in ranked if pos != idx][:top_n]

    print(f"Recommendations for '{song_name}':")
    for pos in picks:
        row = df.iloc[pos]
        print(f"- {row['song']} by {row['artist']}")

In [9]:
# Test it: print the closest matches for one title from the dataset.
# The lookup is an exact string comparison against df['song'], so the title
# must be spelled exactly as stored; the output below shows this spelling
# ("Ahe's") is found.
recommend("Ahe's My Kind Of Girl")

Recommendations for 'Ahe's My Kind Of Girl':
- What Kind Of Girl by Air Supply
- Not That Kind Of Love by Alice Cooper
- The Messenger by Linkin Park
- Girl Like Mine by Roy Orbison
- Marilyn Monroe by Pharrell Williams
