# Taylor Swift Song Recommendation System

In [16]:
import re
import time
import nltk
import base64
import random
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Required keys

In [1]:
# Client ID and Client Secret from your Spotify Developer Dashboard and Genius API
client_id="0c64bcdd0c2a4358bc87d3bcc131ead0"
client_secret="21dbcb4e2819426f84774f107e3afd30"
genius_id = "0_wLcdeAud59txTzl5JFX3sviHZ1AJ_eXDkGgckvKwCo1zXEVk-b_wgKoIgFGPWA"
genius_secret = "_3AifiMFK0O66f0OpsXR0XTfHTBS6oyMXbc2VjHpsid2WYSS79MC88-0VVlOdWw5UgHhe3BULyzufwc9dmuTTA"
genius_token = "2rjwQEla2H09EoEfojKm1sB0naqVQGt0m1ZopxDsg25dXBAzD7wMadTrmawElSOt"

### Data Collection

In [3]:
# ----------------------------------------------
# 1. Spotify API - Get All Taylor Swift's Tracks
# ----------------------------------------------

# Spotify API credentials
spotify_client_id = client_id
spotify_client_secret = client_secret

# Function to get Spotify access token
def get_spotify_token(client_id, client_secret):
    client_creds = f"{client_id}:{client_secret}"
    client_creds_b64 = base64.b64encode(client_creds.encode())

    token_url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": f"Basic {client_creds_b64.decode()}"
    }
    data = {
        "grant_type": "client_credentials"
    }

    response = requests.post(token_url, headers=headers, data=data)
    return response.json().get('access_token')

# Get Spotify access token
spotify_token = get_spotify_token(spotify_client_id, spotify_client_secret)

# Function to get Taylor Swift's Artist ID
def get_artist_id(artist_name):
    search_url = "https://api.spotify.com/v1/search"
    headers = {
        "Authorization": f"Bearer {spotify_token}"
    }
    params = {
        "q": artist_name,
        "type": "artist",
        "limit": 1
    }

    response = requests.get(search_url, headers=headers, params=params)
    artist_info = response.json()
    return artist_info['artists']['items'][0]['id']

# Get Taylor Swift's artist ID
artist_id = get_artist_id("Taylor Swift")

# Function to get all tracks by an artist
def get_all_tracks(artist_id):
    albums_url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = {
        "Authorization": f"Bearer {spotify_token}"
    }
    params = {
        "include_groups": "album,single",
        "limit": 50
    }

    response = requests.get(albums_url, headers=headers, params=params)
    albums = response.json()
    tracks = []

    for album in albums['items']:
        album_id = album['id']
        album_name = album['name']
        
        # Get tracks for each album
        tracks_url = f"https://api.spotify.com/v1/albums/{album_id}/tracks"
        response = requests.get(tracks_url, headers=headers)
        album_tracks = response.json()
        
        for track in album_tracks['items']:
            tracks.append({
                'track_name': track['name'],
                'album_name': album_name,
                'track_id': track['id'],
                'duration_ms': track['duration_ms']
            })
    
    return tracks

# Fetch all tracks by Taylor Swift
tracks = get_all_tracks(artist_id)

# -------------------------------------------------
# 2. Spotify Audio Features API - Get Song Mood Data
# -------------------------------------------------

# Function to get audio features for a track
def get_audio_features(track_ids):
    features_url = "https://api.spotify.com/v1/audio-features"
    headers = {
        "Authorization": f"Bearer {spotify_token}"
    }
    
    audio_features = []
    # Fetch in batches of up to 100 track_ids (Spotify API limit)
    for i in range(0, len(track_ids), 100):
        ids_batch = ",".join(track_ids[i:i+100])
        response = requests.get(features_url, headers=headers, params={'ids': ids_batch})
        
        if response.status_code == 200:
            features = response.json()['audio_features']
            if features:
                audio_features += features
            else:
                print(f"No features found for batch {i // 100 + 1}.")
        else:
            print(f"Error fetching features: {response.status_code} - {response.text}")
        
        # Sleep to avoid rate limits
        time.sleep(0.2)  # Spotify API allows 5 requests per second
    
    return audio_features

def get_audio_features_for_mood(track_id):
    features_url = f"https://api.spotify.com/v1/audio-features/{track_id}"
    headers = {
        "Authorization": f"Bearer {spotify_token}"
    }

    response = requests.get(features_url, headers=headers)
    if response.status_code == 200:
        return response.json()
    return None
    
# Function to predict mood based on audio features
def predict_mood(valence, energy):
    if valence > 0.8:
        return "Happy"
    elif valence < 0.35:
        return "Sad"
    elif energy > 0.7:
        return "Energetic"
    else:
        return "Calm"

# -------------------------------------------
# 3. Genius API - Get Lyrics for Each Track
# -------------------------------------------

# Function to search for lyrics on Genius
def get_lyrics(song_title, artist_name):
    base_url = "https://api.genius.com"
    headers = {
        "Authorization": f"Bearer {genius_token}"
    }
    
    search_url = base_url + "/search"
    params = {'q': f"{song_title} {artist_name}"}
    response = requests.get(search_url, headers=headers, params=params)
    
    if response.status_code != 200:
        return None  # No lyrics found
    
    search_results = response.json()
    if len(search_results['response']['hits']) == 0:
        return None
    
    # Get the URL of the first matching song on Genius
    song_url = search_results['response']['hits'][0]['result']['url']
    return song_url

# Function to extract lyrics from Genius song URL (using web scraping)
def extract_lyrics(lyrics_url):
    response = requests.get(lyrics_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Genius has a new structure for lyrics in <div class='Lyrics__Container'>
    lyrics = ""
    for div in soup.find_all("div", class_=re.compile("^Lyrics__Container")):
        for br in div.find_all("br"):
            br.replace_with("\n")  # Handle line breaks
        lyrics += div.get_text(separator="\n").strip()

    if not lyrics:
        return None
    
    return lyrics.strip()

# -----------------------------------------------------
# 4. Combine Track Data with Lyrics and Mood Prediction
# -----------------------------------------------------

tracks_with_lyrics_and_mood = []

for track in tracks:
    track_name = track['track_name']
    artist_name = "Taylor Swift"  # All tracks are by Taylor Swift
    
    # Fetch lyrics for each track
    print(f"Fetching lyrics for: {track_name}")
    lyrics_url = get_lyrics(track_name, artist_name)
    
    if lyrics_url:
        lyrics = extract_lyrics(lyrics_url)
        if lyrics:
            print(f"Lyrics found for '{track_name}'")
        else:
            print(f"Lyrics not available for '{track_name}'")
    else:
        lyrics = None
        print(f"No lyrics found for '{track_name}'")
    
    # Fetch audio features and predict mood
    audio_features = get_audio_features_for_mood(track['track_id'])
    if audio_features:
        valence = audio_features.get('valence', 0)
        energy = audio_features.get('energy', 0)
        mood = predict_mood(valence, energy)
    else:
        mood = "Unknown"
    
    # Append lyrics and mood to the track data
    track['lyrics'] = lyrics
    track['mood'] = mood
    tracks_with_lyrics_and_mood.append(track)
    
    # Be mindful of rate limits by adding delay
    time.sleep(1)

# Convert to DataFrame and save
df = pd.DataFrame(tracks_with_lyrics_and_mood)

# Get the list of track IDs
track_ids = df['track_id'].tolist()

# Fetch audio features
audio_features = get_audio_features(track_ids)

# Convert audio features into a DataFrame
audio_features_df = pd.DataFrame(audio_features)

# Merge the audio features with the original dataset based on 'track_id'
df = pd.merge(df, audio_features_df, left_on='track_id', right_on = 'id', how='left')

df.to_csv('taylor_swift_tracks_dataset.csv', index=False)

Fetching lyrics for: Fortnight (feat. Post Malone)
Lyrics found for 'Fortnight (feat. Post Malone)'
Fetching lyrics for: The Tortured Poets Department
Lyrics found for 'The Tortured Poets Department'
Fetching lyrics for: My Boy Only Breaks His Favorite Toys
Lyrics found for 'My Boy Only Breaks His Favorite Toys'
Fetching lyrics for: Down Bad
Lyrics found for 'Down Bad'
Fetching lyrics for: So Long, London
Lyrics found for 'So Long, London'
Fetching lyrics for: But Daddy I Love Him
Lyrics found for 'But Daddy I Love Him'
Fetching lyrics for: Fresh Out The Slammer
Lyrics found for 'Fresh Out The Slammer'
Fetching lyrics for: Florida!!! (feat. Florence + The Machine)
Lyrics found for 'Florida!!! (feat. Florence + The Machine)'
Fetching lyrics for: Guilty as Sin?
Lyrics found for 'Guilty as Sin?'
Fetching lyrics for: Who’s Afraid of Little Old Me?
Lyrics found for 'Who’s Afraid of Little Old Me?'
Fetching lyrics for: I Can Fix Him (No Really I Can)
Lyrics found for 'I Can Fix Him (No Reall

### Recommendations

In [17]:
# Load the dataset with lyrics, mood, and audio features
df = pd.read_csv('taylor_swift_tracks_dataset.csv')

# Preprocess: Replace NaN lyrics, mood with empty strings or defaults
df['lyrics'] = df['lyrics'].fillna('')
df['mood'] = df['mood'].fillna('neutral')  # Default mood

# Select audio features to include in similarity calculation
audio_features = ['valence', 'energy', 'danceability', 'acousticness', 'tempo']

# Normalize audio features (bring all features to a common scale)
scaler = MinMaxScaler()
df[audio_features] = scaler.fit_transform(df[audio_features])

# Lemmatizer to normalize words
lemmatizer = WordNetLemmatizer()

# Custom function to preprocess lyrics (tokenization + lemmatization)
def preprocess_lyrics(lyrics):
    # Tokenize the lyrics
    tokens = word_tokenize(lyrics)
    
    # Lemmatize each word
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha()]
    
    # Join lemmatized tokens back into a string
    return ' '.join(lemmatized_tokens)

# Apply preprocessing to the lyrics
df['processed_lyrics'] = df['lyrics'].apply(preprocess_lyrics)

# Initialize the TF-IDF Vectorizer for lyrics
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_lyrics'])



# Initialize VADER sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

# Function to calculate sentiment score for each song
def get_sentiment(lyrics):
    if lyrics:  # Ensure there are lyrics to analyze
        sentiment = sid.polarity_scores(lyrics)
        return sentiment['compound']  # Return the compound sentiment score (-1 to +1)
    return 0  # Return neutral if no lyrics available

# Apply sentiment analysis to each song's lyrics and store the sentiment score
df['sentiment'] = df['lyrics'].apply(get_sentiment)

# Helper function to extract base title from the full title
def extract_base_title(title):
    base_title = title.lower().split('(')[0].strip()  # Get text before any parentheses
    return base_title

# Helper function to calculate mood similarity
def mood_similarity(mood1, mood2):
    return 1 if mood1 == mood2 else 0
    

# Function to recommend similar songs based on lyrics and sentiment
def recommend_songs_based_on_lyrics_and_sentiment(song_name, num_recommendations=5, mood_weight=0.3, audio_weight=0.4):

    # Find the index of the song in the dataframe
    try:
        idx = df[df['track_name'].str.lower() == song_name.lower()].index[0]
    except IndexError:
        print(f"Song '{song_name}' not found in the dataset.")
        return
    
    # Get the mood and audio features of the input song
    input_song_mood = df.iloc[idx]['mood']
    input_song_audio_features = df.iloc[idx][audio_features].values.reshape(1, -1)
    
    # Compute cosine similarity for lyrics
    cosine_similarities_lyrics = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    
    # Compute cosine similarity for audio features
    cosine_similarities_audio = cosine_similarity(input_song_audio_features, df[audio_features]).flatten()

    # Extract sentiment of the input song
    input_song_sentiment = df.iloc[idx]['sentiment']
    
    # Extract the base title of the input song to exclude its versions
    base_title_input = extract_base_title(song_name)
    
    # Get indices of the top similar songs, excluding the input song and its versions
    similar_indices = cosine_similarities_lyrics.argsort()[::-1]
    
    # Filter out the input song's versions and duplicate base titles
    recommended_songs = []
    for i in similar_indices:
        if df.iloc[i]['track_name'].lower() == song_name.lower():
            continue  # Skip the input song itself
        
        base_title_recommendation = extract_base_title(df.iloc[i]['track_name'])
        
        # Exclude songs that have the same base title as the input song or previously recommended songs
        if base_title_recommendation != base_title_input and base_title_recommendation not in [extract_base_title(song['track_name']) for song, _ in recommended_songs]:
            # Calculate mood similarity
            mood_sim = mood_similarity(input_song_mood, df.iloc[i]['mood'])
            
            # Combine lyrics similarity, mood similarity, and audio similarity using weights
            combined_score = ((1 - mood_weight - audio_weight) * cosine_similarities_lyrics[i] +
                              mood_weight * mood_sim +
                              audio_weight * cosine_similarities_audio[i])
            sentiment_difference = np.abs(df.iloc[i]['sentiment'] - input_song_sentiment)
            
            # Only recommend songs with a similar sentiment (difference below a certain threshold)
            if sentiment_difference < 0.2: 
                recommended_songs.append((df.iloc[i], combined_score))
        
        # Stop when we have enough unique recommendations
        if len(recommended_songs) == num_recommendations:
            break
    
    # Sort the recommendations by combined score
    recommended_songs = sorted(recommended_songs, key=lambda x: x[1], reverse=True)
    
    # Print the top unique recommendations
    print(f"\nTop {num_recommendations} unique recommendations based on lyrics and sentiment for '{song_name}':\n")
    for song, score in recommended_songs:
        print(f"{song['track_name']} (Album: {song['album_name']}, Score: {score:.4f}, Mood: {song['mood']}, Sentiment: {song['sentiment']})")
        print("\n")
# Function to randomly pick a song from the dataset
def pick_random_song():
    random_index = random.randint(0, len(df) - 1)  # Pick a random index
    random_song = df.iloc[random_index]['track_name']
    print(f"\nRandomly picked song: {random_song}")
    return random_song

In [18]:
random_song = pick_random_song()
print(f"Mood of the song is {df[df['track_name'] == random_song]['mood'].values[0]}")
# Example: Recommend songs based on "Love Story"
recommend_songs_based_on_lyrics_and_sentiment(random_song, num_recommendations=5, mood_weight=0.5, audio_weight=0.5)


Randomly picked song: Sparks Fly
Mood of the song is Energetic

Top 5 unique recommendations based on lyrics and sentiment for 'Sparks Fly':

Beautiful Eyes - Live From Clear Channel Stripped 2008 (Album: Live From Clear Channel Stripped 2008, Score: 0.9384, Mood: Energetic, Sentiment: 0.9582)


Superman (Taylor’s Version) (Album: Speak Now (Taylor's Version), Score: 0.4940, Mood: Sad, Sentiment: 0.9953)


I Can See You (Taylor’s Version) (From The Vault) (Album: Speak Now (Taylor's Version), Score: 0.4779, Mood: Happy, Sentiment: 0.928)


I Did Something Bad (Album: reputation, Score: 0.4764, Mood: Sad, Sentiment: 0.9991)


Invisible (Album: Taylor Swift, Score: 0.3581, Mood: Sad, Sentiment: 0.99)


