## Code Document

## Test Code

### Working on Spotify Playlist


Importing libraries and setting up authentication with Spotify API.

In [None]:
#Importing data from Spotify dataset

import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Replace 'your_client_id', 'your_client_secret', and 'your_redirect_uri' with your actual values
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id='5093ad0ad45f435ca7ce43b2afd406b9',
    client_secret='f151962f414144aa85494be81ccd8e20',
    redirect_uri='http://localhost:8888/callback',
    scope="user-library-read playlist-read-private"))

Fetching data from Spotify

In [None]:
'''playlist_id = '6JOysTE0drK9yDi1Xs4FKy'  # Replace with the actual playlist ID
playlist = sp.playlist(playlist_id)
print(playlist['name'])
for item in playlist['tracks']['items']:
    track = item['track']
    print(track['name'], '-', track['artists'][0]['name'])'''

In [None]:
import pandas as pd

# Fetch saved tracks
playlist_id = '6JOysTE0drK9yDi1Xs4FKy'

results = sp.playlist_tracks(playlist_id)
tracks_data = {
    'Name': [],
    'Artist': [],
    'Album': [],
    'Release Date': []
}

for item in results['items']:
    track = item.get('track')
    if track:  # Check if track details are present
        name = track.get('name', 'No Title Available')
        artist_name = track['artists'][0].get('name', 'No Artist Available') if track['artists'] else 'No Artists'
        album_name = track['album'].get('name', 'No Album Available') if track['album'] else 'No Album'
        release_date = track['album'].get('release_date', 'No Release Date Available') if track['album'] else 'No Release Info'
        
        tracks_data['Name'].append(name)
        tracks_data['Artist'].append(artist_name)
        tracks_data['Album'].append(album_name)
        tracks_data['Release Date'].append(release_date)

# Convert to DataFrame
df_tracks = pd.DataFrame(tracks_data)
print(df_tracks)

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Initialize the Spotify client
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id='your_client_id',
    client_secret='your_client_secret',
    redirect_uri='http://localhost:8888/callback',
    scope='user-library-read'))

# Search for tracks by genre
genre = "rock"  # Example genre
results = sp.search(q='genre:' + genre, type='track', limit=50)
tracks_data = {
    'Name': [],
    'Artist': [],
    'Genre': genre,
    'Popularity': []
}

for track in results['tracks']['items']:
    tracks_data['Name'].append(track['name'])
    tracks_data['Artist'].append(track['artists'][0]['name'])
    tracks_data['Popularity'].append(track['popularity'])

# Print or process the results
print([tracks_data])

#### Importing Song Data with Lyrics using Genius API

In [None]:
import lyricsgenius

# Initialize Genius API
genius = lyricsgenius.Genius('put_your_token_here')

genius.remove_section_headers = True

# Spotify Global Top 50 playlist ID
playlist_id = '37i9dQZEVXbMDoHDwVN2tF'  # Example ID, update as necessary

results = sp.playlist_tracks(playlist_id)
tracks_data = []

# Fetch track details from Spotify
for item in results['items']:
    track = item['track']
    track_info = {
        'Name': track['name'],
        'Artist': track['artists'][0]['name'] if track['artists'] else 'Unknown',
        'Album': track['album']['name'] if track['album'] else 'Unknown',
        'Release Date': track['album']['release_date'] if track['album'] else 'Unknown',
        'Popularity': track['popularity']
    }
    # Fetch lyrics using Genius
    song = genius.search_song(track_info['Name'], track_info['Artist'])
    track_info['Lyrics'] = song.lyrics if song else 'Lyrics not found'
    
    tracks_data.append(track_info)

In [None]:
import pandas as pd

# Convert the data to a pandas DataFrame
df_tracks = pd.DataFrame(tracks_data)

# Set column names and display the DataFrame
df_tracks.columns = ['Track Name', 'Artist', 'Album', 'Release Date', 'Popularity', 'Lyrics']
print(df_tracks)

In [None]:
# Save DataFrame to CSV file
df_tracks.to_csv('spotify_songs_with_lyrics.csv', index=False)

In [None]:
df_tracks.columns

### Preprocessing data

Clean the lyrics by removing punctuation, converting to lowercase, and other typical text cleaning steps:

In [None]:
import re

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation
    text = text.lower()  # Convert to lower case
    return text

# Apply the cleaning function to the lyrics column
df_tracks['cleaned_lyrics'] = df_tracks['Lyrics'].apply(clean_text)

### Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_tracks['cleaned_lyrics'])

### Sentiment Analysis using VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Apply the sentiment analysis function to the cleaned lyrics
df_tracks['sentiment_score'] = df_tracks['cleaned_lyrics'].apply(get_sentiment)

### Assign Sentiment Labels

Classifying the sentiments as positive, neutral or negative based on the computed score

In [None]:
def assign_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment label assignment
df_tracks['sentiment_label'] = df_tracks['sentiment_score'].apply(assign_sentiment)

In [None]:
print(df_tracks[['Track Name', 'Artist', 'sentiment_score', 'sentiment_label']])

In [None]:
df_tracks[['Track Name', 'Artist', 'sentiment_score', 'sentiment_label']].to_csv('test_sentiment.csv', index=False)

test_sentiment = pd.read_csv('test_sentiment.csv')

### Importing Json file to Pandas readable format

Start by importing spotify challenge (AI Crowd) file into python.

In [None]:
import pandas as pd
import json

# Load JSON data from file
with open('/Users/rishabhhasija/Documents/neuefische/sentify_recommender/data/challenge_set.json', 'r') as file:
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

# Display the first few rows of the DataFrame to verify
#print(df_spotify.head())


#### Lyrics Code & Data

In [None]:
print(df_spotify.shape)
#print(df_spotify.head())

In [None]:
df_spotify.columns

In [None]:
df_spotify['playlist_num_tracks'].unique()

In [None]:
#df_spotify
#df_lyrics = df_spotify['track_uri'].unique()  
df_lyrics = df_spotify.drop_duplicates(subset=['track_uri']).drop(['playlist_name', 'playlist_pid', 'playlist_num_tracks', 'track_pos', 'artist_name',
       'artist_uri', 'track_name', 'album_uri', 'duration_ms', 'album_name'], axis=1).reset_index(drop=True)

In [None]:
df_lyrics.shape

In [None]:
mini_df = df_lyrics.head(47397)
mini_df.shape

In [None]:
df_lyrics.to_csv('df_lyrics.csv', index=False)

In [None]:
mini_df.to_csv('mini_df.csv', index=False)

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

auth_manager = SpotifyClientCredentials(client_id='59cdbf840d9245118927fc195c3e4e0a', client_secret='f3b0a7237fb04ca2ba9fc6bbefb729b7')
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
#import os
#def file_exists_and_not_empty(file_name):
#    return os.path.isfile(file_name) and os.stat(file_name).st_size > 0

In [None]:
def process_batch(batch_data):
    # Creating an empty DataFrame to store updated data
    processed_data = pd.DataFrame(columns=['track_uri', 'artist_name', 'track_name'])
    
    # Iterating over the batch_data to fetch details from Spotify
    for i, uri in enumerate(batch_data['track_uri']):
        try:
            track = sp.track(uri)
            processed_data.loc[i, 'track_uri'] = uri
            processed_data.loc[i, 'artist_name'] = track['artists'][0]['name']
            processed_data.loc[i, 'track_name'] = track['name']
            print(i)
        except Exception as e:
            print(f"Error processing {uri}: {e}")
    return processed_data

In [None]:
chunk_size = 10
csv_file = "mini_df.csv"  # Path to input CSV file
output_file = "mini_test.csv"  # Path to save the processed data

start_row = 46244 # Adjust row number from which processing needs to be started

# Read the CSV file in batches and process each batch
for chunk in pd.read_csv(csv_file, chunksize=chunk_size, skiprows=range(1, start_row)):
    time.sleep(5)
    processed_chunk = process_batch(chunk)
    # Append each processed chunk to the output file
    processed_chunk.to_csv(output_file, mode="a", header=False, index=False)

#### Removing Duplicates

In [None]:
# Reading dataset & assigning column names
df1 = pd.read_csv('mini_test.csv', header=None)
df1.columns = ['track_uri', 'artist_name', 'track_name']
df1.head()

In [None]:
#df1[df1.duplicated(keep=False)]
df1_unique = df1.drop_duplicates(keep='first')
df1_unique.shape

In [None]:
data1 = ['df1', 'df2']

#data2 = [pd.read_csv(f) for f in data1]
spotify_data1 = pd.concat([df1, df2], ignore_index=True)
spotify_data1.shape

In [None]:
spotify_data = spotify_data1.drop_duplicates(keep='first')
#spotify_data.shape
spotify_data.to_csv('spotify_data.csv', index=False)


### Testing on Genius

In [None]:
import pandas as pd
spotify_data = pd.read_csv('/Users/rishabhhasija/Documents/neuefische/sentify_recommender/clean_spotify_data.csv', index_col=None)
df = spotify_data.iloc[43999:]
df = df.reset_index(drop=True)
df.head()

In [None]:
# Calling for Genius API credentials from dotenv file
from dotenv import load_dotenv
import os
import lyricsgenius
import pandas as pd

load_dotenv()
token = os.getenv("genius_token")
genius = lyricsgenius.Genius(token) # Providing token info to access information 
genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.timeout = 60 # Timeout call for API

#### Try 2

In [None]:
import lyricsgenius
import time

genius = lyricsgenius.Genius('your_genius_api_token')
genius.remove_section_headers = True

# Defining the batches at which to save progress
batch_size = 10
num_songs = len(df)

# Specify the starting index from where to resume processing
start_index = int(input("input value here"))

for i in range(start_index, num_songs, batch_size):
    end_index = min(i + batch_size, num_songs)

    for j in range(i, end_index):
        try:
            song = genius.search_song(df.loc[j, 'track_name'], df.loc[j, 'artist_name'])
            df.loc[j, 'lyrics'] = song.lyrics.replace('\n', ' ') if song else 'Lyrics not found'
        except Exception as e:
            print(f"Error fetching lyrics for index {j}: {e}")
            df.loc[j, 'lyrics'] = 'Error fetching lyrics'

    # Save the DataFrame at regular intervals
    df.to_csv('lyrics_rishabh.csv', index=False)
    print(f"Progress saved up to index {end_index - 1}")

    #time.sleep(10)

In [None]:
import pandas as pd
import lyricsgenius
import os

# Initialize the Genius API
genius = lyricsgenius.Genius('g6Ycf22ZvPBoLt_IARCcp8_DUVzU0EHrGdgOKV_v9yqFutnG2F3HBoR1UJcoFDal')
genius.remove_section_headers = True

# Load your data
# Assuming 'original_data.csv' is your dataset that needs lyrics fetching
if os.path.exists('lyrics_rishabh.csv'):
    df = pd.read_csv('lyrics_rishabh.csv')
else:
    df = pd.read_csv('original_data.csv')

# Define batch size
batch_size = 10
num_songs = len(df)

# Process in batches
for i in range(0, num_songs, batch_size):
    end_index = min(i + batch_size, num_songs)

    for j in range(i, end_index):
        # Fetch lyrics only if they are not already present
        if pd.isna(df.loc[j, 'lyrics']) or df.loc[j, 'lyrics'] in ['Lyrics not found', 'Error fetching lyrics']:
            try:
                song = genius.search_song(df.loc[j, 'track_name'], df.loc[j, 'artist_name'])
                df.loc[j, 'lyrics'] = song.lyrics.replace('\n', ' ') if song else 'Lyrics not found'
            except Exception as e:
                print(f"Error fetching lyrics for index {j}: {e}")
                df.loc[j, 'lyrics'] = 'Error fetching lyrics'

    # Save the DataFrame at regular intervals
    df.to_csv('lyrics_rishabh.csv', index=False)
    print(f"Progress saved up to index {end_index - 1}")

# Ensure the final save after completion
df.to_csv('lyrics_rishabh.csv', index=False)


In [None]:
for i in range(len(spotify_data)-1):
    song = genius.search_song(spotify_data['track_name'][i], spotify_data['artist_name'][i])
    spotify_data.loc[i, 'lyrics'] = song.lyrics if song else 'Lyrics not found'

In [None]:
# Function to fetch artist name and track name
def fetch_tracks_details(track_uris):
    try:
        tracks_info = sp.tracks(track_uris)
        results = []
        for track in tracks_info['tracks']:
            if track:  # Check if track details are successfully retrieved
                artist_name = track['artists'][0]['name'] if track['artists'] else 'Unknown'
                track_name = track['name'] if track else 'Unknown'
                results.append((artist_name, track_name))
            else:
                results.append(('Unknown', 'Unknown'))
        return results
    except Exception as e:
        print(f"Error processing batch: {str(e)}")
        return [('Unknown', 'Unknown')] * len(track_uris)

# Apply batch processing
batch_size = 50
artist_names = []
track_names = []

for i in range(0, len(df_lyrics['track_uri']), batch_size):
    batch_uris = df_lyrics['track_uri'][i:i + batch_size].tolist()
    batch_results = fetch_tracks_details(batch_uris)
    artist_names.extend([res[0] for res in batch_results])
    track_names.extend([res[1] for res in batch_results])

df_lyrics['alb_name'] = artist_names
df_lyrics['trk_name'] = track_names

# Save the updated DataFrame
df_lyrics.to_csv('df_lyrics_updated.csv', index=False)

#### Splitting File into multiple files of fixed row size

In [None]:
rows_per_file = 3000  # Number of rows per file

# Calculate how many files will be needed
num_files = len(df_lyrics) // rows_per_file + (1 if len(df_lyrics) % rows_per_file != 0 else 0)

# Split and save the DataFrame into multiple CSV files
for i in range(num_files):
    start_row = i * rows_per_file
    end_row = start_row + rows_per_file
    # Create a new CSV file for each chunk
    df_lyrics[start_row:end_row].to_csv(f'output_file_{i + 1}.csv', index=False)

#### Fetching Lyrics from Genius API

In [None]:
import dotenv
from dotenv import load_dotenv
import os
import lyricsgenius

load_dotenv()
token = os.getenv("genius_token")
genius = lyricsgenius.Genius(token)
genius.remove_section_headers = True # Remove section headers from lyrics when searching
genius.timeout = 60

In [None]:
# Fetching lyrics via API
for i in range(len(df_lyrics)-66233):
    df_lyrics.loc[i, 'lyrics'] = genius.search_song(df_lyrics['track_name'][i], df_lyrics['artist_name'][i])

In [None]:
for i in range(len(df_lyrics)-1):
    song = genius.search_song(df_lyrics['track_name'][i], df_lyrics['artist_name'][i])
    df_lyrics.loc[i, 'lyrics'] = song.lyrics if song else 'Lyrics not found'

In [None]:
df_lyrics['lyrics'][7]

In [None]:
print("131 ContributorsTranslationsEspañolFrançaisItalianoPortuguêsTeam Lyrics\nWait till you're announced\nWe've not yet lost all our graces\nThe hounds will stay in chains\nLook upon Your Greatness and she'll\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\nSend the call out, send the call out\n\nCall all the ladies out\nThey're in their finery\nA hundred jewels on throats\nA hundred jewels between teeth\nNow bring my boys in\nTheir skin in craters like the moon\nThe moon we love like a brother\nWhile he glows through the room\n\nDancin' around the lies we tell\nDancin' around big eyes, as well\nEven the comatose\nThey don't dance and tell\nYou might also like\nWe live in cities you'll never see on-screen\nNot very pretty, but we sure know how to run things\nLivin' in ruins of a palace within my dreams\nAnd you know, we're on each other's team\n\nI'm kind of over gettin' told to throw my hands up in the air\nSo there\n\nSo all the cups got broke\nShards beneath our feet\nBut it wasn't my fault\nAnd everyone's competing\nFor a love they won't receive\n'Cause what this palace wants is release\n\nWe live in cities you'll never see on-screen\nNot very pretty, but we sure know how to run things\nLivin' in ruins of a palace within my dreams\nAnd you know, we're on each other's team\n\nI'm kind of over gettin' told to throw my hands up in the air\nSo there\nI'm kind of older than I was when I reveled without a care\nSo there\nWe live in cities you'll never see on-screen\nNot very pretty, but we sure know how to run things\nLivin' in ruins of a palace within my dreams\nAnd you know, we're on each other's team\n\nWe're on each other's team\nAnd you know, we're on each other's team\nWe're on each other's team\nAnd you know, and you know, and you know330Embed")

In [None]:
df_lyrics

In [None]:
df_lyrics['lyrics'].isna().value_counts()

In [None]:
import lyricsgenius

# Initialize Genius API
genius = lyricsgenius.Genius('put_your_token_here')

genius.remove_section_headers = True

# Spotify Global Top 50 playlist ID
playlist_id = '37i9dQZEVXbMDoHDwVN2tF'  # Example ID, update as necessary

results = sp.playlist_tracks(playlist_id)
tracks_data = []

# Fetch track details from Spotify
for item in results['items']:
    track = item['track']
    track_info = {
        'Name': track['name'],
        'Artist': track['artists'][0]['name'] if track['artists'] else 'Unknown',
        'Album': track['album']['name'] if track['album'] else 'Unknown',
        'Release Date': track['album']['release_date'] if track['album'] else 'Unknown',
        'Popularity': track['popularity']
    }
    # Fetch lyrics using Genius
    song = genius.search_song(track_info['Name'], track_info['Artist'])
    track_info['Lyrics'] = song.lyrics if song else 'Lyrics not found'
    
    tracks_data.append(track_info)

In [None]:
# Initialize the lyrics column if it doesn't exist
if 'lyrics' not in df_lyrics.columns:
    df_lyrics['lyrics'] = None

# Defining batch size for progress tracking
batch_size = 100
num_batches = (len(df_lyrics) + batch_size - 1) // batch_size  # Calculate total number of batches

# Processing each song in df_lyrics
for i in range(len(df_lyrics)):
    if pd.isna(df_lyrics.at[i, 'lyrics']):  # Check if lyrics already fetched to avoid refetching
        try:
            song = genius.search_song(df_lyrics.at[i, 'track_name'], df_lyrics.at[i, 'artist_name'])
            df_lyrics.at[i, 'lyrics'] = song.lyrics if song else 'Lyrics not found'
        except Exception as e:
            print(f"Error fetching lyrics for row {i}: {e}")
            df_lyrics.at[i, 'lyrics'] = 'Error fetching lyrics'
    # Checking print progress
    if i % batch_size == 0 or i == len(df_lyrics) - 1:
        print(f"Processed {i+1}/{len(df_lyrics)} songs, Batch {i // batch_size + 1}/{num_batches}")

# Saving complete DataFrame with lyrics to a new file
df_lyrics.to_csv('df_final.csv', index=False)


In [None]:
# Save DataFrame to CSV file
df_lyrics.to_csv('spotify_lyrics.csv', index=False)

In [None]:
df_lyrics.head()

In [None]:
'''import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Initialize spotipy with Spotify API credentials
client_id = '5093ad0ad45f435ca7ce43b2afd406b9'  # Replace with your client ID
client_secret = 'f151962f414144aa85494be81ccd8e20'  # Replace with your client secret
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)'''

# Function to fetch artist name and track name
def fetch_tracks_details(track_uris):
    try:
        tracks_info = sp.tracks(track_uris)
        results = []
        for track in tracks_info['tracks']:
            if track:  # Check if track details are successfully retrieved
                artist_name = track['artists'][0]['name'] if track['artists'] else 'Unknown'
                track_name = track['name'] if track else 'Unknown'
                results.append((artist_name, track_name))
            else:
                results.append(('Unknown', 'Unknown'))
        return results
    except Exception as e:
        print(f"Error processing batch: {str(e)}")
        return [('Unknown', 'Unknown')] * len(track_uris)

# Apply batch processing
batch_size = 50
artist_names = []
track_names = []

for i in range(0, len(df_lyrics['track_uri']), batch_size):
    batch_uris = df_lyrics['track_uri'][i:i + batch_size].tolist()
    batch_results = fetch_tracks_details(batch_uris)
    artist_names.extend([res[0] for res in batch_results])
    track_names.extend([res[1] for res in batch_results])

df_lyrics['alb_name'] = artist_names
df_lyrics['trk_name'] = track_names

# Save the updated DataFrame
df_lyrics.to_csv('df_lyrics_updated.csv', index=False)

### Using Cosine Similarity for Recommendation

In [None]:
df_spotify_features = df_spotify.drop(columns=['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from chunkdot import CosineSimilarityTopK

numeric_features = ['playlist_pid', 'track_pos', 'duration_ms']
categorical_features = ['track_uri', 'artist_uri', 'album_uri']

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

pipe = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])

In [None]:
pipe

In [None]:
test = pipe.fit_transform(df_spotify_features)

In [None]:
type(test)
# Convert csr.matrix to Dataframe
#test_df = pd.DataFrame.sparse.from_spmatrix(test)
#df_spotify

In [None]:
# Build index with Track names
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get Track recommendations based on the cosine similarity 
def track_recommendations(track, num_recommendations=300):
    #get the index of the Track we put into the function
    idx = indices.get(track)
    #calculate all cosine similarities to that Track and store it in a list
    sim_scores = list(enumerate(test[idx].toarray().flatten()))
    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #determine the number of recommendations to retrieve
    available_recommendations = min(num_recommendations, len(sim_scores) - 1)
    #get the similarities from 1 up to 500 recommendations + 1(not starting with 0 because it is the same Track)
    sim_scores = sim_scores[1:available_recommendations + 1]
    #get the indices of that 10 Track
    track_indices = [i[0] for i in sim_scores]
    #retrieve remaining recommendations if not enough unique songs are found
    if len(track_indices) < num_recommendations:
        all_indices = set(range(test.shape[0]))
        used_indices = set(track_indices + [idx])
        remaining_indices = list(all_indices - used_indices)

        # Fill up the missing slots with any remaining indices
        track_indices.extend(remaining_indices[:num_recommendations - len(track_indices)])

    #ensuring indices are within bounds
    track_indices = [idx for idx in track_indices if idx < len(track_uri)]

    #adding extra recommendations if <500
    while len(track_indices)<num_recommendations:
        track_indices.extend(track_indices[:num_recommendations - len(track_indices)])

    unique_track_indices = list(dict.fromkeys(track_indices))

    #return the Track names of that 10 Track
    return track_uri.iloc[unique_track_indices[:num_recommendations]]

In [None]:
track_recommendations('spotify:track:66U0ASk1VHZsqIkpMjKX3B')

## Sentiment Classifier

In [1]:
# Importing libraries
import pandas as pd
import json
import numpy as np
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK

In [2]:
df = pd.read_csv('../data/lyrics.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['lyrics'])

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Apply the sentiment analysis function to the cleaned lyrics
df['sentiment_score'] = df['lyrics'].apply(get_sentiment)

In [5]:
def assign_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment label assignment
df['sentiment_label'] = df['sentiment_score'].apply(assign_sentiment)

In [6]:
df = df.drop(columns=['lyrics'])

In [7]:
df.to_csv('../data/sentiment_data.csv', index=False)

## Sentiment Analysis

In [2]:
# Load JSON data from file
with open('../data/challenge_set.json', 'r') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

In [3]:
df_spotify = df_spotify.drop(columns=['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])

In [4]:
df_sentiment = pd.read_csv('../data/sentiment_data.csv')

In [5]:
df_spotify = df_spotify.merge(df_sentiment, on='track_uri', how='left')

In [6]:
df_spotify

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score,sentiment_label
0,1000000,0,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,0.8633,positive
1,1000000,1,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,0.7938,positive
2,1000000,2,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,0.9830,positive
3,1000000,3,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,0.9978,positive
4,1000000,4,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,0.8695,positive
...,...,...,...,...,...,...,...,...
280995,1006767,0,spotify:track:38griAVM808crjbFp9gcPD,spotify:artist:6nnspeopmJAG07xOxHmqTu,spotify:album:2QeEEn8jNy5SFx9coIzS3Z,339573,-0.9905,negative
280996,1006771,0,spotify:track:1JClFT74TYSXlzpagbmj0S,spotify:artist:1ZwdS5xdxEREPySFridCfh,spotify:album:3PO9OtQdvCDJN8zDLtZiYd,285026,,
280997,1006773,0,spotify:track:4InLm5a9Qtkru6YxEjM4Qc,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,spotify:album:5NjFyeZJkYAh5ri9eh8ZSO,279322,0.9956,positive
280998,1006775,0,spotify:track:4hdog9vyyqG9pcppG2Izek,spotify:artist:2cFrymmkijnjDg9SS92EPM,spotify:album:1TkwzY3l4LqAfrQwBAx45Q,223295,-0.9581,negative


In [7]:
df_spotify.dropna(inplace=True)

In [8]:
df_spotify[df_spotify['sentiment_score'].isna()]

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score,sentiment_label


In [9]:
df_spotify.shape

(258678, 8)

In [8]:
#Drop the Label, keep sentiment score
df_spotify.drop(columns=['sentiment_label'])

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score
0,1000000,0,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,0.8633
1,1000000,1,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,0.7938
2,1000000,2,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,0.9830
3,1000000,3,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,0.9978
4,1000000,4,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,0.8695
...,...,...,...,...,...,...,...
280994,1006752,0,spotify:track:6FI3RJ58Ztl0X1VtA6pVs9,spotify:artist:09hVIj6vWgoCDtT03h8ZCa,spotify:album:4v5x3Oo3UjQ9YmF3hRAip5,208840,0.9996
280995,1006767,0,spotify:track:38griAVM808crjbFp9gcPD,spotify:artist:6nnspeopmJAG07xOxHmqTu,spotify:album:2QeEEn8jNy5SFx9coIzS3Z,339573,-0.9905
280997,1006773,0,spotify:track:4InLm5a9Qtkru6YxEjM4Qc,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,spotify:album:5NjFyeZJkYAh5ri9eh8ZSO,279322,0.9956
280998,1006775,0,spotify:track:4hdog9vyyqG9pcppG2Izek,spotify:artist:2cFrymmkijnjDg9SS92EPM,spotify:album:1TkwzY3l4LqAfrQwBAx45Q,223295,-0.9581


In [9]:
numeric_features = ['playlist_pid', 'track_pos', 'sentiment_score']
categorical_features = ['track_uri', 'artist_uri', 'album_uri']

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

cos_sim_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])

In [10]:
cos_sim_pipeline

In [11]:
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify)

In [12]:
# Convert csr.matrix to Dataframe
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)

In [13]:
# Build index with track uris
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    """#catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))"""

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    """#remove duplicates from recommendation list
    remove = []
    for tup in sim_scores:
        for index in dup_list:
            if tup[0] == index[1]:
                remove.append(tup) 
    print(sorted(remove, key=lambda x: x[1], reverse=True)) #Test

    sim_scores = [x for x in sim_scores if x not in remove] """

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    """# get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up """
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:100]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[1:4]
    return recommended_tracks.to_list()

In [14]:
track_list = ['spotify:track:0uppYCG86ajpV2hSR3dJJ0', 'spotify:track:5MhsZlmKJG6X5kTHkdwC4B']
recommended = []
for track in track_list:
    #print(track)
    x = track_recommendations(track)
    for i in x:
        recommended.append(i)

In [None]:
recommended

#### Using Joblib

In [15]:
import joblib

# Save the preprocessor pipeline
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')

# Save the similarity matrix
joblib.dump(sim_matrix_df, 'similarity_matrix.pkl')

# Save the track_uri and indices
df_spotify[['track_uri']].to_pickle('track_uri.pkl')
indices.to_pickle('indices.pkl')

#### Ignore

In [154]:
track_list = ['spotify:track:2UYJqglnOMTvRcqQLNcjjf', 'spotify:track:5MhsZlmKJG6X5kTHkdwC4B', 'spotify:track:38griAVM808crjbFp9gcPD']
recommended = []
for track in track_list:
    recommended.append(track_recommendations(track, 3))
    print(recommended)

[255      spotify:track:2J1t2b8xPBqZzj5znx0C7C
31278    spotify:track:4YmJGDRo6oGhObb7MaW5om
1300     spotify:track:4JLojdKkKNM3rgvP2zvUGR
Name: track_uri, dtype: object]
[255      spotify:track:2J1t2b8xPBqZzj5znx0C7C
31278    spotify:track:4YmJGDRo6oGhObb7MaW5om
1300     spotify:track:4JLojdKkKNM3rgvP2zvUGR
Name: track_uri, dtype: object, 16702    spotify:track:5MhsZlmKJG6X5kTHkdwC4B
Name: track_uri, dtype: object]


AttributeError: 'numpy.int64' object has no attribute 'iloc'

In [92]:
d1 = track_recommendations('spotify:track:2UYJqglnOMTvRcqQLNcjjf', 10)

In [93]:
d1_list = d1.to_list()

#### Incorrect Track Recommendations

In [145]:
def track_recommendations(tracks, top_n=10):
    # Initialize an empty list to store the track indices
    track_indices = []

    # Iterate over each track in the list
    for track in tracks:
        # Get the index of the track
        idx = indices.loc[track]

        # Check if the track index is present in the similarity matrix
        if idx in sim_matrix_df.index:
            # Calculate the cosine similarity scores for the track
            sim_scores = list(enumerate(sim_matrix_df.loc[idx]))

            # Sort the scores in descending order
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

            # Add the track indices to the list
            track_indices.extend([i[0] for i in sim_scores[1:top_n+1]])

    # Calculate the average cosine similarity for each track index
    avg_sim_scores = {}
    for idx in track_indices:
        avg_sim_scores[idx] = avg_sim_scores.get(idx, 0) + 1

    # Sort the track indices by their average similarity score
    sorted_indices = sorted(avg_sim_scores, key=avg_sim_scores.get, reverse=True)

    # Get the top N recommended tracks
    recommended_tracks = track_uri.iloc[sorted_indices[:top_n]]

    return recommended_tracks.tolist()

In [146]:
recommend = track_recommendations(['spotify:track:2UYJqglnOMTvRcqQLNcjjf', 'spotify:track:5MhsZlmKJG6X5kTHkdwC4B', 'spotify:track:38griAVM808crjbFp9gcPD'])
print(recommend)

TypeError: unhashable type: 'Series'

In [147]:
# New function to get recommendations for 3 songs

import numpy as np

def track_recommendations(track_uris, top_n=10):
    # Ensure input is in list format even if a single track URI is passed
    if isinstance(track_uris, str):
        track_uris = [track_uris]

    # Initialize sum of similarity scores
    sim_scores_sum = None

    # Loop through each track URI provided
    for track in track_uris:
        idx = indices[track]  # Directly get the index of the track
        if sim_scores_sum is None:
            sim_scores_sum = sim_matrix_df.iloc[idx].to_numpy(copy=True)  # Start with the first track's similarity scores
        else:
            sim_scores_sum += sim_matrix_df.iloc[idx].to_numpy()  # Add the similarity scores of subsequent tracks

    # Convert the accumulated numpy array back to pandas Series to use pandas' nlargest function
    avg_sim_scores = pd.Series(sim_scores_sum / len(track_uris), index=sim_matrix_df.columns)

    # Get the indices of the top N similar tracks
    top_indices = avg_sim_scores.nlargest(top_n + 1).index  # Get top N+1 indices to avoid self-recommendation

    # Get the recommended tracks, skipping any that are in the input list
    recommended_tracks = track_uri.iloc[top_indices].drop_duplicates().iloc[1:top_n+1]  # Skip the highest if it's an input track

    return recommended_tracks

In [148]:
track_list = ['spotify:track:2UYJqglnOMTvRcqQLNcjjf', 'spotify:track:5MhsZlmKJG6X5kTHkdwC4B', 'spotify:track:38griAVM808crjbFp9gcPD']
recommended = track_recommendations(track_list, 10)
print(recommended)

IndexError: positional indexers are out-of-bounds

#### Test Data

In [50]:
lorde_list = lorde.to_list()

In [51]:
df_lorde = lorde.to_frame()

In [None]:
df_lorde

In [24]:
df_lorde[df_lorde['track_uri'] == 'spotify:track:7yyRTcZmCiyzzJlNzGC9Ol']

"df_lorde[df_lorde['track_uri'] == 'spotify:track:7yyRTcZmCiyzzJlNzGC9Ol']"

### Create Playlist on Spotify

In [86]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import os

class CreatePlaylist:
    """
    A class for accessing your spotify account via the API using spotipy.
    Make sure to have a .env file prepared with your client id and client secret.
    Above you find an example use for using this class and the methods.

    Example usage:
    spotify_api = CreatePlaylist()
    my_playlist = spotify_api.create_playlist(name="TestAutomaticPlaylistGeneration", description="Automatically generated playlist")
    test_uri = ['spotify:track:7yyRTcZmCiyzzJlNzGC9Ol']
    spotify_api.add_tracks_to_playlist(my_playlist['id'], test_uri)
    """
    def __init__(self):
        load_dotenv()
        self.sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
            client_id=os.getenv("spotify_client_id"),
            client_secret=os.getenv("spotify_client_secret"),
            redirect_uri="http://localhost:8888/callback",
            scope='user-library-read playlist-modify-private',
            cache_path="token.txt"))

    def create_playlist(self, name, description, public=False):
        """ 
        A method from spotipy that creates an empty Spotify playlist for the account signed in via the API.
        It takes the playlist name and description and additional features (public and user_id)
        """
        results = self.sp.current_user()
        user_id = results['id']
        my_playlist = self.sp.user_playlist_create(user=f"{user_id}", name=name, public=public, description=description)
        return my_playlist
        

    def add_tracks_to_playlist(self, playlist_id, track_uris, position=None):
        """ 
        A method from spotipy that adds track to a Spotify playlist.
        Takes the playlist_id and a list of track URIs as arguments (and optionally the position of a track)
        """
        self.sp.playlist_add_items(playlist_id=playlist_id, items=track_uris, position=position)

In [94]:
spotify_api = CreatePlaylist()
my_playlist = spotify_api.create_playlist(name="TestAutomaticPlaylistGeneration", description="Automatically generated playlist")
test_uri = d1_list
spotify_api.add_tracks_to_playlist(my_playlist['id'], test_uri)


	from spotipy.oauth2 import CacheFileHandler
	handler = CacheFileHandler(cache_path=cache_path, username=username)
	sp = spotipy.SpotifyOAuth(client_id, client_secret, redirect_uri, cache_handler=handler)


### Evaluation Metrics - Not to be Used

In [123]:
# Creating Precision & Recall for Evaluation

def precision_at_k(actual, predicted, k):
    # Convert predicted to set and limit to top k items
    pred_set = set(predicted[:k])
    # Convert actual to set
    actual_set = set(actual)
    # Calculate intersection of predicted and actual
    relevant_and_recommended = pred_set.intersection(actual_set)
    # Return the proportion of relevant items in the top k recommendations
    return len(relevant_and_recommended) / float(k)

def recall_at_k(actual, predicted, k):
    # Convert predicted to set and limit to top k items
    pred_set = set(predicted[:k])
    # Convert actual to set
    actual_set = set(actual)
    # Calculate intersection of predicted and actual
    relevant_and_recommended = pred_set.intersection(actual_set)
    # Return the proportion of relevant items that are actually recommended
    return len(relevant_and_recommended) / float(len(actual_set))

In [143]:
# Identify playlists that contain test 'track_uri'
playlists_with_track = df_spotify[df_spotify['track_uri'] == 'spotify:track:0GZoB8h0kqXn7XFm4Sj06k']['playlist_pid'].unique()

# Collecting all tracks from these playlists
actual_tracks = df_spotify[df_spotify['playlist_pid'].isin(playlists_with_track)]['track_uri'].unique().tolist()

# Ensure that 'some_track_uri' is not in the actual tracks to avoid self-recommendation
actual_tracks.remove('spotify:track:0GZoB8h0kqXn7XFm4Sj06k')

recommended_tracks = track_recommendations('spotify:track:0GZoB8h0kqXn7XFm4Sj06k', 100)

In [144]:
# Calculate precision and recall at k=10
precision = precision_at_k(actual_tracks, recommended_tracks, 10)
recall = recall_at_k(actual_tracks, recommended_tracks, 10)

print(f"Precision: {precision}")
print(f"Recall: {recall}")

Precision: 0.6
Recall: 0.007255139056831923


### Testing model creation using Association Rules

In [126]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Gathering all tracks for each playlist, resulting in a list of playlists, with each playlist a list of tracks
playlists = df_spotify.groupby('playlist_pid')['track_uri'].apply(list).tolist()

# Initialize the transaction encoder to transform lists of tracks into a one-hot encoded format
te = TransactionEncoder()
te_ary = te.fit(playlists).transform(playlists)
df = pd.DataFrame(te_ary, columns=te.columns_)  # DataFrame to hold the binary attributes

# Generate frequent item sets that have a support of at least 50%
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)

# Generate association rules from the frequent item sets with a confidence threshold of 70%
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.001)

# Print the resulting rules with their antecedents, consequents, support, and confidence
print(rules[['antecedents', 'consequents', 'support', 'confidence']])


ValueError: The input DataFrame `df` containing the frequent itemsets is empty.

#### Apriori Result: FAILED

### Testing using KMeans

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assuming df_spotify is your DataFrame and it includes 'duration_ms' and 'sentiment_score'
features = df_spotify[['playlist_pid', 'sentiment_score']]#, 'track_uri', 'artist_uri', 'album_uri']]

# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Initialize KMeans with a guessed number of clusters (e.g., 5)
kmeans = KMeans(n_clusters=20, random_state=0)
df_spotify['cluster'] = kmeans.fit_predict(features_scaled)

# Check the count of tracks in each cluster
cluster_counts = df_spotify['cluster'].value_counts()
print(cluster_counts)

# Optional: Analyze characteristics of each cluster
for i in range(5):
    cluster_mean = df_spotify[df_spotify['cluster'] == i][['playlist_pid', 'sentiment_score']]#, 'track_uri', 'artist_uri', 'album_uri']].mean()
    print(f"Cluster {i} mean duration and sentiment score:", cluster_mean);


#### Approach 2: Creating a single feature for track, artist, album uri's

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Combining track, artist, and album URIs into a single 'feature' for each playlist
df_spotify['combined'] = df_spotify['track_uri'] + ' ' + df_spotify['artist_uri'] + ' ' + df_spotify['album_uri']

# Create matrix of combined
vectorizer = CountVectorizer(token_pattern=r'[^\s]+')
playlist_features = vectorizer.fit_transform(df_spotify.groupby('playlist_pid')['combined'].apply(' '.join))

# Adding the mean of sentiment score as a feature
sentiment_scores = df_spotify.groupby('playlist_pid')['sentiment_score'].mean().values
# Scaling the features
scaler = StandardScaler()
playlist_features_scaled = scaler.fit_transform(playlist_features.toarray())

# Adding sentiment score as a feature
import numpy as np
playlist_features_final = np.hstack([playlist_features_scaled, sentiment_scores.reshape(-1, 1)])

# Cluster playlists
kmeans = KMeans(n_clusters=5, random_state=0)
playlist_clusters = kmeans.fit_predict(playlist_features_final)

# Map clusters back to original data
df_spotify['playlist_cluster'] = df_spotify['playlist_pid'].map(dict(zip(df_spotify['playlist_pid'].unique(), playlist_clusters)))

# Check the distribution of playlists across clusters
print(df_spotify['playlist_cluster'].value_counts())



playlist_cluster
0    258488
4        98
2        90
3         1
1         1
Name: count, dtype: int64


## Model Import

In [16]:
# Build index with track identifiers
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track, top_n=200):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #remove duplicates from recommendation list
    """ remove = []
    for tup in sim_scores:
        for index in dup_list:
            if tup[0] == index[1]:
                remove.append(tup) 
    print(sorted(remove, key=lambda x: x[1], reverse=True)) #Test

    sim_scores = [x for x in sim_scores if x not in remove] """

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:top_n+1]

    #get the indices of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[:top_n]
    return recommended_tracks