In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-of-25k-songs-ordered/69.csv
/kaggle/input/selected-users/selected_users.json


In [None]:
import itertools
import json

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from tqdm import tqdm

# Song feature table read by the similarity functions below.
file_path = '/kaggle/input/data-of-25k-songs-ordered/69.csv'
data = pd.read_csv(file_path)

# User playlists: the evaluation loop reads 'UserID' and 'SongIDs' from each entry.
json_file_path = '/kaggle/input/selected-users/selected_users.json'
with open(json_file_path, 'r') as f:
    user_data = json.load(f)

# Observed popularity bounds; used to normalise popularity differences.
track_popularity = data['track_popularity']
min_value, max_value = track_popularity.min(), track_popularity.max()

In [None]:
# Define the similarity function using these extracted values
def absolute_difference_similarity(value1, value2, min_val, max_val):
    """Turn the absolute gap between two values into a similarity score.

    The gap is divided by the width of the range ``max_val - min_val`` and
    subtracted from 1, so identical values score 1 and values that sit at
    opposite ends of the range score 0.

    Args:
        value1: First value.
        value2: Second value.
        min_val: Lower bound of the range used for normalisation.
        max_val: Upper bound of the range used for normalisation.

    Returns:
        ``1 - |value1 - value2| / (max_val - min_val)``. When the range has
        zero width, returns 1.0 if the two values are equal and 0.0 otherwise.
    """
    difference = abs(value1 - value2)
    max_possible_difference = max_val - min_val

    if max_possible_difference == 0:
        # Degenerate range (e.g. a constant column): dividing would raise
        # ZeroDivisionError for Python numbers or yield NaN/inf for NumPy ones.
        return 1.0 if difference == 0 else 0.0

    return 1 - (difference / max_possible_difference)


def _blended_vector_similarity(vector1, vector2, alpha):
    """Blend cosine similarity with a distance-based similarity.

    Shared implementation behind ``sentiment_similarity_combined`` and
    ``combined_audio_similarity``, which previously duplicated this code.

    Args:
        vector1: 1-D NumPy array of features for the first item.
        vector2: 1-D NumPy array of features for the second item.
        alpha: Weight of the cosine term; ``1 - alpha`` weights the
            Euclidean term.

    Returns:
        ``alpha * cos_sim + (1 - alpha) * euc_sim`` where
        ``euc_sim = 1 - euclidean_distance / sqrt(len(vector1))``.

    NOTE(review): dividing by sqrt(n_features) presumably assumes every
    feature lies in [0, 1]; with unscaled features ``euc_sim`` can become
    negative. Confirm how the input columns are scaled.
    """
    # scikit-learn's pairwise functions expect 2-D input; reshape once and reuse.
    row1 = vector1.reshape(1, -1)
    row2 = vector2.reshape(1, -1)

    cos_sim = cosine_similarity(row1, row2)[0][0]
    euc_dist = euclidean_distances(row1, row2)[0][0]
    max_dist = np.sqrt(len(vector1))
    euc_sim = 1 - (euc_dist / max_dist)
    return alpha * cos_sim + (1 - alpha) * euc_sim


def sentiment_similarity_combined(vector1, vector2, alpha_sentiment=0.5):
    """Similarity of two sentiment vectors (cosine/Euclidean blend).

    Args:
        vector1: 1-D NumPy array of sentiment scores for the first song.
        vector2: 1-D NumPy array of sentiment scores for the second song.
        alpha_sentiment: Weight of the cosine term (default 0.5).

    Returns:
        The blended similarity score.
    """
    return _blended_vector_similarity(vector1, vector2, alpha_sentiment)


def lyrics_embedding_similarity(vector1, vector2):
    """Return the cosine similarity between two 1-D embedding vectors."""
    return cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0][0]


def combined_audio_similarity(vector1, vector2, alpha_audio=0.5):
    """Similarity of two audio-feature vectors (cosine/Euclidean blend).

    Args:
        vector1: 1-D NumPy array of audio features for the first song.
        vector2: 1-D NumPy array of audio features for the second song.
        alpha_audio: Weight of the cosine term (default 0.5).

    Returns:
        The blended similarity score.
    """
    return _blended_vector_similarity(vector1, vector2, alpha_audio)

def lyrics_language_similarity(lang1, lang2):
    """Binary language match: 1 when both values compare equal, else 0."""
    if lang1 == lang2:
        return 1
    return 0

In [None]:
# Column groups read by calculate_similarity. Built once here instead of on
# every call (the embedding list alone is 768 formatted strings).
AUDIO_FEATURE_COLUMNS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
SENTIMENT_FEATURE_COLUMNS = ['lyrics_joy', 'lyrics_sadness', 'lyrics_anger', 'lyrics_fear', 'lyrics_surprise', 'lyrics_disgust']
LYRICS_EMBEDDING_COLUMNS = [f'lyrics_lyric_embed_{i}' for i in range(768)]


def calculate_similarity(data, index1, index2, weight_popularity=0.2, weight_audio=0.4, weight_language=0.1, weight_sentiment=0.2, weight_lyrics_embedding=0.1, alpha_audio=0.5, alpha_sentiment=0.5, popularity_min=None, popularity_max=None):
    """Compute the weighted similarity between two rows of ``data``.

    Five component similarities are computed (popularity, audio features,
    lyrics language, lyrics sentiment, lyrics embedding) and combined as a
    weighted sum.

    Args:
        data: DataFrame holding the popularity, audio, language, sentiment
            and embedding columns.
        index1: Positional (``iloc``) index of the first song.
        index2: Positional (``iloc``) index of the second song.
        weight_popularity: Weight of the popularity component.
        weight_audio: Weight of the audio-feature component.
        weight_language: Weight of the language-match component.
        weight_sentiment: Weight of the sentiment component.
        weight_lyrics_embedding: Weight of the lyrics-embedding component.
        alpha_audio: Cosine/Euclidean blend factor for the audio component.
        alpha_sentiment: Cosine/Euclidean blend factor for the sentiment
            component.
        popularity_min: Lower bound for popularity normalisation. When
            ``None`` the module-level ``min_value`` is used.
        popularity_max: Upper bound for popularity normalisation. When
            ``None`` the module-level ``max_value`` is used.

    Returns:
        Tuple ``(overall_similarity, popularity_sim, audio_sim, language_sim,
        sentiment_sim, lyrics_embedding_sim)``.
    """
    # Fall back to the notebook-level bounds so existing callers keep working.
    if popularity_min is None:
        popularity_min = min_value
    if popularity_max is None:
        popularity_max = max_value

    song1 = data.iloc[index1]
    song2 = data.iloc[index2]

    popularity_sim = absolute_difference_similarity(
        song1['track_popularity'], song2['track_popularity'],
        min_val=popularity_min, max_val=popularity_max,
    )

    audio_sim = combined_audio_similarity(
        song1[AUDIO_FEATURE_COLUMNS].values.astype(float),
        song2[AUDIO_FEATURE_COLUMNS].values.astype(float),
        alpha_audio,
    )

    language_sim = lyrics_language_similarity(song1['lyrics_language'], song2['lyrics_language'])

    sentiment_sim = sentiment_similarity_combined(
        song1[SENTIMENT_FEATURE_COLUMNS].values.astype(float),
        song2[SENTIMENT_FEATURE_COLUMNS].values.astype(float),
        alpha_sentiment,
    )

    lyrics_embedding_sim = lyrics_embedding_similarity(
        song1[LYRICS_EMBEDDING_COLUMNS].values.astype(float),
        song2[LYRICS_EMBEDDING_COLUMNS].values.astype(float),
    )

    overall_similarity = (weight_popularity * popularity_sim) + (weight_audio * audio_sim) + (weight_language * language_sim) + (weight_sentiment * sentiment_sim) + (weight_lyrics_embedding * lyrics_embedding_sim)
    return overall_similarity, popularity_sim, audio_sim, language_sim, sentiment_sim, lyrics_embedding_sim

In [None]:
def evaluate_playlist(playlist, data, threshold, weight_popularity, weight_audio, weight_language, weight_sentiment, weight_lyrics_embedding, alpha_audio, alpha_sentiment):
    """Score how similar a playlist's last song is to the songs before it.

    The last song ID in ``playlist`` is compared against every earlier song
    ID that can be found in ``data``; the overall similarities are averaged
    and compared with ``threshold``.

    Args:
        playlist: Sequence of song IDs; the last entry is the song being
            evaluated.
        data: DataFrame with a ``song_id`` column plus the feature columns
            read by ``calculate_similarity``.
        threshold: Minimum average similarity for the playlist to count as
            accurate.
        weight_popularity, weight_audio, weight_language, weight_sentiment,
        weight_lyrics_embedding, alpha_audio, alpha_sentiment: Forwarded
            unchanged to ``calculate_similarity``.

    Returns:
        ``(avg_similarity, accuracy)`` where ``accuracy`` is 1 when
        ``avg_similarity >= threshold`` and 0 otherwise, or ``(None, None)``
        when the playlist is empty, its last song is not in ``data``, or none
        of the earlier songs are in ``data``.
    """
    # An empty playlist has no "last song"; indexing it would raise IndexError.
    if playlist is None or len(playlist) == 0:
        return None, None

    song_id_column = data['song_id']

    def _row_position(song_id):
        # calculate_similarity addresses rows with .iloc, so return the
        # *position* of the first match rather than its index label; the two
        # only coincide when data has a default RangeIndex.
        hits = np.flatnonzero((song_id_column == song_id).to_numpy())
        return int(hits[0]) if hits.size else None

    last_song_position = _row_position(playlist[-1])
    if last_song_position is None:
        print(f"Skipping playlist because the last song with ID {playlist[-1]} was not found in the data.")
        return None, None  # Skip this playlist if the last song is not found

    similarities = []
    for song_id in playlist[:-1]:
        song_position = _row_position(song_id)
        if song_position is None:
            continue  # Skip this song if it is not found
        overall_similarity = calculate_similarity(
            data, last_song_position, song_position, weight_popularity, weight_audio, weight_language, weight_sentiment, weight_lyrics_embedding, alpha_audio, alpha_sentiment
        )[0]
        similarities.append(overall_similarity)

    if not similarities:
        return None, None  # Return None if no valid comparisons were made

    avg_similarity = np.mean(similarities)
    accuracy = 1 if avg_similarity >= threshold else 0

    return avg_similarity, accuracy



## Define hyperparameter ranges
weight_popularity_values = [0.080]
weight_lyrics_embedding_values = [0.005]
weight_audio_values = [0.005]
weight_sentiment_values = [0.90]
weight_language_values = [0.010]

alpha_audio_values = [1]
alpha_sentiment_values = [0]

# Average similarity a playlist must reach to count as accurate.
SIMILARITY_THRESHOLD = 0.7

RESULT_COLUMNS = [
    'weight_popularity', 'weight_audio', 'weight_language',
    'weight_sentiment', 'weight_lyrics_embedding',
    'alpha_audio', 'alpha_sentiment',
    'final_avg_similarity_score', 'final_overall_accuracy'
]

# Enumerate every combination whose five weights sum to 1. The sum is compared
# with a tolerance because exact float equality rejects valid grids
# (e.g. 0.1 + 0.2 + 0.3 + 0.3 + 0.1 != 1 in binary floating point).
# itertools.product iterates in the same order as the equivalent nested loops.
valid_combinations = [
    (weight_popularity, weight_lyrics_embedding, weight_audio, weight_sentiment, weight_language, alpha_audio, alpha_sentiment)
    for weight_popularity, weight_lyrics_embedding, weight_audio, weight_sentiment, weight_language in itertools.product(
        weight_popularity_values, weight_lyrics_embedding_values, weight_audio_values,
        weight_sentiment_values, weight_language_values,
    )
    if np.isclose(weight_popularity + weight_lyrics_embedding + weight_audio + weight_sentiment + weight_language, 1.0)
    for alpha_audio, alpha_sentiment in itertools.product(alpha_audio_values, alpha_sentiment_values)
]

# Calculate the total number of possible combinations
total_combinations = len(valid_combinations)

print(f"Total number of valid hyperparameter combinations: {total_combinations}")


# Start with an empty frame so the columns exist even if nothing is evaluated
final_results_df = pd.DataFrame(columns=RESULT_COLUMNS)

# One summary record per evaluated combination; the frame is rebuilt from this
# list rather than grown with pd.concat, which warns on empty frames.
results = []

# Analyze each user's playlist for each combination of hyperparameters
combination_counter = 0
for combination_counter, (weight_popularity, weight_lyrics_embedding, weight_audio, weight_sentiment, weight_language, alpha_audio, alpha_sentiment) in enumerate(valid_combinations, start=1):
    temp_results = []
    print(f"Combination {combination_counter}/{total_combinations}: "
          f"weight_popularity={weight_popularity}, weight_audio={weight_audio}, "
          f"weight_language={weight_language}, weight_sentiment={weight_sentiment}, "
          f"weight_lyrics_embedding={weight_lyrics_embedding}, alpha_audio={alpha_audio}, "
          f"alpha_sentiment={alpha_sentiment}")
    for user in tqdm(user_data, desc=f"Processing playlists for combination {combination_counter}/{total_combinations}", leave=False):
        user_id = user['UserID']
        playlist = user['SongIDs']

        avg_similarity, accuracy = evaluate_playlist(
            playlist, data, SIMILARITY_THRESHOLD, weight_popularity, weight_audio, weight_language,
            weight_sentiment, weight_lyrics_embedding, alpha_audio, alpha_sentiment
        )
        if avg_similarity is not None and accuracy is not None:
            temp_results.append({
                'UserID': user_id,
                'AverageSimilarityScore': avg_similarity,
                'Accuracy': accuracy
            })

    if not temp_results:
        continue  # No playlist could be scored for this combination

    temp_df = pd.DataFrame(temp_results)
    final_avg_similarity_score = temp_df['AverageSimilarityScore'].mean()
    final_overall_accuracy = temp_df['Accuracy'].sum() / len(temp_df)

    results.append({
        'weight_popularity': weight_popularity,
        'weight_audio': weight_audio,
        'weight_language': weight_language,
        'weight_sentiment': weight_sentiment,
        'weight_lyrics_embedding': weight_lyrics_embedding,
        'alpha_audio': alpha_audio,
        'alpha_sentiment': alpha_sentiment,
        'final_avg_similarity_score': final_avg_similarity_score,
        'final_overall_accuracy': final_overall_accuracy
    })
    final_results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

    # Save after each combination so progress is not lost if the run is interrupted
    final_results_df.to_csv('intermediate_results69.csv', index=False)

# Print the final results
print(final_results_df)

Total number of valid hyperparameter combinations: 1
Combination 1/1: weight_popularity=0.08, weight_audio=0.005, weight_language=0.01, weight_sentiment=0.9, weight_lyrics_embedding=0.005, alpha_audio=1, alpha_sentiment=0


                                                                                              

   weight_popularity  weight_audio  weight_language  weight_sentiment  \
0               0.08         0.005             0.01               0.9   

   weight_lyrics_embedding alpha_audio alpha_sentiment  \
0                    0.005           1               0   

   final_avg_similarity_score  final_overall_accuracy  
0                    0.736603                0.681538  


  final_results_df = pd.concat([final_results_df, current_result_df], ignore_index=True)


Given three songs handpicked by us, the following function recommends the song with the highest custom similarity score (as defined above).

In [None]:
def find_most_similar_song(data, index1, index2, index3, weight_popularity=0.025, weight_audio=0.005, weight_language=0.015, weight_sentiment=0.95, weight_lyrics_embedding=0.005, alpha_audio=1, alpha_sentiment=0):
    """Recommend the song most similar, on average, to three seed songs.

    Every other row of ``data`` is scored by the mean of its overall
    similarity to each seed. The best-scoring row whose title differs from
    all three seed titles is returned.

    Args:
        data: DataFrame with ``song_title``, ``song_id``, ``artist_name`` and
            the feature columns read by ``calculate_similarity``.
        index1, index2, index3: Positional (``iloc``) indices of the seeds.
        weight_popularity, weight_audio, weight_language, weight_sentiment,
        weight_lyrics_embedding, alpha_audio, alpha_sentiment: Forwarded
            unchanged to ``calculate_similarity``.

    Returns:
        ``(index, song_id, song_title, artist_name, seed_title1, seed_title2,
        seed_title3)``; the first four entries are ``None`` when no candidate
        has a title different from the seeds.
    """
    seed_indices = (index1, index2, index3)

    # Titles of the seeds, used later to reject same-named candidates.
    song_name1, song_name2, song_name3 = (
        data.iloc[seed]['song_title'] for seed in seed_indices
    )
    seed_titles = [song_name1, song_name2, song_name3]

    scored = []
    for row in tqdm(range(len(data)), desc="Calculating similarities"):
        if row in seed_indices:
            continue  # A seed is never its own recommendation

        per_seed = [
            calculate_similarity(data, row, seed, weight_popularity, weight_audio, weight_language, weight_sentiment, weight_lyrics_embedding, alpha_audio, alpha_sentiment)[0]
            for seed in seed_indices
        ]
        scored.append((row, (per_seed[0] + per_seed[1] + per_seed[2]) / 3))

    # Best candidates first; the sort is stable, so ties keep row order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    for candidate_index, _score in ranked:
        candidate = data.iloc[candidate_index]
        candidate_title = candidate['song_title']
        if candidate_title in seed_titles:
            continue  # Same title as a seed: skip it
        return (candidate_index, candidate['song_id'], candidate_title, candidate['artist_name'],
                song_name1, song_name2, song_name3)

    # Every candidate shared a title with a seed (or there were none).
    return None, None, None, None, song_name1, song_name2, song_name3

# Example usage: three seed songs given as positional row indices into `data`
index1, index2, index3 = 12456, 16984, 15879

(
    most_similar_index,
    most_similar_song_id,
    most_similar_song_name,
    most_similar_artist_name,
    song_name1,
    song_name2,
    song_name3,
) = find_most_similar_song(data, index1, index2, index3)

# The first four fields are None when no differently-titled song was found.
if most_similar_index is None:
    print(f"No suitable match found for the songs '{song_name1}', '{song_name2}', and '{song_name3}'.")
else:
    print(f"The most similar song to the songs '{song_name1}', '{song_name2}', and '{song_name3}' is:\n")
    print(f"Index: {most_similar_index}\nSong ID: {most_similar_song_id}\nSong Name: {most_similar_song_name}\nArtist Name: {most_similar_artist_name}")