In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [57]:
def sklearn_cosine(x: np.array, y: np.array) -> int:
    """is a function that takes in two arguments, x and y, 
    and returns the cosine similarity between them as calculated by 
    the cosine_similarity function from the scikit-learn library.

    :param x: first vector
    :type x: np.array 
    :param y: second vector
    :type y: np.array
    :return: scalar similarity value 
    :rtype: int
    """
    return cosine_similarity([x], [y])


def get_similar(gold_song_vectorized_lyric: np.array, vectorized_lyrics_to_compare: list, top_n_simlar: int=3):
    """is a function that takes in a "gold song vectorized lyric", a list of "vectorized lyrics to compare", 
    and an optional argument top_n_similar (defaults to 3), and returns the top top_n_similar cosine similarities 
    between the "gold song vectorized lyric" and the "vectorized lyrics to compare". 
    The function calculates cosine similarities between the "gold song vectorized lyric" 
    and each "vectorized lyric to compare" using the sklearn_cosine function, sorts the cosine 
    similarities in decreasing order, and returns the top top_n_similar similarities.

    :param vectorized_lyric: lyric to find similar songs for 
    :type vectorized_lyric: np.array
    :param vectorized_lyrics_to_compare: lyrics of songs to compare to vectorized_lyric
    :type vectorized_lyrics_to_compare: list
    :param top_n_simlar: number of most similar songs to return
    :type top_n_simlar: int
    :return: _description_
    :rtype: _type_
    """
    cosine_similarity_scores = []

    # calculate similarity score for each passed vectorized lyrics with gold_song_vectorized_lyric
    for vectorized_lyric in vectorized_lyrics_to_compare:
        # append similarity score to list of all scores
        cosine_similarity_scores.append(sklearn_cosine(gold_song_vectorized_lyric, vectorized_lyric)[0][0])
    cosine_similarity_scores = np.array(cosine_similarity_scores)

    # get indexes of top n values
    indexes_top_n = np.argsort(cosine_similarity_scores)[::-1][:top_n_simlar]

    # TODO: get top n song names instead of values
    top_n_values = cosine_similarity_scores[indexes_top_n]
    
    return top_n_values


In [59]:
x = np.random.rand(100)
y = np.random.rand(100)
z = np.random.rand(100)
a = np.random.rand(100)
b = np.random.rand(100)

get_similar(x, [y, z, a, b])

array([0.74225128, 0.73378324])