In [1]:
import numpy as np
import pandas as pd

In [2]:
from typing import List, Dict

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
songs = pd.read_csv('songdata.csv')

In [12]:
songs.head()

Unnamed: 0,artist,song,text
0,Gipsy Kings,A Tu Vera,To your side To your side always the verita yo...
1,Roxette,Staring At The Ground,"I'm staring at the gorund, \nI'm bloodless, t..."
2,Dean Martin,Deep Purple,When the deep purple falls \nOver sleepy gard...
3,Roxette,Beautiful Things,Beautiful things \nAre comin' my way \nBeaut...
4,Lionel Richie,Through My Eyes,Oh yeah \nOh \nWow \n \nBaby we were so in...


In [6]:
songs = songs.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [7]:
songs['text'] = songs['text'].str.replace(r'\n', '')

In [8]:
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [9]:
lyrics_matrix = tfidf.fit_transform(songs['text'])

In [10]:
cosine_similarities = cosine_similarity(lyrics_matrix)

In [11]:
similarities = {}

In [13]:
for i in range(len(cosine_similarities)):
    similar_indices = cosine_similarities[i].argsort()[:-50:-1]
    similarities[songs['song'].iloc[i]] = [(cosine_similarities[i][x], songs['song'][x], songs['artist'][x]) for x in similar_indices][1:]

In [14]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)

        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} by {recom_song[i][2]} with {round(recom_song[i][0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation):
        # Get song to find recommendations for
        song = recommendation['song']
        # Get number of songs to recommend
        number_songs = recommendation['number_songs']
        recom_song = self.matrix_similar[song][:number_songs]
        self._print_message(song=song, recom_song=recom_song)

In [15]:
recommedations = ContentBasedRecommender(similarities)

In [16]:
recommendation = {
    "song": songs['song'].iloc[10],
    "number_songs": 4
}

In [17]:
recommedations.recommend(recommendation)

The 4 recommended songs for The Dance are:
Number 1:
Fancy Dancer by Bread with 0.421 similarity score
--------------------
Number 2:
Shadow Dancer by Ufo with 0.313 similarity score
--------------------
Number 3:
Mining Town by Whiskeytown with 0.21 similarity score
--------------------
Number 4:
What's Wrong With This Picture? by Squeeze with 0.193 similarity score
--------------------


In [18]:
recommendation2 = {
    "song": songs['song'].iloc[120],
    "number_songs": 4
}

In [19]:
recommedations.recommend(recommendation2)

The 4 recommended songs for Possibility Days are:
Number 1:
Looking For Something by Vonda Shepard with 0.297 similarity score
--------------------
Number 2:
What's The Good In Goodbye? by Natalie Imbruglia with 0.275 similarity score
--------------------
Number 3:
Never Say Goodbye by Yoko Ono with 0.256 similarity score
--------------------
Number 4:
Not A Day Goes By by Carly Simon with 0.246 similarity score
--------------------
