### Music Recommendation System - Youtube Data

#### Import Libraries

In [1]:
#Import required libraries
import numpy as np
import pandas as pd
import sklearn
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Load the Dataset

In [2]:
#Load rating dataset
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRO7Ioi4w6UHtRcu_wcyiAz6AzPUvllvbbASoLbV5QvpdZZqkhXAINRHdZqEoIoDHGFKo_dwHAGKtCi/pub?gid=1176714543&single=true&output=csv"
ratings = pd.read_csv(url)
print(ratings.head())

   userId  musicId  rating  likes
0      35        1     5.0      3
1      43        1     5.0      3
2      57        1     5.0      3
3      72        1     5.0      3
4     132        1     5.0      3


In [3]:
#Load music dataset
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRO7Ioi4w6UHtRcu_wcyiAz6AzPUvllvbbASoLbV5QvpdZZqkhXAINRHdZqEoIoDHGFKo_dwHAGKtCi/pub?gid=0&single=true&output=csv"
music = pd.read_csv(url)
print(music.head())

   musicId                                              title  \
0        1                         Despacito ft. Daddy Yankee   
1        2  See You Again ft. Charlie Puth [Official Video...   
2        3                Shape of You (Official Music Video)   
3        4                                      GANGNAM STYLE   
4        5        Uptown Funk (Official Video) ft. Bruno Mars   

                                           genre       artist          views  \
0                                          Latin   Luis Fonsi  8,695,428,918   
1                                            Rap  Wiz Khalifa  6,633,646,261   
2  Pop|Folk Pop|Dance-Pop|Folk|Singer-Songwriter   Ed Sheeran  6,443,757,340   
3                            Hiphop|Kpop|Pop Rap          Psy  5,542,341,741   
4                      Rock|Pop|Alternative Rock  Mark Ronson  5,536,545,460   

   year  
0  2017  
1  2015  
2  2017  
3  2012  
4  2014  


#### Statistical Checks

In [4]:
#Brief Stat Check
n_ratings = len(ratings) #number of ratings
n_songs = len(ratings['musicId'].unique()) #number of songs
n_users = len(ratings['userId'].unique()) #number of unique users

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique musicId's: {n_songs}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per song: {round(n_ratings/n_songs, 2)}")

Number of ratings: 51472
Number of unique musicId's: 4830
Number of unique users: 325
Average ratings per user: 158.38
Average ratings per song: 10.66


In [5]:
#User Rating Frequency
user_freq = ratings[['userId', 'musicId']].groupby(
    'userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
print(user_freq.head())

   userId  n_ratings
0       1        160
1       2        161
2       3        161
3       4        163
4       5        155


In [6]:
#Song Rating Stat

#Find Lowest and Highest rated songs:
mean_rating = ratings.groupby('musicId')[['rating']].mean()

#Lowest rated songs
lowest_rated = mean_rating['rating'].idxmin()
music.loc[music['musicId'] == lowest_rated]

#Highest rated songs
highest_rated = mean_rating['rating'].idxmax()
music.loc[music['musicId'] == highest_rated]

#Show information about the highest rated song
print(ratings[ratings['musicId']==highest_rated])
print("---------------------------------------")

#Show information about the lowest rated song
print(ratings[ratings['musicId']==lowest_rated])

   userId  musicId  rating  likes
0      35        1     5.0      3
1      43        1     5.0      3
2      57        1     5.0      3
3      72        1     5.0      3
4     132        1     5.0      3
5     154        1     5.0      0
6     155        1     5.0      0
7     164        1     5.0      0
8     228        1     5.0      0
---------------------------------------
       userId  musicId  rating  likes
42427      70     3963     1.0      0
42428      83     3963     1.0      0


#### Music Genre Preprocessing

In [7]:
#Preprocess Genres
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres into lists (e.g., ["Pop", "Rock", ...])
music['genres'] = music['genre'].str.split('|')

#One-hot encode genres
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(music['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=music['musicId'])

#### Create CSR Matrix For Ratings & Likes

In [8]:
#Create user-item matrix using scipy csr matrix
def create_matrix(df):
    """
    Creates a sparse matrix where rows = music, columns = users, values = ratings.
    """

    N = len(df['userId'].unique())
    M = len(df['musicId'].unique())

    #Map Ids to indices
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    music_mapper = dict(zip(np.unique(df["musicId"]), list(range(M))))

    #Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    music_inv_mapper = dict(zip(list(range(M)), np.unique(df["musicId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    music_index = [music_mapper[i] for i in df['musicId']]

    X = csr_matrix((df["rating"], (music_index, user_index)), shape=(M, N))

    return X, user_mapper, music_mapper, user_inv_mapper, music_inv_mapper

X, user_mapper, music_mapper, user_inv_mapper, music_inv_mapper = create_matrix(ratings)

In [9]:
#Create X_likes (Sparse Matrix for Likes)
def create_likes_matrix(df):
    """
    Creates a sparse matrix where rows = music, columns = users, values = likes.
    """
    N = len(df['userId'].unique())
    M = len(df['musicId'].unique())
    
    #Map Ids to indices
    user_mapper_likes = dict(zip(np.unique(df["userId"]), list(range(N))))
    music_mapper_likes = dict(zip(np.unique(df["musicId"]), list(range(M))))

    
    user_index_likes = [user_mapper_likes[i] for i in df['userId']]
    music_index_likes = [music_mapper_likes[i] for i in df['musicId']]
    
    X_likes = csr_matrix((df["likes"], (music_index_likes, user_index_likes)), shape=(M, N))
    return X_likes

X_likes = create_likes_matrix(ratings)

#### Create Hybrid Similarity Metric Function

In [10]:
#Hybrid Similarity Metric Function
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_similarity(music_id, X, X_likes, genres_df, rating_weight=0.5, likes_weight=0.3, genre_weight=0.2):
    """
    Computes hybrid similarity using ratings, likes, and genres.
    Returns: Combined similarity scores for all music relative to `music_id`.
    """
    #Get the index of the target song
    music_ind = music_mapper[music_id]
    
    #Rating-based similarity
    rating_sim = cosine_similarity(X[music_ind], X).flatten()
    
    #Likes-based similarity (assuming X_likes is a sparse matrix of likes)
    likes_sim = cosine_similarity(X_likes[music_ind], X_likes).flatten()
    
    #Genre-based similarity
    genre_sim = cosine_similarity(genres_df.loc[music_id].values.reshape(1, -1), genres_df).flatten()
    
    #Combine similarities with weights
    combined_sim = (
        (rating_weight * rating_sim) + 
        (likes_weight * likes_sim) + 
        (genre_weight * genre_sim)
    )
    
    return combined_sim

#### Find Similar Songs

In [11]:
#Find similar music with hybrid similarity model

def find_similar_music_hybrid(music_id, X, X_likes, genres_df, k=10, rating_weight=0.5, likes_weight=0.3, genre_weight=0.2):
    """
    Finds similar music using hybrid similarity (rating + likes + genre).
    Returns: List of similar music IDs.
    """
    #Compute hybrid similarity scores
    sim_scores = hybrid_similarity(
        music_id, 
        X, 
        X_likes, 
        genres_df, 
        rating_weight, 
        likes_weight, 
        genre_weight
    )
    
    #Get top-k most similar music (excluding itself)
    similar_music_indices = np.argsort(sim_scores)[-k-1:-1][::-1]  #Skip the highest (self)
    similar_music_ids = [music_inv_mapper[idx] for idx in similar_music_indices]
    
    return similar_music_ids

music_titles = dict(zip(music['musicId'], music['title']))

music_id = 12 #5 #595  #Sample music ID

#Get similar music IDs using hybrid similarity
similar_ids = find_similar_music_hybrid(
        music_id, 
        X,  #Sparse matrix of ratings
        X_likes,    #Sparse matrix of likes
        genres_df,   #One-hot encoded genres DataFrame
        k=10,        #Number of recommendations
        rating_weight=0.5, 
        likes_weight=0.3, 
        genre_weight=0.2
)

#Get the title and genre of the input music
music_title = music_titles.get(music_id, "Unknown Song")
music_genre = music.loc[music['musicId'] == music_id, 'genres'].iloc[0]

#Print the results with genres
print(f"Since you listened to {music_title} ({music_genre}), here are similar songs:")
for song_id in similar_ids:
    song_title = music_titles.get(song_id, "Unknown Song")
    song_genre = music.loc[music['musicId'] == song_id, 'genres'].iloc[0]
    print(f"- {song_title} ({song_genre})")

Since you listened to Sorry (PURPOSE : The Movement) (['Pop', 'Teen Pop']), here are similar songs:
- Speed Of Sound (Official Video) (['Rock', 'Pop', 'Alternative Rock'])
- NETFLIXXX (Video Oficial) (['Pop', 'Hip Hop', 'Spanish'])
- Daddy Says No (['Pop', 'Australian', 'Singer-Songwriter'])
- Earned It (from Fifty Shades Of Grey) (Official Lyric Video) (['Pop', 'Synth-Pop', 'Rnb'])
- I'm Good (Blue) [Official Music Video] (['Pop', 'The Sims'])
- Confident ft. Chance The Rapper (['Pop', 'Teen Pop'])
- Prayer In C (Robin Schulz Remix) (Official) (['Pop', 'Alternative Rock', 'Folk'])
- Intentions (Official Video (Short Version)) ft. Quavo (['Pop', 'Teen Pop'])
- Baby ft. Ludacris (['Pop', 'Teen Pop'])
- Permission To Dance (['Pop', 'Korean', 'K-Pop'])


In [12]:
#Find similar music using KNN model

def find_similar_music(music_id, X, k, metric='cosine', show_distance=False):

    neighbour_ids = []

    music_ind = music_mapper[music_id]
    music_vec = X[music_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    music_vec = music_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(music_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(music_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids


music_titles = dict(zip(music['musicId'], music['title']))

music_id = 5 #595

similar_ids = find_similar_music(music_id, X, k=10)
music_title = music_titles[music_id]

print(f"Since you listened to {music_title} ({music.loc[music['musicId'] == music_id, 'genres'].iloc[0]}), here is a list of similar songs:")
for i in similar_ids:
    print(f"- {music_titles[i]} ({music.loc[music['musicId'] == i, 'genres'].iloc[0]})")

Since you listened to Uptown Funk (Official Video) ft. Bruno Mars (['Rock', 'Pop', 'Alternative Rock']), here is a list of similar songs:
- MEJOR ME ALEJO (VIDEO OFICIAL) (['Banda', 'Pop', 'Regional Mexican', 'Corrido'])
- Sin Querer (Video Oficial) (['Pop', 'Hip Hop', 'Dance-Pop'])
- Tonight (I'm Lovin' You) ft. Ludacris, DJ Frank E (['Latin Pop', 'Latin'])
- Blood Sweat & Tears (['Pop', 'Korean', 'K-Pop'])
- Hey Ma ft Camila Cabello (Spanish Version | The Fate of the Furious: The Album) (['Pop', 'Hip Hop', 'Spanish'])
- Ll√É¬©vame Contigo (Live from Madison Square Garden) (['Bachata', 'Bachata Pop', 'Latin'])
- Baby I (Official Video) (['Pop', 'R&B', 'Dance-Pop', 'Contemporary R&B', 'Trap Soul'])
- Mine (['Pop', 'Singer-Songwriter', 'Country'])
- Try (The Truth About Love - Live From Los Angeles) (['pop, pop rock, dance-pop, contemporary r&b, electropop'])
- Paradinha [Official Music Video] (['Brazilian Pop', 'Funk Pop', 'Funk Carioca', 'Funk', 'Pagode Baiano'])


#### Recommend New Songs

In [13]:
#Create function to recommend songs based on the users preference using hybrid model

def recommend_music_for_user_hybrid(user_id, X, X_likes, genres_df, k=10):
    """
    Recommends music based on the user's highest-rated music, using hybrid similarity.
    """
    #Get the user's ratings
    user_ratings = ratings[ratings['userId'] == user_id]
    if user_ratings.empty:
        print(f"User {user_id} not found.")
        return
    
    #Get the highest-rated music ID
    highest_rated_music_id = user_ratings.loc[user_ratings['rating'].idxmax(), 'musicId']
    
    #Verify the music exists in the catalog
    if highest_rated_music_id not in music_titles:
        print(f"Music ID {highest_rated_music_id} not found in catalog.")
        return
    
    #Get music details
    music_title = music_titles[highest_rated_music_id]
    music_genre = music.loc[music['musicId'] == highest_rated_music_id, 'genres'].iloc[0]
    
    #Find similar music
    similar_ids = find_similar_music_hybrid(
        highest_rated_music_id, 
        X, 
        X_likes, 
        genres_df, 
        k=k
    )
    
    #Display recommendations
    print(f"Since you listened to {music_title} ({music_genre}), you might also like:")
    for song_id in similar_ids:
        if song_id in music_titles:  #Verify song exists before displaying
            song_title = music_titles[song_id]
            song_genre = music.loc[music['musicId'] == song_id, 'genres'].iloc[0]
            print(f"- {song_title} ({song_genre})")
        else:
            print(f"- Unknown song (ID: {song_id})")

In [14]:
#Create function to recommend songs based on the users preference using the KNN model.
def recommend_music_for_user(user_id, X, user_mapper, music_mapper, music_inv_mapper, k=10):
    df1 = ratings[ratings['userId'] == user_id]

    if df1.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    music_id = df1[df1['rating'] == max(df1['rating'])]['musicId'].iloc[0]

    music_titles = dict(zip(music['musicId'], music['title']))

    similar_ids = find_similar_music(music_id, X, k)
    music_title = music_titles.get(music_id, "Song not found")

    if music_title == "Song not found":
        print(f"Song with ID {music_id} not found.")
        return

    print(f"Since you listened to {music_title} ({music.loc[music['musicId'] == music_id, 'genres'].iloc[0]}), you might also like:")
    for i in similar_ids:
        print(f"- {music_titles.get(i, 'Song not found')} ({music.loc[music['musicId'] == i, 'genres'].iloc[0]})")

In [15]:
recommend_music_for_user_hybrid(user_id=5, X=X, X_likes=X_likes, genres_df=genres_df, k=10)

Since you listened to Sugar (Official Music Video) (['Pop']), you might also like:
- All In My Head (Flex) (Official Video) ft. Fetty Wap (['Pop'])
- Treat You Better (['Pop'])
- Misery (Official Music Video) (['Pop'])
- J'oublie tout [Son Officiel] (['Pop'])
- Look At Her Now (Official Music Video) (['Pop'])
- Aunque Ahora Est√É¬©s Con √É¬âl (['Banda', 'Corrido', 'Latin', 'Regional Mexicano', 'Sierreño'])
- Dive [Official Audio] (['Pop', 'Folk Pop', 'Dance-Pop', 'Folk', 'Singer-Songwriter'])
- Somebody To You ft. Demi Lovato (['Pop'])
- Min Awel Dekika [Official Video] (2022) / √ò¬ß√ô¬Ñ√ô¬ä√ò¬≥√ò¬ß √ô¬à√ò¬≥√ò¬π√ò¬Ø √ô¬Ñ√ô¬Ö√ò¬¨√ò¬±√ò¬Ø - √ô¬Ö√ô¬Ü √ò¬£√ô¬à√ô¬Ñ √ò¬Ø√ô¬Ç√ô¬ä√ô¬Ç√ò¬© (['Pop'])
- Mercy (Official Music Video) (['Pop'])


In [16]:
user_id = 5  #Replace with the desired user ID
recommend_music_for_user(user_id, X, user_mapper, music_mapper, music_inv_mapper, k=10)

Since you listened to Sugar (Official Music Video) (['Pop']), you might also like:
- Aunque Ahora Est√É¬©s Con √É¬âl (['Banda', 'Corrido', 'Latin', 'Regional Mexicano', 'Sierreño'])
- Litr√É¬£o (Ao Vivo Em Recife / 2020) (['Country', 'Pop', 'Sertanejo'])
- Si T√É¬∫ Supieras (Video Oficial) (['Mariachi', 'Ranchera', 'Latin Pop', 'Bolero'])
- Ms. Jackson (MattyBRaps Cover) (['Southern Hip Hop', 'Hip Hop'])
- Rabiosa (English Version) ft. Pitbull (['Latin Pop'])
- Dive [Official Audio] (['Pop', 'Folk Pop', 'Dance-Pop', 'Folk', 'Singer-Songwriter'])
- Rojo (Official Video) (['Reggaeton', 'Latin'])
- Piu√É¬≠ Abacaxi (DVD Colet√É¬¢nea de Sucessos) (['Lullaby', 'Childrens Music'])
- Necio (Audio) ft. Santana (['Bachata', 'Bachata Pop', 'Latin'])
- Quarta Cadeira (Ao Vivo Em Goi√É¬¢nia / 2018) ft. Jorge & Mateus (['Country', 'Pop', 'Sertanejo'])
