<a href="https://colab.research.google.com/github/NishiKlair/NishiKlair/blob/main/song_recommendation_systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
#import required datasets

count_df = pd.read_csv('/content/count_data.csv')
song_df = pd.read_csv('/content/song_data.csv')

In [6]:
count_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730442 entries, 0 to 730441
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  730442 non-null  int64  
 1   user_id     730442 non-null  object 
 2   song_id     730441 non-null  object 
 3   play_count  730441 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 22.3+ MB


In [7]:
count_df.shape

(730442, 4)

In [8]:
count_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,song_id,play_count
0,0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0
1,1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0
2,2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0
3,3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1.0
4,4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1.0


In [9]:
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624629 entries, 0 to 624628
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   song_id      624629 non-null  object 
 1   title        624621 non-null  object 
 2   release      624625 non-null  object 
 3   artist_name  624628 non-null  object 
 4   year         624626 non-null  float64
dtypes: float64(1), object(4)
memory usage: 23.8+ MB


In [10]:
song_df.shape

(624629, 5)

In [11]:
song_df.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003.0
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995.0
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006.0
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003.0
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0.0


In [23]:
df = pd.merge(count_df, song_df.drop_duplicates(['song_id']), 
              on='song_id', how='left')

In [24]:
df.columns

Index(['Unnamed: 0', 'user_id', 'song_id', 'play_count', 'title', 'release',
       'artist_name', 'year'],
      dtype='object')

In [25]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,user_id,song_id,play_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0.0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976.0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007.0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1.0,,,,
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1.0,,,,


In [26]:
# label encoding for columns - user_id, song_id

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['user_id'] = le.fit_transform(df['user_id'])
df['song_id'] = le.fit_transform(df['song_id'])

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730442 entries, 0 to 730441
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      730442 non-null  int64  
 1   song_id      730442 non-null  int64  
 2   play_count   730441 non-null  float64
 3   title        460589 non-null  object 
 4   release      460589 non-null  object 
 5   artist_name  460589 non-null  object 
 6   year         460589 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 44.6+ MB


In [35]:
df['user_id'].nunique() #total 730442 data points

28133

In [36]:
df['song_id'].nunique() #total 730442 data points

10001

In [27]:
# reduce dataset using meaningful logical assumptions
# eg: only include users who have heard atleast 90 songs - MIN_COUNT = 90

user_counts = dict() # dictionary containing number of counts for each user
users = df['user_id']

for user in users:
  if user in user_counts:
    user_counts[user] += 1
  else:
    user_counts[user] = 1

In [37]:
# remove users who has user_count < 90

users_removed = []
MIN_COUNT = 90

for user, no_of_counts in user_counts.items():
  if no_of_counts < MIN_COUNT:
    users_removed.append(user)


df = df.loc[~df['user_id'].isin(users_removed)]
df.shape

(157840, 7)

In [39]:
## only include song that have been heard by at least 120 users - MIN_COUNT = 120

song_counts = dict()
songs = df['song_id']

for song in songs:
  if song in song_counts:
    song_counts[song] += 1
  else:
    song_counts[song] = 1

# remove songs with song_counts < 120

songs_removed = []
MIN_COUNT = 120

for song, no_of_counts in song_counts.items():
  if no_of_counts < MIN_COUNT:
    songs_removed.append(song)

df = df.loc[~df['song_id'].isin(songs_removed)]
df.shape


(17974, 7)

In [44]:
df.groupby(['play_count']).count()

Unnamed: 0_level_0,user_id,song_id,title,release,artist_name,year
play_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,8539,8539,5135,5135,5135,5135
2.0,3580,3580,2063,2063,2063,2063
3.0,1793,1793,1018,1018,1018,1018
4.0,961,961,543,543,543,543
5.0,766,766,465,465,465,465
...,...,...,...,...,...,...
114.0,1,1,1,1,1,1
141.0,1,1,1,1,1,1
194.0,1,1,0,0,0,0
215.0,1,1,1,1,1,1


In [46]:
# most data lies in range 1-5 for play_counts

df_final = df[df['play_count'] <= 5]

In [144]:
df_final.loc[df['user_id'] == 6958]


Unnamed: 0,user_id,song_id,play_count,title,release,artist_name,year


In [48]:
## Exploratory Data Analysis ##
df_final['user_id'].nunique()

1126

In [49]:
df_final['song_id'].nunique()

103

In [51]:
df_final['artist_name'].nunique()

54

In [56]:
df_final['artist_name'].value_counts().head()

Florence + The Machine    711
Coldplay                  692
3 Doors Down              281
Kings Of Leon             268
Vampire Weekend           260
Name: artist_name, dtype: int64

In [60]:
## Popularity-based recommendation system ##
# --suggests top n songs-- #

avg_count = df_final.groupby('song_id').mean()['play_count']
freq = df_final.groupby('song_id').count()['play_count']
df_avg_play = pd.DataFrame({'avg_count':avg_count, 'play_frequency':freq})


In [59]:
def top_n_songs(data, n, min_interactions=100):
  # songs with minimum number of interactions
  relevant = data[data['play_frequency'] > min_interactions]
  recommendations = relevant.sort_values('avg_count', ascending = False)

  return recommendations.index[:n]

In [62]:
# recommending top 10 songs
le.inverse_transform(list(top_n_songs(df_avg_play, 10)))

array(['SOBONKR12A58A7A7E0', 'SONYKOW12AB01849C9', 'SOLFXKT12AB017E3E0',
       'SOTCMDJ12A6D4F8528', 'SOFRQTD12A81C233C0', 'SOAXGDH12A8C13F8A1',
       'SOEGIYH12A6D4FC0E3', 'SOUSMXX12AB0185C24', 'SOBOUPA12A6D4F81F1',
       'SOPAYPV12AB017DB0C'], dtype=object)

In [64]:
## Recommendation System - Collaborative Filtering ##
# we only need interaction data for users and songs.

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from collections import defaultdict

In [65]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630197 sha256=0b1c6841eff68e967123654248f6608dca9c6858853116f4914aaeeaede40c4e
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [68]:
from surprise.dataset import Dataset
from surprise.reader import Reader

from surprise.model_selection import train_test_split

from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.matrix_factorization import SVD

from surprise import accuracy

from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

from surprise import CoClustering


In [69]:
reader  = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_final[['song_id', 'song_id', 'play_count']], reader)

trainset, testset = train_test_split(data, test_size=0.4, random_state=42)

In [83]:
#user-user cosine similarity
sim_option = {'name' : 'cosine', 
              'user_based' : True} 

# KNN Algorithm for creating user-user colaborative filtering
user_user_colab = KNNBasic(sim_options=sim_option, 
                           verbose=True, random_state = 1)
user_user_colab.fit(trainset)



Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fa192677d50>

In [95]:
# function that predicts and returns precision, recall, f1_score, rmse

def precision_recall_at_k(model, k=30, threshold = 1.5):
  user_est_true = defaultdict(list)

  predictions = model.test(testset)

  for uid, _, true_r, est, _ in predictions:
    user_est_true[uid].append((est, true_r))
  
  precisions = dict()
  recalls = dict()

  for uid, user_ratings in user_est_true.items():
    user_ratings.sort(key = lambda x : x[0], reverse = True)

    #no of relevant items
    n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

    #no of recommendations in top k items
    n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

    #no of relevant and recommendations in top k
    n_rel_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])

    precisions[uid] = n_rel_rec_k/n_rec_k if n_rec_k != 0 else 0
    recalls[uid] = n_rel_rec_k/n_rel if n_rel != 0 else 0
  
  precision = round((sum(p for p in precisions.values())/len(precisions)), 3)
  recall = round((sum(r for r in recalls.values())/len(recalls)), 3)
  rmse = accuracy.rmse(predictions)

  #print('RMSE: ', rmse)
  print('Precision: ', precision)
  print('Recall: ', recall)
  print('F1 Score: ', round((2*precision*recall)/(precision+recall), 3))


In [96]:
precision_recall_at_k(user_user_colab)

RMSE: 1.1536
Precision:  0.419
Recall:  0.482
F1 Score:  0.448


In [142]:
user_user_colab.predict(2537, 8029, r_ui=2, verbose=True)

user: 2537       item: 8029       r_ui = 2.00   est = 1.84   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=2537, iid=8029, r_ui=2, est=1.835447085154002, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [143]:
user_user_colab.predict(2537, 8029, verbose=True)

user: 2537       item: 8029       r_ui = None   est = 1.84   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=2537, iid=8029, r_ui=None, est=1.835447085154002, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [101]:
# tuning hyperparameters

param_grid = {'k': [10, 20, 30],
              'min_k': [3, 6, 9],
              'sim_options': {'name': ['cosine', 'pearson', 'pearson_baseline'], 
                              'user_based': [True], 
                              'min_support': [2, 4]}}
 
 # 3 fold cross-validation to tune the hyperparameters
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
gs.fit(data)

#best rmse score
print(gs.best_score['rmse'])

#parameters for best score
print(gs.best_params['rmse'])

1.1558029490977655
{'k': 30, 'min_k': 3, 'sim_options': {'name': 'cosine', 'user_based': True, 'min_support': 2}}


In [117]:
# training model based on optimized parameters
sim_options = {'name': 'cosine',
               'user_based': True,
               'min_support': 2}

user_user_colab_optimized = KNNBasic(sim_options = sim_options, k=30, min_k=3, 
                                     random_state=1, verbose=True)
user_user_colab_optimized.fit(trainset)
precision_recall_at_k(user_user_colab_optimized)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.1588
Precision:  0.409
Recall:  0.469
F1 Score:  0.437


In [145]:
user_user_colab_optimized.predict(2537, 8029, r_ui=2, verbose=True)

user: 2537       item: 8029       r_ui = 2.00   est = 1.84   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=2537, iid=8029, r_ui=2, est=1.835447085154002, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [146]:
user_user_colab_optimized.predict(2537, 8029, verbose=True)

user: 2537       item: 8029       r_ui = None   est = 1.84   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=2537, iid=8029, r_ui=None, est=1.835447085154002, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [147]:
user_user_colab_optimized.get_neighbors(0, 5)

[1, 2, 3, 4, 5]

In [153]:
# recommendations based on optimized KNNBasic algorithm

def get_recommendations(data, user_id, top_n, algo):
  recommendations = []

  #user-item interaction matrix
  user_item_matrix = data.pivot(index = 'user_id', 
                                columns = 'song_id',
                                values = 'play_count')
  #songs not listened to yet
  no_interaction_songs = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].isnull()].index.tolist()

  #predicting these values
  for item_id in no_interaction_songs:
    est = algo.predict(user_id, item_id).est
    recommendations.append((item_id, est)) #passing a tuple

  # sort the recommendations
  recommendations.sort(key = lambda x: x[1], reverse=True)


  return recommendations[:top_n]



In [156]:

recommendations = get_recommendations(df_final, 2537, 5, user_user_colab_optimized)

In [157]:
pd.DataFrame(recommendations, columns = ['song_id', 'predicted_ratings'])

Unnamed: 0,song_id,predicted_ratings
0,52,1.835447
1,97,1.835447
2,154,1.835447
3,208,1.835447
4,310,1.835447


In [161]:
def ranking_songs(recommendations, final_ranking):
  ranked_songs = final_ranking.loc[[items[0] for items in recommendations]].sort_values('play_frequency', 
                                                                                        ascending=False)[['play_frequency']].reset_index()
  
  ranked_songs = ranked_songs.merge(pd.DataFrame(recommendations, 
                                                 columns = ['song_id', 'predicted_ratings']), 
                                    on = 'song_id',
                                    how = 'inner')
  
  ranked_songs['corrected_ratings'] = ranked_songs['predicted_ratings'] - 1 / np.sqrt(ranked_songs['play_frequency'])

  # sort the songs based on corrected play_counts
  ranked_songs = ranked_songs.sort_values('corrected_ratings', ascending=False)
  
  return ranked_songs

  

In [162]:
ranking_songs(recommendations, df_avg_play)

Unnamed: 0,song_id,play_frequency,predicted_ratings,corrected_ratings
0,208,178,1.835447,1.760494
1,52,158,1.835447,1.755891
2,154,153,1.835447,1.754602
3,310,149,1.835447,1.753524
4,97,129,1.835447,1.747402


In [163]:
# item-item collaboration
#Declaring the similarity options.
sim_options = {'name': 'pearson',
               'user_based': False}

#KNN algorithm is used to find desired similar items.
sim_item_item = KNNBasic(sim_options=sim_options, random_state=1, verbose=False)

# Train the algorithm on the trainset, and predict play_count for the testset
sim_item_item.fit(trainset)

# Let us compute precision@k, recall@k, and f_1 score with k =30.
precision_recall_at_k(sim_item_item)

RMSE: 1.1536
Precision:  0.419
Recall:  0.482
F1 Score:  0.448


In [165]:
sim_item_item.predict(2537, 8029, r_ui=2, verbose=True)

user: 2537       item: 8029       r_ui = 2.00   est = 1.84   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=2537, iid=8029, r_ui=2, est=1.835447085154002, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [166]:
# SVD
svd = SVD(random_state=1)

# training the algorithm on the trainset
svd.fit(trainset)

# Let us compute precision@k and recall@k with k =30.
precision_recall_at_k(svd)

RMSE: 1.1499
Precision:  0.437
Recall:  0.504
F1 Score:  0.468


In [167]:
# set the parameter space to tune
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
              'reg_all': [0.2, 0.4, 0.6]}

# performing 3-fold gridsearch cross validation
gs_ = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)

# fitting data
gs_.fit(data)

# best RMSE score
print(gs_.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs_.best_params['rmse'])

1.1432476795900668
{'n_epochs': 30, 'lr_all': 0.001, 'reg_all': 0.6}


In [168]:
# building the optimized SVD model using optimal hyperparameter search
svd_optimized = SVD(n_epochs=30, lr_all=0.01, reg_all=0.2, random_state=1)

# training the algorithm on the trainset
svd_optimized=svd_optimized.fit(trainset)

# Let us compute precision@k and recall@k also with k =30.
precision_recall_at_k(svd_optimized)

RMSE: 1.1498
Precision:  0.437
Recall:  0.504
F1 Score:  0.468
