In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

###  Importing the data

In [19]:
users_history = pd.read_csv("/kaggle/input/million-song-dataset-spotify-lastfm/User Listening History.csv")
music_info  = pd.read_csv("/kaggle/input/million-song-dataset-spotify-lastfm/Music Info.csv")

### Grouping the songs of each user

In [20]:
user_song_list = users_history.groupby('user_id', observed=True)[['track_id', 'playcount']].apply(lambda x: list(zip(x['track_id'], x['playcount']))).to_dict()

### Removing the users with less than 50 songs

In [21]:
user_song_list = {user: songs for user, songs in user_song_list.items() if len(songs) >= 50}

### Deleting the data related to the users we recently removed

In [22]:
users_history = users_history[users_history['user_id'].isin(user_song_list.keys())] 

In [23]:
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix,coo_matrix

### Splitting the data

In [24]:
train, test = train_test_split(users_history, test_size=0.2, random_state=42)

### Creating mappings so we can find a song_id by index and vice versa

In [25]:
# For collaborative filtering (using users_history)
train['user_id'] = train['user_id'].astype('category')
train['track_id'] = train['track_id'].astype('category')

# Create mappings for user_id and track_id
cf_user_id_mapping = dict(enumerate(train['user_id'].cat.categories))
cf_track_id_mapping = dict(enumerate(train['track_id'].cat.categories))
cf_user_id_reverse_mapping = {v: k for k, v in cf_user_id_mapping.items()}
cf_track_id_reverse_mapping = {v: k for k, v in cf_track_id_mapping.items()}

### Using SVD

In [26]:
# Create Sparse User-Item Interaction Matrix
user_item_sparse = coo_matrix((train['playcount'],
                               (train['user_id'].cat.codes,
                                train['track_id'].cat.codes)))

# Apply SVD on the Sparse Matrix
svd = TruncatedSVD(n_components=10, random_state=42)
user_factors = svd.fit_transform(user_item_sparse)
item_factors = svd.components_.T

In [27]:
def recommend_songs_collaborative(user_id, user_item_matrix, user_factors, item_factors, music_info, n_recommendations):
    # Check if the user_id exists in the mapping
    user_code = cf_user_id_reverse_mapping.get(user_id)
    if user_code is None:
        print(f"User ID {user_id} not found in the user-item matrix.")
        return []
    
    # Collaborative Filtering Recommendations
    cf_predictions = np.dot(user_factors[user_code, :], item_factors.T)
    cf_indices = np.argsort(cf_predictions)[::-1]
    cf_recommended_tracks = [cf_track_id_mapping[i] for i in cf_indices[:n_recommendations]]
    
    #print(f"Collaborative Filtering Recommendations for user {user_id}: {cf_recommended_tracks}")
    
    return cf_recommended_tracks

In [28]:
def evaluate_model_collaborative(user_test_data, user_train_data, user_item_matrix, user_factors, item_factors, music_info, n_recommendations):
    precisions = []
    recalls = []

    for user, true_tracks in user_test_data.items():
        if user in user_train_data:
            recommended_tracks = recommend_songs_collaborative(user, user_item_matrix, user_factors, item_factors, music_info, n_recommendations)
            
            # Calculate precision and recall
            true_positives = len(set(recommended_tracks) & set(true_tracks))
            precision = true_positives / len(recommended_tracks) if recommended_tracks else 0
            recall = true_positives / len(true_tracks) if true_tracks else 0
            
            precisions.append(precision)
            recalls.append(recall)
    
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    
    return avg_precision, avg_recall

In [29]:
user_train_data = train.groupby('user_id', observed=True)['track_id'].apply(list).to_dict()
user_test_data = test.groupby('user_id', observed=True)['track_id'].apply(list).to_dict()

### Evaluating the model

In [30]:
precision, recall = evaluate_model_collaborative(user_test_data, user_train_data, user_item_sparse, user_factors, item_factors, music_info,44)
print(f"Average Precision: {precision}")
print(f"Average Recall: {recall}")

Average Precision: 0.026990964488337887
Average Recall: 0.08094974404274556
