In [2]:
import numpy as np
import pandas as pd
import jax.numpy as jnp
from jax import grad, jit, lax, random, vmap
from jax.scipy.special import expit as sigmoid
from sklearn.model_selection import train_test_split
import jax
import numpy as np
import jax.numpy as jnp
import pylab as plt
from scipy.sparse import csr_matrix
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw rating triples and give the three columns stable names.
df = pd.read_csv('songsDataset.csv')
df.columns = ['userID', 'songID', 'rating']

# Basic data analysis: number of distinct users, songs and rating levels.
# nunique() counts distinct values directly instead of materialising a Python
# list and a set for every column.
n_users, n_songs = df['userID'].nunique(), df['songID'].nunique()
all_ratings = set(df['rating'].tolist())
n_rating_levels = len(all_ratings)

# Number of songs a user typically rates
user_rating_counts = (
    df.groupby('userID')['songID'].count().rename('song_count').reset_index()
)
print(f'A user can give ratings to {set(user_rating_counts["song_count"])} songs')

# Number of users a song is typically rated by
song_rating_counts = (
    df.groupby('songID')['userID'].count().rename('user_count').reset_index()
)
print(f'A song can be rated by {set(song_rating_counts["user_count"])} users')

# Generate consecutive 0-based indices for users and songs so the ids can be
# used directly as row/column positions later on.
all_song_idx, all_user_idx = sorted(set(df.songID.values)), sorted(set(df.userID.values))
print(len(all_song_idx), len(all_user_idx))
song_idx_dict = {song_id: i for i, song_id in enumerate(all_song_idx)}
user_idx_dict = {user_id: i for i, user_id in enumerate(all_user_idx)}
# Series.map with a dict performs the lookup without calling a Python lambda
# once per row; every id is a key of its dict, so no value is left unmapped.
df['userID'] = df['userID'].map(user_idx_dict)
df['songID'] = df['songID'].map(song_idx_dict)

A user can give ratings to {10} songs
A song can be rated by {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 2

In [4]:
def generate_rating_matrix(df, total_df, dense=True):
    """Build a (number of users, number of songs) rating matrix.

    Args:
        df: frame with ``userID``, ``songID`` and ``rating`` columns holding
            the ratings to place in the matrix. The ids are used directly as
            row/column positions, so they must already be 0-based indices
            smaller than the matrix shape.
        total_df: frame whose distinct ``userID`` / ``songID`` counts define
            the matrix shape, so every split shares the same dimensions.
        dense: when True (the default, matching the previous behaviour) the
            matrix is returned as a dense NumPy array. Pass False to get the
            SciPy CSR matrix instead, which avoids allocating one cell per
            (user, song) pair.

    Returns:
        The rating matrix; cells without a rating are 0. If ``df`` contains
        the same (userID, songID) pair more than once, the CSR constructor
        sums those ratings.
    """
    shape = (len(total_df.userID.unique()), len(total_df.songID.unique()))
    ratings = csr_matrix(
        (df.rating.values, (df.userID.values, df.songID.values)),
        shape=shape,
    )
    return ratings.toarray() if dense else ratings

# Two successive 80/20 splits: first carve off the test set, then take a
# validation set out of what remains.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

# Every split is laid out on the full user x song grid defined by `df`, so the
# three matrices share one shape.
ratings_train, ratings_val, ratings_test = (
    generate_rating_matrix(split, df) for split in (train, val, test)
)
print(ratings_train.shape, ratings_val.shape, ratings_test.shape)

(200000, 127771) (200000, 127771) (200000, 127771)


In [5]:
def init_model_params(rng, num_users, num_songs, embedding_dim):
    """Draw standard-normal initial embeddings for users and songs.

    Args:
        rng: JAX PRNG key used to seed the initialisation.
        num_users: number of rows in the user embedding table.
        num_songs: number of rows in the song embedding table.
        embedding_dim: number of columns (latent factors) in both tables.

    Returns:
        Tuple ``(w_user, w_song)`` with shapes ``(num_users, embedding_dim)``
        and ``(num_songs, embedding_dim)``.
    """
    # A JAX key must not be consumed twice: reusing `rng` for both draws would
    # sample the two tables from the same random stream (identical values when
    # the shapes match). Derive one independent sub-key per table instead.
    user_key, song_key = random.split(rng)
    w_user = random.normal(user_key, (num_users, embedding_dim))
    w_song = random.normal(song_key, (num_songs, embedding_dim))
    return w_user, w_song

# Number of latent factors per user / song embedding.
embedding_dim = 10
# Size the embedding tables from the distinct user / song counts computed when
# the dataset was loaded, rather than repeating them as hard-coded literals
# that silently go stale if the data changes.
num_users = n_users
num_songs = n_songs
# Fixed seed so the initial embeddings are reproducible across runs.
rng = random.PRNGKey(0)
w_user, w_song = init_model_params(rng, num_users, num_songs, embedding_dim)

In [6]:
def model(params, user_ids, song_ids):
    """Score (user, song) pairs by the dot product of their embeddings.

    Args:
        params: pair ``(w_user, w_song)`` of embedding tables.
        user_ids: index (or array of indices) into ``w_user``.
        song_ids: index (or array of indices) into ``w_song``.

    Returns:
        The element-wise product of the selected embeddings summed over the
        last axis, i.e. one predicted rating per (user, song) pair.
    """
    user_table, song_table = params
    interaction = user_table[user_ids] * song_table[song_ids]
    return jnp.sum(interaction, axis=-1)

def mse_loss(params, data):
    """Mean squared error between predicted and observed ratings.

    Args:
        params: pair ``(w_user, w_song)`` passed through to ``model``.
        data: 2-D array whose columns 0, 1 and 2 hold the user ids, song ids
            and ratings respectively. The id columns are used to index the
            embedding tables.

    Returns:
        Scalar mean of the squared prediction errors over all rows of ``data``.
    """
    targets = data[:, 2]
    residuals = model(params, data[:, 0], data[:, 1]) - targets
    return jnp.mean(residuals ** 2)

@jit
def update(params, data, lr=0.01):
    """Apply one gradient-descent step on the MSE loss.

    Args:
        params: iterable of parameter arrays, differentiated as the first
            argument of ``mse_loss``.
        data: batch of (user id, song id, rating) rows passed to ``mse_loss``.
        lr: step size multiplied into each gradient.

    Returns:
        A list with one updated array per entry of ``params``.
    """
    gradients = grad(mse_loss)(params, data)
    stepped = []
    for weights, weight_grad in zip(params, gradients):
        stepped.append(weights - lr * weight_grad)
    return stepped

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Baseline: predict each rating as the average of the user's mean rating and
# the song's mean rating. `df` is the ratings frame with columns 'userID',
# 'songID' and 'rating'.

# Hold out 20% of the ratings for evaluation; no separate validation split is
# made here, the baseline is scored directly on the held-out test rows.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
# Take an explicit copy before adding columns so the assignments below act on
# an independent frame instead of a slice of `df` (avoids
# SettingWithCopyWarning and ambiguous write-through behaviour).
test_data = test_data.copy()

# Statistics are computed on the training rows only.
global_mean = train_data['rating'].mean()
user_means = train_data.groupby('userID')['rating'].mean()
item_means = train_data.groupby('songID')['rating'].mean()

# Look up the per-user / per-song means for the held-out rows. Users or songs
# that never appear in the training rows map to NaN and fall back to the
# global mean. The result of fillna is assigned back explicitly: calling
# `test_data[col].fillna(..., inplace=True)` works on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
test_data['user_mean_rating'] = test_data['userID'].map(user_means).fillna(global_mean)
test_data['item_mean_rating'] = test_data['songID'].map(item_means).fillna(global_mean)

# Calculate predicted ratings using average of user and item means
test_data['predicted_rating'] = (test_data['user_mean_rating'] + test_data['item_mean_rating']) / 2

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(test_data['rating'], test_data['predicted_rating'])

# NOTE(review): the label below says "validation data" but the score is
# computed on `test_data`; the wording is kept so it matches recorded output.
print("Mean Squared Error (MSE) for validation data (baseline model):", mse)

Mean Squared Error (MSE) for validation data (baseline model): 1.804611342330206


In [11]:
from sklearn.metrics import classification_report

# Treat the baseline as a 5-class classifier: pull out the true and predicted
# ratings, snap each prediction to the nearest integer and clamp it to 1..5.
true = test_data['rating'].tolist()
pred = test_data['predicted_rating'].tolist()
rounded_pred = np.clip(np.round(pred), 1, 5)

# Per-class precision / recall / F1 for the rating levels 1 to 5.
report = classification_report(true, rounded_pred, labels=[1, 2, 3, 4, 5])

In [13]:
# `report` is a multi-line string; print() renders its line breaks as a table
# (a bare expression would show the escaped repr instead).
print(report)

              precision    recall  f1-score   support

           1       0.89      0.02      0.03     80077
           2       0.12      0.09      0.10     37303
           3       0.19      0.54      0.28     57560
           4       0.19      0.55      0.29     69007
           5       0.85      0.05      0.09    156053

    accuracy                           0.20    400000
   macro avg       0.45      0.25      0.16    400000
weighted avg       0.58      0.20      0.14    400000

