In [5]:
import numpy as np
import pandas as pd
from collections import Counter 
from scipy import sparse
from sklearn.model_selection import train_test_split

In [6]:
anime_ratings_df = pd.read_csv("Downloads/rating.csv")
anime_ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
#lets get rid of the missing ratings i.e. -1
arating = anime_ratings_df.loc[anime_ratings_df.rating != -1].reset_index()[['user_id','anime_id','rating']]
arating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10


In [8]:
#distribution of rating
Counter(arating.rating)

Counter({10: 955715,
         8: 1646019,
         6: 637775,
         9: 1254096,
         7: 1375287,
         3: 41453,
         5: 282806,
         4: 104291,
         1: 16649,
         2: 23150})

In [9]:
#average number of ratings per user
np.mean(arating.groupby(['user_id']).count()['anime_id'])

91.05231321839081

In [10]:
train_df, valid_df = train_test_split(arating, test_size=0.3)

#resetting indices
train_df = train_df.reset_index()[['user_id', 'anime_id', 'rating']]
valid_df = valid_df.reset_index()[['user_id', 'anime_id', 'rating']]

To create our user and item embeddings, we need continuous IDs to be able to index into the embedding matrix and access each user/item embedding

In [12]:
def encode_column(column):
    #encoding column with continous IDs
    
    keys = column.unique()
    key_to_id = {key:index for index, key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [16]:
def encode_df(anime_df):
    #encodes rating data with continous user and anime ids
    
    anime_ids, anime_df['anime_id'], num_anime = encode_column(anime_df['anime_id'])
    user_ids, anime_df['user_id'], num_users = encode_column(anime_df['user_id'])
    return anime_df, num_users, num_anime, user_ids, anime_ids

In [17]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
print("the total number of users :", num_users)
print("the total number of anime :", num_anime)

the total number of users : 68391
the total number of anime : 9617


Now we shall initialize user and item embeddings

In [18]:
def create_embeddings(n, f):
    #here f is the number of factors in the embedding and n is the number of items
    # we are creating a random numpy matrix of shape n,f
    
    return 11*np.random.random((n,f))/f

In [20]:
def create_sparse_matrix(df, rows, cols, col_name = "rating"):
    return sparse.csc_matrix((df[col_name].values,(df['user_id'].values, df['anime_id'].values)),shape=(rows, cols))

In [21]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
Y = create_sparse_matrix(anime_df, num_users, num_anime)

Now we make predictions

In [23]:
def predict(df, emb_user, emb_anime):
    
    df['prediction'] = np.sum(np.multiply(emb_anime[df['anime_id']],emb_user[df['user_id']]), axis=1)
    return df

In [29]:
laambda = 0.0003 #cost
def cost(df, emb_user, emb_anime):
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

# Gradient Descent

In [30]:
def gradient(df, emb_user, emb_anime):
    #Computes the gradient for user and anime embeddings
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_anime) + 2*laambda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*laambda*emb_anime
    return grad_user, grad_anime

In [31]:
def gradient_desc(df,emb_user, emb_anime, iterations = 1800, learning_rate = 0.02, df_val = None):
    
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    beta = 0.8
    grad_user, grad_anime = gradient(df, emb_user, emb_anime)
    g_user = grad_user
    g_anime = grad_anime
    for i in range(iterations):
        grad_user, grad_anime = gradient(df, emb_user, emb_anime)
        g_user = beta*g_user + (1-beta)*grad_user
        g_anime = beta*g_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*g_user
        emb_anime = emb_anime - learning_rate*g_anime
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_anime))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_anime))
    return emb_user, emb_anime
    

In [32]:
emb_user = create_embeddings(num_users, 3)
emb_anime = create_embeddings(num_anime, 3)
emb_user, emb_anime = gradient_desc(anime_df, emb_user, emb_anime, iterations=900, learning_rate=0.9)


iteration 50 :
train mse: 16.382841554013293

iteration 100 :
train mse: 12.31153814670948

iteration 150 :
train mse: 10.430926780865828

iteration 200 :
train mse: 9.356311298737898

iteration 250 :
train mse: 8.673233367517696

iteration 300 :
train mse: 8.211226031616302

iteration 350 :
train mse: 7.886732316843702

iteration 400 :
train mse: 7.653900223676979

iteration 450 :
train mse: 7.485513621704669

iteration 500 :
train mse: 7.3644612809248455

iteration 550 :
train mse: 7.279500956398716

iteration 600 :
train mse: 7.222982785293564

iteration 650 :
train mse: 7.189547441929115

iteration 700 :
train mse: 7.175343807546137

iteration 750 :
train mse: 7.177539471302398

iteration 800 :
train mse: 7.194004125191452

iteration 850 :
train mse: 7.223099097891278

iteration 900 :
train mse: 7.263534243558595


To avoid cold case problem, we remove the data we have not encountered before.

In [44]:
def encode_new(valid_df, user_ids, anime_ids):
   
    df_val = valid_df['anime_id'].isin(anime_ids.keys()) & valid_df['user_id'].isin(user_ids.keys())
    valid_df = valid_df[df_val]
    valid_df['anime_id'] =  np.array([anime_ids[x] for x in valid_df['anime_id']])
    valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])
    return valid_df

In [45]:
print("before encoding:", valid_df.shape)
valid_df = encode_new(valid_df, user_ids, anime_ids)
print("after encoding:", valid_df.shape)

before encoding: (1106669, 4)
after encoding: (1106669, 4)


In [46]:
train_mse = cost(train_df, emb_user, emb_anime)
val_mse = cost(valid_df, emb_user, emb_anime)
print(train_mse, val_mse)

7.263534243558595 15.498480884859747


In [47]:
valid_df[60:70].head()

Unnamed: 0,user_id,anime_id,rating,prediction
104,35919,3167,8,4.173755
105,52014,389,10,6.233698
107,54927,853,8,6.831016
110,15393,6702,10,4.549511
111,30904,6166,7,3.542834
