In [1]:
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics import r2_score

In [2]:
def get_year(x):
    """Return the trailing four characters of ``x`` as an int year.

    Missing values (anything ``pd.isna`` reports as NA) map to 0.
    """
    return 0 if pd.isna(x) else int(x[-4:])

# Users table: keep age / gender / occupation, discard the zip code.
users = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.user', sep='|',
                 names=['user_id','age','gender','occupation','zip_code'])
users = users.drop(['zip_code'],axis=1)


# Ratings table: rows are shuffled on load; the unix timestamp becomes a
# datetime column and 'liked' is a binary label (rating >= 3).
data = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.data', sep='\t',
                 names=['user_id','movie_id','rating','tstamp']).sample(frac=1)
data['date'] = pd.to_datetime(data['tstamp'],unit='s')
data['liked'] = (data['rating'] >= 3).astype(int)
data = data.drop(['tstamp'],axis=1)

# Movies table: one 0/1 column per genre, plus the release year parsed from
# the release-date string (0 when the date is missing).
movies = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.item', sep='|',
                    names=['movie_id' ,'movie title' ,'release date' ,'video release date' ,
                           'IMDb URL' ,'unknown' ,'Action' ,'Adventure' ,'Animation' ,
                           'Children' ,'Comedy' ,'Crime' ,'Documentary' ,'Drama' ,'Fantasy' ,
                           'Film-Noir' ,'Horror' ,'Musical' ,'Mystery' ,'Romance' ,'Sci-Fi' ,
                           'Thriller' ,'War' ,'Western'],encoding='latin')

movies['year'] = movies['release date'].apply(get_year)
movies = movies.drop(['release date','video release date','IMDb URL'],axis=1)

movies = movies.set_index('movie_id')
users = users.set_index('user_id')
# These sizes are used as embedding-table sizes, and ids are looked up as
# (id - 1). Size the tables from the largest id rather than the number of
# distinct ids so that a gap in the id sequence cannot produce an
# out-of-range lookup. With contiguous ids starting at 1 both give the same value.
num_users = int(data['user_id'].max())
num_movie = int(data['movie_id'].max())
data_joined = data.join(movies,on='movie_id',how='inner').join(users,on='user_id',how='inner')
print(data_joined.head())
print(data_joined.dtypes)

       user_id  movie_id  rating                date  liked  \
93876      942       215       5 1998-03-30 18:38:37      1   
84992      942       234       4 1998-03-30 18:39:21      1   
88285      942       174       5 1998-03-30 18:40:09      1   
80481      942        97       5 1998-03-30 18:40:39      1   
92648      942       210       4 1998-03-30 18:39:44      1   

                                     movie title  unknown  Action  Adventure  \
93876                     Field of Dreams (1989)        0       0          0   
84992                                Jaws (1975)        0       1          0   
88285             Raiders of the Lost Ark (1981)        0       1          1   
80481                  Dances with Wolves (1990)        0       0          1   
92648  Indiana Jones and the Last Crusade (1989)        0       1          1   

       Animation  ...  Mystery  Romance  Sci-Fi  Thriller  War  Western  year  \
93876          0  ...        0        0       0         0  

In [None]:
# Distribution of rating dates, drawn on an 18 x 6 inch figure.
g = sns.displot(data_joined['date'])
g.fig.set_size_inches(18, 6)

In [4]:
# Split by date: ratings before the cutoff form the training set, ratings on
# or after it form the test set. Each part is shuffled again here because the
# joined frame is not in random order, even though the ratings were shuffled
# when loaded.
cutoff_date = '1998-03-01'
is_before_cutoff = data_joined['date'] < cutoff_date
is_on_or_after_cutoff = data_joined['date'] >= cutoff_date
df_train = data_joined.loc[is_before_cutoff].sample(frac=1)
df_test = data_joined.loc[is_on_or_after_cutoff].sample(frac=1)


In [None]:
# Model inputs are (user_id, movie_id) pairs shifted down by one so they can
# be used as embedding row indices (assumes ids start at 1); the regression
# target is the raw rating.
X_train = df_train[['user_id','movie_id']].to_numpy() - 1
y_train = df_train['rating'].to_numpy()
X_test = df_test[['user_id','movie_id']].to_numpy() - 1
y_test = df_test['rating'].to_numpy()

# Overlay the movie-index distributions of both splits on one axes.
# sns.distplot is deprecated; histplot with kde=True, stat='density' and
# kde_kws=dict(cut=3) is the documented replacement for its default output.
ax = sns.histplot(X_train[:,1], kde=True, stat='density', kde_kws=dict(cut=3), label='train')
ax = sns.histplot(X_test[:,1], kde=True, stat='density', kde_kws=dict(cut=3), label='test', ax=ax)
ax.legend()

In [None]:
# For users (column 0) and then movies (column 1): print how many distinct ids
# appear in train and in test, followed by the distinct count in the full
# ratings table.
for col_idx, id_col in enumerate(['user_id', 'movie_id']):
    print(len(np.unique(X_train[:, col_idx])), len(np.unique(X_test[:, col_idx])))
    print(len(pd.unique(data[id_col])))

In [7]:
def collaborative_model(num_users, num_movies, emb_dim):
    """Build a matrix-factorisation model scoring (user, movie) index pairs.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        emb_dim: width of both embedding tables.

    Returns:
        A ``tf.keras.Model`` named "colab_model". Its input has two columns
        per row (column 0 = user index, column 1 = movie index) and its
        output is one score per row: the dot product of the two looked-up
        embeddings. The embedding layers are named 'users' and 'movies' so
        their weights can be fetched with ``model.get_layer``.
    """
    x = tf.keras.Input((2,))
    x_user = tf.keras.layers.Embedding(num_users,emb_dim,
#                                        embeddings_regularizer=tf.keras.regularizers.L2(1e-4),
                                       name='users')(x[:,0])
    x_movie = tf.keras.layers.Embedding(num_movies,emb_dim,
#                                        embeddings_regularizer=tf.keras.regularizers.L2(1e-4),
                                       name='movies')(x[:,1])
    # Row-wise dot product. This yields the same values as taking the diagonal
    # of x_user @ x_movie^T, without materialising the batch x batch matrix.
    out = tf.reduce_sum(x_user * x_movie, axis=-1)

    return tf.keras.Model(inputs=x, outputs=out, name="colab_model")


In [None]:
# Fit the embedding model on ratings with MSE loss; 10% of the training rows
# are held out for validation, and training stops once val_mse has not
# improved for 4 epochs (best weights are restored).
model = collaborative_model(num_users, num_movie, 64)
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mse'])

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_mse',
                                              patience=4,
                                              restore_best_weights=True)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=40,
          validation_split=0.1,
          callbacks=[early_stop])

# Learned embedding tables: one row per user / per movie.
umat = model.get_layer('users').get_weights()[0]
mmat = model.get_layer('movies').get_weights()[0]

In [None]:
#Experiment - how the model scores test rows where the user and/or the movie
#never appears in the training set (cold start problem)
#holdout set
unseen_indices = ~(np.isin(df_test['movie_id'],df_train['movie_id']) & np.isin(df_test['user_id'],df_train['user_id']))
X_test_unseen = X_test[unseen_indices]
y_test_unseen = y_test[unseen_indices]
print(X_test_unseen.shape,y_test_unseen.shape,X_test.shape)
y_pred = model(X_test_unseen)
# Only a cell's last expression is rendered, so print the MSE explicitly
# instead of computing it and discarding the result.
print('MSE:', float(tf.keras.metrics.mean_squared_error(y_test_unseen, y_pred)))
r2_score(y_test_unseen, y_pred)

In [None]:
#Experiment - how the model scores test rows where both the user and the
#movie also appear in the training set
seen_indices =    np.isin(df_test['movie_id'],df_train['movie_id']) & np.isin(df_test['user_id'],df_train['user_id'])
X_test_seen = X_test[seen_indices]
y_test_seen = y_test[seen_indices]
print(X_test_seen.shape,y_test_seen.shape,X_test.shape)
y_pred = model(X_test_seen)
# Only a cell's last expression is rendered, so print the MSE explicitly
# instead of computing it and discarding the result.
print('MSE:', float(tf.keras.metrics.mean_squared_error(y_test_seen, y_pred)))
r2_score(y_test_seen, y_pred)

In [None]:
# Reduce the learned movie embedding table with t-SNE (default settings) and
# wrap the result in a DataFrame for plotting.
movie_weights = model.get_layer('movies').get_weights()[0]
tsne = TSNE()
movies_embeddings = tsne.fit_transform(movie_weights)
new_df = pd.DataFrame(movies_embeddings)


In [None]:
# Build the plotting frame from the t-SNE output each time this cell runs, so
# re-running it neither shifts the index a second time nor appends duplicate
# genre columns.
genre_columns = ['unknown' ,'Action' ,'Adventure' ,'Animation' ,
                 'Children' ,'Comedy' ,'Crime' ,'Documentary' ,'Drama' ,'Fantasy' ,
                 'Film-Noir' ,'Horror' ,'Musical' ,'Mystery' ,'Romance' ,'Sci-Fi' ,
                 'Thriller' ,'War' ,'Western']

# Embedding row i is labelled i + 1 so that the concat below lines up with
# the movie_id index of `movies`.
new_df = pd.DataFrame(movies_embeddings,
                      index=np.arange(len(movies_embeddings)) + 1)
# Titles are attached by position, i.e. row i gets the i-th title in `movies`.
new_df[2] = movies['movie title'].to_numpy()
new_df = pd.concat([new_df, movies[genre_columns]], axis=1)

# Columns 0 and 1 are the t-SNE coordinates; column 2 (title) shows on hover.
fig = px.scatter(new_df, x=0, y=1,hover_data=[2],color='Action')
fig.show()

In [None]:
# Same scatter of columns 0 and 1, coloured by the 'War' genre flag, with
# column 2 shown on hover.
fig = px.scatter(
    new_df,
    x=0,
    y=1,
    hover_data=[2],
    color='War',
)
fig.show()

In [None]:
# Side-information features for the content-based model.
# After the drop below the remaining columns are the 19 genre flags, 'year'
# and 'age', in that order, followed by the one-hot gender/occupation columns.
# 'liked' must be dropped too: it is derived from 'rating' (the target), and
# leaving it in also shifts 'year' past column 20, which the model cell uses
# as the boundary between movie features and user features.
non_feature_cols = ['date','movie_id',
                    'user_id','movie title',
                    'rating','liked','gender','occupation']
categorical_cols = ['gender','occupation']

# Default (sparse) output plus .toarray() works on every scikit-learn version;
# the dense-output keyword was renamed from `sparse` to `sparse_output`.
ohe = OHE(handle_unknown='ignore')

#train: fit the encoder on training categories only
X_train = np.concatenate([df_train.drop(non_feature_cols,axis=1),
                          ohe.fit_transform(df_train[categorical_cols]).toarray()],axis=1)

#test: reuse the fitted encoder so the column layout matches X_train exactly
#(categories unseen in training encode as all zeros via handle_unknown='ignore')
X_test = np.concatenate([df_test.drop(non_feature_cols,axis=1),
                         ohe.transform(df_test[categorical_cols]).toarray()],axis=1)
print(X_train.shape)

In [None]:
#The network does not seem to converge: either there is too much randomness in the data or the network is not strong
#enough. It is as good at learning embeddings as initializing them randomly.
def neural_embeddings_collaborative_model(data_dimension, user_data_start_index,emb_dim,drop_out=0.2):
    """Build a two-tower model that embeds movie and user feature vectors.

    Each input row is split at ``user_data_start_index``: the columns before
    it feed the movie tower and the remaining columns feed the user tower.
    Both towers are Dense(2*emb_dim) -> Dropout -> Dense(2*emb_dim) ->
    Dropout -> Dense(emb_dim), all with ReLU activations. The output is the
    dot product of the two tower outputs.

    Args:
        data_dimension: number of feature columns per input row.
        user_data_start_index: index of the first user-feature column.
        emb_dim: width of each tower's final embedding.
        drop_out: dropout rate used by the Dropout layer shared by both towers.

    Returns:
        A ``tf.keras.Model`` named "neural_embeddings_collaborative_model"
        producing one score per input row.
    """
    x_data = tf.keras.Input((data_dimension,))
    x_movie_in = x_data[:,:user_data_start_index]
    x_user_in = x_data[:,user_data_start_index:]
    drop_out_layer = tf.keras.layers.Dropout(drop_out)
    x_user = drop_out_layer(tf.keras.layers.Dense(2*emb_dim,activation='relu')(x_user_in))
    x_user = drop_out_layer(tf.keras.layers.Dense(2*emb_dim,activation='relu')(x_user))
    x_user = tf.keras.layers.Dense(emb_dim,activation='relu')(x_user)
    x_movie = drop_out_layer(tf.keras.layers.Dense(2*emb_dim,activation='relu')(x_movie_in))
    x_movie = drop_out_layer(tf.keras.layers.Dense(2*emb_dim,activation='relu')(x_movie))
    x_movie = tf.keras.layers.Dense(emb_dim,activation='relu')(x_movie)
    # Row-wise dot product. This yields the same values as taking the diagonal
    # of x_user @ x_movie^T, without materialising the batch x batch matrix.
    out = tf.reduce_sum(x_user * x_movie, axis=-1)

    return tf.keras.Model(inputs=x_data, outputs=out, name="neural_embeddings_collaborative_model")


# Two-tower model over the feature matrix: the first 20 columns go to the
# movie tower, the rest to the user tower; embeddings are 64 wide.
model = neural_embeddings_collaborative_model(X_train.shape[1], 20, 64)
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mse'])

# Note: patience (60) is larger than the number of epochs (40), so this
# callback cannot stop training before the final epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_mse',
                                              patience=60,
                                              restore_best_weights=True)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=40,
          validation_split=0.1,
          callbacks=[early_stop])

In [None]:
# Cold-start experiment on the hold-out set: score the test rows whose movie
# or user (or both) never appears in the training frame.
movie_in_train = np.isin(df_test['movie_id'], df_train['movie_id'])
user_in_train = np.isin(df_test['user_id'], df_train['user_id'])
unseen_indices = ~movie_in_train | ~user_in_train

X_test_unseen = X_test[unseen_indices]
y_test_unseen = y_test[unseen_indices]
print(X_test_unseen.shape, y_test_unseen.shape, X_test.shape)

y_pred = model(X_test_unseen)
tf.keras.metrics.mean_squared_error(y_test_unseen, y_pred)

In [None]:
# Warm experiment: score the test rows whose movie and user both appear in
# the training frame.
movie_in_train = np.isin(df_test['movie_id'], df_train['movie_id'])
user_in_train = np.isin(df_test['user_id'], df_train['user_id'])
seen_indices = movie_in_train & user_in_train

X_test_seen = X_test[seen_indices]
y_test_seen = y_test[seen_indices]
print(X_test_seen.shape, y_test_seen.shape, X_test.shape)

y_pred = model(X_test_seen)
tf.keras.metrics.mean_squared_error(y_test_seen, y_pred)

In [None]:
# Baseline: predict each test row's rating as the mean training rating of its
# movie, i.e. treat every user as the average user. Movies with no training
# ratings fall back to a fixed prediction of 3.
# Performs better than the neural net above.

# numeric_only=True keeps the aggregation away from the string columns
# (title, gender, occupation), which newer pandas versions refuse to average.
avg_rating = df_train.groupby('movie_id').mean(numeric_only=True)

# Look up each test movie's mean rating in one vectorised step; movie ids
# missing from the training set map to NaN and are filled with the fallback.
y_pred = df_test['movie_id'].map(avg_rating['rating']).fillna(3).to_numpy()

r2_score(y_test,y_pred)

In [20]:
# Recast the task as binary classification: the inputs are the same
# (user_id, movie_id) pairs shifted down by one, and the target is the
# 'liked' flag.
id_cols = ['user_id', 'movie_id']
X_train, X_test = (frame[id_cols].to_numpy() - 1 for frame in (df_train, df_test))
y_train, y_test = (frame['liked'].to_numpy() for frame in (df_train, df_test))


In [21]:
# Same embedding model, trained as a classifier: its output is treated as a
# logit and optimised with binary cross-entropy. Training stops once val_loss
# has not improved for 4 epochs (best weights are restored).
model = collaborative_model(num_users, num_movie, 64)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=4,
                                              restore_best_weights=True)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=40,
          validation_split=0.1,
          callbacks=[early_stop])

# Learned embedding tables: one row per user / per movie.
umat = model.get_layer('users').get_weights()[0]
mmat = model.get_layer('movies').get_weights()[0]

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40


In [26]:
#Experiment - accuracy on test rows where the user and/or the movie never
#appears in the training set (cold start problem)
#holdout set
unseen_indices = ~(np.isin(df_test['movie_id'],df_train['movie_id']) & np.isin(df_test['user_id'],df_train['user_id']))
X_test_unseen = X_test[unseen_indices]
y_test_unseen = y_test[unseen_indices]
print(X_test_unseen.shape,y_test_unseen.shape,X_test.shape)
acc = tf.keras.metrics.Accuracy()
# The model emits logits, so a positive logit is a predicted "liked" (1).
y_pred = tf.cast(model(X_test_unseen)>0 , tf.int32)
# Keras metrics take (y_true, y_pred), in that order.
acc(y_test_unseen,y_pred)

(18300, 2) (18300,) (22015, 2)


<tf.Tensor: shape=(), dtype=float32, numpy=0.4947541>

In [27]:
#Experiment - accuracy on test rows where both the user and the movie also
#appear in the training set
seen_indices = np.isin(df_test['movie_id'],df_train['movie_id']) & np.isin(df_test['user_id'],df_train['user_id'])
X_test_seen = X_test[seen_indices]
y_test_seen = y_test[seen_indices]
print(X_test_seen.shape,y_test_seen.shape,X_test.shape)
acc = tf.keras.metrics.Accuracy()
# The model emits logits, so a positive logit is a predicted "liked" (1).
y_pred = tf.cast(model(X_test_seen)>0 , tf.int32)
# Keras metrics take (y_true, y_pred), in that order.
acc(y_test_seen,y_pred)

(3715, 2) (3715,) (22015, 2)


<tf.Tensor: shape=(), dtype=float32, numpy=0.8379542>