In [None]:
from functools import partial
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import polars as pl
import tensorflow as tf
import keras

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# URL of the MovieLens "ml-latest" archive and of its published MD5 checksum file.
DATASET_LOCATION = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip'
DATASET_HASH = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip.md5'

# Working data directory: `data`, one level above the current working directory.
dst_d = Path.cwd().joinpath('..', 'data')

In [None]:
def download_lens_data(source, destination, timeout=60, chunk_size=1 << 20):
    """Download a MovieLens archive into a local directory.

    Args:
        source: URL of the file to fetch. Its final path component is used
            as the local file name.
        destination: Directory to store the file in. It is created
            (including parents) when it does not exist yet.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes written to disk per streamed chunk.

    Returns:
        Path of the downloaded file inside ``destination``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is left at the returned location in that case.
    """
    dst_dir = Path(destination)
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst_f = dst_dir / Path(source).name
    # Stream into a sibling ".part" file so that an interrupted download never
    # leaves a truncated archive under the final name.
    partial_f = dst_f.with_name(dst_f.name + ".part")
    try:
        with requests.get(source, stream=True, timeout=timeout) as r:
            # Fail loudly instead of returning a path that was never written.
            r.raise_for_status()
            with open(partial_f, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        partial_f.replace(dst_f)
    finally:
        # Only present when the download failed part-way through.
        if partial_f.exists():
            partial_f.unlink()
    return dst_f

# Fetch the archive into the data directory; despite the plural name this is a single path.
downloaded_files = download_lens_data(source=DATASET_LOCATION, destination=dst_d)

def unzip_files(zipped_file,destination):
    """Extract a MovieLens archive and locate its ratings and movies tables.

    The whole archive is unpacked into ``destination``. The two CSV files are
    expected inside a folder named after the archive (file name without its
    extension).

    Returns:
        ``[ratings_path, movies_path]``, in that order. The paths are derived
        from the archive name; they are not checked for existence.
    """
    extracted_root = Path(destination) / Path(zipped_file).stem

    with zipfile.ZipFile(zipped_file, mode='r') as archive:
        archive.extractall(destination)

    return [extracted_root / csv_name for csv_name in ("ratings.csv", "movies.csv")]

# Unpack the archive, then show whether each expected CSV is actually on disk.
ratings, movies = unzip_files(zipped_file=downloaded_files, destination=dst_d)
list(map(os.path.exists, (ratings, movies)))

In [None]:
# Load the ratings table, drop the unused timestamp column, and add zero-based
# contiguous ids for movies and users (dense ranks start at 1, hence the -1).
ratings_df = (
    pl.read_csv(ratings)
    .drop("timestamp")
    .with_columns((pl.col('movieId').rank(method="dense") - 1).alias("movie_encoding"))
    .with_columns((pl.col('userId').rank(method="dense") - 1).alias("user_encoding"))
)
ratings_df.head()

In [None]:
# Vocabulary sizes for the two embedding tables and the observed rating range.
rating_col = ratings_df['rating']
n_users = ratings_df['user_encoding'].n_unique()
n_movies = ratings_df['movie_encoding'].n_unique()
max_rating = rating_col.max()
min_rating = rating_col.min()

print(n_users, n_movies, max_rating, min_rating)

# Features are (user index, movie index) pairs; the target is the raw rating.
X = ratings_df.select('user_encoding', 'movie_encoding').to_numpy()
y = rating_col.to_numpy()

# Hold out 10% of the ratings; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=50)
# Not displayed: only the last expression of a cell is echoed.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# The model has two separate inputs, so split the index columns apart.
X_train_array = [X_train[:, col] for col in range(2)]
X_test_array = [X_test[:, col] for col in range(2)]

# Min-max scale the targets into [0, 1].
rating_span = max_rating - min_rating
y_train = (y_train - min_rating) / rating_span
y_test = (y_test - min_rating) / rating_span

# Width of each user / movie embedding vector.
n_factors = 100

In [5]:
## Input layer for users: one encoded user index per example
user = tf.keras.layers.Input(shape = (1,))

## Embedding layer mapping each user index to an n_factors-dimensional vector
u = tf.keras.layers.Embedding(n_users, n_factors, embeddings_initializer = 'he_normal', embeddings_regularizer = tf.keras.regularizers.l2(1e-6))(user)
u = tf.keras.layers.Reshape((n_factors,))(u)

## Input layer for movies: one encoded movie index per example
movie = tf.keras.layers.Input(shape = (1,))

## Embedding layer mapping each movie index to an n_factors-dimensional vector
m = tf.keras.layers.Embedding(n_movies, n_factors, embeddings_initializer = 'he_normal', embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(movie)
m = tf.keras.layers.Reshape((n_factors,))(m)

## Stacking up both user and movie embeddings
x = tf.keras.layers.Concatenate()([u,m])
x = tf.keras.layers.Dropout(0.05)(x)

## Two small fully connected layers on top of the concatenated embeddings
x = tf.keras.layers.Dense(32, kernel_initializer='he_normal')(x)
x = tf.keras.layers.Activation(activation='relu')(x)
x = tf.keras.layers.Dropout(0.05)(x)

x = tf.keras.layers.Dense(16, kernel_initializer='he_normal')(x)
x = tf.keras.layers.Activation(activation='relu')(x)
x = tf.keras.layers.Dropout(0.05)(x)

## Output layer: a single sigmoid unit, so predictions lie in [0, 1] exactly like
## the min-max scaled targets (y_train / y_test) the model is fitted on.
## A softmax head with SparseCategoricalCrossentropy would require integer class
## indices as labels, which the scaled float targets are not.
x = tf.keras.layers.Dense(1)(x)
x = tf.keras.layers.Activation(activation='sigmoid')(x)

## To express a prediction on the original rating scale, invert the scaling:
##   rating = prediction * (max_rating - min_rating) + min_rating

## Defining the model
model = tf.keras.models.Model(inputs=[user,movie], outputs=x)

## Compiling the model: mean squared error on the scaled rating, with mean
## absolute error reported as a more readable metric
model.compile(optimizer='sgd', loss=tf.keras.losses.MeanSquaredError(), metrics=['mae'])

In [None]:
# Print the layers of the model built above as a quick sanity check.
model.summary()

In [None]:
# Multiply the learning rate by 0.75 whenever val_loss has not improved for
# 3 epochs, never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.75,
    patience=3,
    min_lr=0.000001,
    verbose=1,
)

# Train on the 90% split; the held-out 10% is used as validation data.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=4096,
    epochs=70,
    verbose=1,
    validation_data=(X_test_array, y_test),
    shuffle=True,
    callbacks=[reduce_lr],
)
