In [2]:
import os
import pprint
import tempfile
import itertools

from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

2022-05-31 13:11:37.568142: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-31 13:11:37.568183: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Load both datasets and convert them to tf.data.Dataset objects

In [51]:
# Read the course catalogue and keep only its name column, exposed under the
# `course_name` key used by the rest of the notebook.
df_courses = (
  pd.read_csv('./courses.csv')
  .rename(columns={'name': 'course_name'})
  [['course_name']]
)
df_courses.head()

Unnamed: 0,course_name
0,Belajar Fundamental Aplikasi Android
1,Belajar Membangun LINE Chatbot
2,Belajar Membuat Aplikasi Android untuk Pemula
3,Memulai Pemrograman Dengan Java
4,Memulai Pemrograman Dengan Kotlin


In [52]:
# Turn the courses frame into a {column name: NumPy array} mapping, the layout
# that `tf.data.Dataset.from_tensor_slices` is fed with further down.
courses_dict = dict(
  (column, np.array(series)) for column, series in df_courses.items()
)

def slices(features):
  """Yield one example dict per row of a column-oriented feature mapping.

  Args:
    features: Mapping from feature name to an indexable/iterable column
      (e.g. a NumPy array or list). Columns are walked in lockstep.

  Yields:
    A dict `{name: column[i]}` for each row index `i`.

  Iteration ends when the shortest column is exhausted, and an empty mapping
  yields nothing. (Previously the generator counted forever and surfaced an
  IndexError once it ran past the end of a column.)
  """
  names = list(features)
  # zip() walks every column together and stops cleanly at the shortest one.
  for row in zip(*features.values()):
    yield dict(zip(names, row))

# Preview only the first example: islice caps the generator at one item.
for example in itertools.islice(slices(courses_dict), 1):
  for name, value in example.items():
    print(f"{name:19s}: {value}")

# Wrap the column dictionary in a tf.data pipeline (one element per course).
courses = tf.data.Dataset.from_tensor_slices(courses_dict)
courses

course_name        : Belajar Fundamental Aplikasi Android


<TensorSliceDataset element_spec={'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [53]:
# Read the user/course interactions, expose `id` as `user_id`, and store the
# ids as strings.
df_users = (
  pd.read_csv('./users.csv')
  .rename(columns={'id': 'user_id'})
  .astype({'user_id': str}, errors='raise')
)
df_users.head()

Unnamed: 0,user_id,course_name
0,378107,Menjadi Game Developer Expert
1,378107,Belajar Membuat Aplikasi Android untuk Pemula
2,378107,Belajar Membuat Game untuk Pemula
3,378107,Membangun Progressive Web Apps
4,378107,Belajar Dasar Pemrograman Web


In [54]:
# Column-oriented view of the interactions: {column name: NumPy array}.
users_dict = dict(
  (column, np.array(series)) for column, series in df_users.items()
)

# Preview only the first interaction: islice caps the generator at one item.
for example in itertools.islice(slices(users_dict), 1):
  for name, value in example.items():
    print(f"{name:19s}: {value}")

# Wrap the column dictionary in a tf.data pipeline (one element per interaction).
users = tf.data.Dataset.from_tensor_slices(users_dict)
users


user_id            : 378107
course_name        : Menjadi Game Developer Expert


<TensorSliceDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [13]:
# Show the concrete dataset class of each pipeline, one per line.
print(type(courses), type(users), sep="\n")

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


In [14]:
# Number of interactions, then number of courses, one per line.
print(len(users), len(courses), sep="\n")

4271
48


In [55]:
# Reduce each dataset to the features the model consumes.
# Both pipelines are rebuilt from their source dictionaries rather than from
# the `courses` / `users` variables themselves, so this cell can be re-run:
# mapping the already-mapped `courses` again would index a string tensor with
# 'course_name' and fail.
courses = tf.data.Dataset.from_tensor_slices(courses_dict).map(
    lambda x: x['course_name']
)

# TODO: this map looks like a no-op for `users` (it selects the same two keys
# the elements already carry) - confirm and drop it if so.
users = tf.data.Dataset.from_tensor_slices(users_dict).map(lambda x: {
    'user_id': x['user_id'],
    'course_name': x['course_name']
})

print(type(courses))
print(type(users))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>
<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [16]:
# Deterministic train/test split of the interaction dataset.
split_seed = 1234
n_train = 4_000

tf.random.set_seed(split_seed)
n_total = len(users)
# A buffer as large as the dataset gives a full shuffle; disabling
# reshuffle_each_iteration keeps the order fixed so that `take` and `skip`
# below carve out disjoint slices.
shuffled = users.shuffle(n_total, seed=split_seed, reshuffle_each_iteration=False)

train = shuffled.take(n_train)
# The test split is whatever remains after the training slice. Its size is
# derived from the dataset length instead of being hard-coded, so no rows are
# silently dropped if users.csv grows (271 rows for the current 4271).
test = shuffled.skip(n_train).take(max(n_total - n_train, 0))

print(train)
print(test)

<TakeDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>
<TakeDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>


In [17]:
# Course names grouped into batches of 25.
course_names = courses.batch(25)
# Interactions grouped into batches of up to 40_000, keeping only the user id.
user_ids = (
  users
  .batch(40_000)
  .map(lambda batch: batch['user_id'])
)

In [18]:
# Display the batched course-name dataset to inspect its element spec.
course_names

<BatchDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

In [19]:
# Build the lookup vocabularies: materialise every batch as a NumPy array,
# join the batches, and de-duplicate.
unique_course_names = np.unique(
  np.concatenate([batch.numpy() for batch in course_names])
)
unique_user_ids = np.unique(
  np.concatenate([batch.numpy() for batch in user_ids])
)

unique_course_names[:10]

array([b'Architecting on AWS (Membangun Arsitektur Cloud di AWS)',
       b'Belajar Dasar Git dengan GitHub', b'Belajar Dasar Google Cloud',
       b'Belajar Dasar Pemrograman JavaScript',
       b'Belajar Dasar Pemrograman Web', b'Belajar Dasar UX Design',
       b'Belajar Dasar Visualisasi Data',
       b'Belajar Dasar-Dasar Azure Cloud',
       b'Belajar Fundamental Aplikasi Android',
       b'Belajar Fundamental Aplikasi Back-End'], dtype=object)

In [20]:
# Vocabulary sizes (users, then courses), followed by a peek at the first ids.
print(len(unique_user_ids), len(unique_course_names), sep="\n")
unique_user_ids[:10]

635
48


array([b'378101', b'378104', b'378107', b'378110', b'378113', b'378116',
       b'378119', b'378122', b'378125', b'378128'], dtype=object)

In [56]:
embedding_dimension = 32

# Reference: https://www.tensorflow.org/recommenders/examples/multitask/

# Query tower: raw user id string -> vocabulary index -> embedding vector.
user_model = tf.keras.Sequential()
user_model.add(
  tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
)
# One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
user_model.add(
  tf.keras.layers.Embedding(
    input_dim=len(unique_user_ids) + 1,
    output_dim=embedding_dimension,
  )
)

# Candidate tower: raw course name string -> vocabulary index -> embedding vector.
course_model = tf.keras.Sequential()
course_model.add(
  tf.keras.layers.StringLookup(vocabulary=unique_course_names, mask_token=None)
)
course_model.add(
  tf.keras.layers.Embedding(
    input_dim=len(unique_course_names) + 1,
    output_dim=embedding_dimension,
  )
)

# Retrieval task whose top-K metrics score against every course, embedded by
# the candidate tower in batches of 16.
task = tfrs.tasks.Retrieval(
  metrics=tfrs.metrics.FactorizedTopK(
    candidates=courses.batch(16).map(course_model),
  ),
)

In [24]:
class Model(tfrs.Model):
  """Two-tower retrieval model pairing a user tower with a course tower.

  Args:
    user_model: Keras model applied to the `user_id` feature.
    course_model: Keras model applied to the `course_name` feature.
    retrieval_task: Optional layer called with (user embeddings, course
      embeddings) to produce the loss. When omitted, the notebook-level `task`
      is used, so existing `Model(user_model, course_model)` calls keep working.
  """

  def __init__(self, user_model, course_model, retrieval_task=None):
    super().__init__()
    self.course_model: tf.keras.Model = course_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly supplied task; fall back to the global `task` only
    # for backward compatibility with the original two-argument constructor.
    self.task: tf.keras.layers.Layer = (
      task if retrieval_task is None else retrieval_task
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task's loss for one batch of interactions.

    Args:
      features: Batch dict providing the "user_id" and "course_name" entries.
      training: Accepted for interface compatibility; not used by this method.

    Returns:
      The value produced by calling the task on the two embedding batches.
    """
    # Embed the users of the batch with the user tower.
    user_embeddings = self.user_model(features["user_id"])

    # Embed the courses those users interacted with (the positive candidates).
    positive_course_embeddings = self.course_model(features["course_name"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_course_embeddings)

In [29]:
# Assemble the two-tower model and attach the optimizer.
model = Model(user_model, course_model)
model.compile(
  optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

# Input pipelines: shuffle the training rows, batch, then cache.
# NOTE(review): cache() sits after shuffle(), so the cached batches are
# presumably replayed in the same order every epoch - confirm this is intended.
cached_train = (
  train
  .shuffle(len(users))
  .batch(400)
  .cache()
)
cached_test = (
  test
  .batch(40)
  .cache()
)

model.fit(cached_train, epochs=3)

In [38]:
# Evaluate on the cached test batches; return_dict=True yields a
# {metric name: value} dict as the cell output.
# NOTE: the catalogue holds only 48 courses (see len(courses) earlier), so the
# top-50 and top-100 accuracies reported here are 1.0 by construction.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.04428044334053993,
 'factorized_top_k/top_5_categorical_accuracy': 0.31734317541122437,
 'factorized_top_k/top_10_categorical_accuracy': 0.6236162185668945,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 110.62801361083984,
 'regularization_loss': 0,
 'total_loss': 110.62801361083984}

# Making predictions


In [46]:
# Brute-force retrieval layer that takes raw user ids as queries (they are
# embedded by the trained user tower).
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index every course: each batch of 10 names is paired with its embeddings
# from the trained course tower, so lookups return course names.
index.index_from_dataset(
  courses.batch(10).map(lambda names: (names, model.course_model(names)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fd436047040>

In [62]:
# Print the learned embedding of one user id and of one course name.
inputdata = np.array(["378101"])
print(model.user_model.predict(inputdata))
print(model.course_model.predict(
  np.array(["Belajar Fundamental Aplikasi Android"])
))


[[-0.29238877  0.1896909   0.24672015 -0.18970405  0.31245732  0.48803166
   0.19648258 -0.63713247  0.32725298 -0.12039973  0.18609333  0.13593087
   0.57613707  0.01277212 -0.07386976  0.0501121  -0.10746235 -0.1267919
  -0.0871425   0.02395872  0.53917813  0.0156215   0.16900739 -0.21539979
  -0.38051724 -0.3160193   0.57798487 -0.05177205  0.09948194 -0.01651384
  -0.08244672 -0.44000137]]
[[-1.317824    0.1821487   0.11608754  0.8123598   1.0295552   0.75508887
  -0.29398802 -0.46622536  0.34544238  0.4443783   0.11546618 -0.4720895
   0.7208403  -0.6527897   0.5723107  -0.35173348 -0.2510147  -0.6380948
  -0.09940079  0.7515651   0.5717442   0.5541475  -0.5287965  -0.866028
  -0.815331    0.06582449  0.555884    0.90853626 -0.102878    0.5781376
  -0.02559072 -1.4059564 ]]


In [None]:
user_id = 378101
# Query the index with the id as a string (the vocabulary holds string ids);
# the first returned value is discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

In [None]:
user_id = 378104

# Query the index with the id as a string (the vocabulary holds string ids);
# the first returned value is discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378104: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [None]:
user_id = 378107
# Query the index with the id as a string (the vocabulary holds string ids);
# the first returned value is discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378107: [b'Belajar Membuat Aplikasi Back-End untuk Pemula'
 b'Belajar Dasar Pemrograman JavaScript'
 b'Belajar Dasar Git dengan GitHub']


In [None]:
user_id = 378110
# Query the index with the id as a string (the vocabulary holds string ids);
# the first returned value is discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378110: [b'Memulai Pemrograman Dengan Java'
 b'Belajar Fundamental Aplikasi Android'
 b'Menjadi Android Developer Expert']


In [None]:
user_id = 378113
# Query the index with the id as a string (the vocabulary holds string ids);
# the first returned value is discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378113: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [None]:
# Show the first ten entries of the user-id vocabulary (ids the model knows).
unique_user_ids[:10]

array([b'378101', b'378104', b'378107', b'378110', b'378113', b'378116',
       b'378119', b'378122', b'378125', b'378128'], dtype=object)

In [None]:
# Show the vocabulary from index 600 to the end, to see the last known ids.
unique_user_ids[600:]

array([b'380981', b'380990', b'380993', b'381005', b'381011', b'381014',
       b'381023', b'381026', b'381044', b'381047', b'381056', b'381059',
       b'381062', b'381068', b'381071', b'381077', b'381080', b'381083',
       b'381086', b'381089', b'381092', b'381095', b'381098', b'381101',
       b'381104', b'381107', b'381110', b'381113', b'381116', b'381125',
       b'381128', b'381137', b'381140', b'381143', b'381146'],
      dtype=object)

In [None]:
user_id = 381146
# 381146 is the last id listed in the vocabulary tail shown above.
# Query the index with the id as a string; the first returned value is
# discarded, the second holds the course names.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381146: [b'Menjadi Android Developer Expert'
 b'Belajar Fundamental Aplikasi Flutter'
 b'Menjadi Front-End Web Developer Expert']


In [None]:
user_id = 381147
# NOTE(review): 381147 is not in the vocabulary tail printed earlier (which
# ends at 381146), so it is presumably out-of-vocabulary and served by the
# shared unknown-token embedding - confirm.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381147: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381150
# NOTE(review): 381150 is not in the vocabulary tail printed earlier (which
# ends at 381146), so it is presumably out-of-vocabulary and served by the
# shared unknown-token embedding - confirm.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381150: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381152
# NOTE(review): 381152 is not in the vocabulary tail printed earlier (which
# ends at 381146), so it is presumably out-of-vocabulary and served by the
# shared unknown-token embedding - confirm.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381152: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381155
# NOTE(review): 381155 is not in the vocabulary tail printed earlier (which
# ends at 381146), so it is presumably out-of-vocabulary and served by the
# shared unknown-token embedding - confirm.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted. A trailing .format() was redundant
# and would misbehave if the rendered text ever contained '{' or '}'.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381155: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']
