In [2]:
import os
import pprint
import tempfile
import itertools

from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [3]:
# Directory holding the course-recommender CSV files. Override it with the
# COURSE_DATA_DIR environment variable; the default is the original location.
DATA_DIR = os.environ.get(
    'COURSE_DATA_DIR',
    '/home/moseshubert/Documents/bangkit_project/dataset/course_recommender_dataset',
)

# Load the course catalogue and keep only the course name column
# (renamed from `name`). Chaining avoids mutating a frame in place.
df_courses = (
    pd.read_csv(os.path.join(DATA_DIR, 'courses.csv'))
    .rename(columns={'name': 'course_name'})
    [['course_name']]
)
df_courses.head()

Unnamed: 0,course_name
0,Belajar Fundamental Aplikasi Android
1,Belajar Membangun LINE Chatbot
2,Belajar Membuat Aplikasi Android untuk Pemula
3,Memulai Pemrograman Dengan Java
4,Memulai Pemrograman Dengan Kotlin


In [4]:
# Convert every DataFrame column into a NumPy array, keyed by column name.
courses_dict = dict(
    (column, np.array(series)) for column, series in df_courses.items()
)

In [5]:
def slices(features):
  """Yield one example dict per row of a column-oriented feature mapping.

  Args:
    features: Mapping of feature name -> indexable, sized sequence
      (e.g. a NumPy array) holding that feature's values.

  Yields:
    A dict mapping each feature name to its value at row `i`, for
    `i = 0, 1, ...` up to the length of the shortest sequence.
  """
  # Bound the iteration by the shortest column so that exhausting the
  # generator terminates cleanly instead of raising IndexError (an unbounded
  # counter would index past the end). An empty mapping yields nothing.
  num_rows = min((len(values) for values in features.values()), default=0)
  for i in range(num_rows):
    # For each feature take index `i`
    yield {name: values[i] for name, values in features.items()}

In [6]:
# Pull only the first example out of the generator and print its fields.
example = next(slices(courses_dict))
for name, value in example.items():
  print(f"{name:19s}: {value}")

course_name        : Belajar Fundamental Aplikasi Android


In [7]:
# Build a tf.data dataset from the column arrays; each element is a dict
# keyed by column name (here only 'course_name').
courses = tf.data.Dataset.from_tensor_slices(courses_dict)

In [8]:
# Display the dataset repr to check its element_spec.
courses

<TensorSliceDataset element_spec={'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [9]:
# Directory holding the course-recommender CSV files. Override it with the
# COURSE_DATA_DIR environment variable; the default is the original location.
DATA_DIR = os.environ.get(
    'COURSE_DATA_DIR',
    '/home/moseshubert/Documents/bangkit_project/dataset/course_recommender_dataset',
)

# Load the user/course interactions, rename `id` to `user_id`, and cast the id
# to str so it can later serve as a string vocabulary entry. Chaining avoids
# mutating a frame in place.
df_users = (
    pd.read_csv(os.path.join(DATA_DIR, 'users.csv'))
    .rename(columns={'id': 'user_id'})
    .astype({'user_id': str}, errors='raise')
)
df_users.head()

Unnamed: 0,user_id,course_name
0,378107,Menjadi Game Developer Expert
1,378107,Belajar Membuat Aplikasi Android untuk Pemula
2,378107,Belajar Membuat Game untuk Pemula
3,378107,Membangun Progressive Web Apps
4,378107,Belajar Dasar Pemrograman Web


In [10]:
# Confirm that `user_id` is now a string (object) column after the cast.
df_users.dtypes

user_id        object
course_name    object
dtype: object

In [11]:
# Convert every DataFrame column into a NumPy array, keyed by column name.
users_dict = dict(
    (column, np.array(series)) for column, series in df_users.items()
)

In [12]:
# Pull only the first example out of the generator and print its fields.
example = next(slices(users_dict))
for name, value in example.items():
  print(f"{name:19s}: {value}")

user_id            : 378107
course_name        : Menjadi Game Developer Expert


In [13]:
# Build a tf.data dataset of interactions; each element is a dict with the
# 'user_id' and 'course_name' of one row.
users = tf.data.Dataset.from_tensor_slices(users_dict)

In [14]:
# Display the dataset repr to check its element_spec.
users

<TensorSliceDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [15]:
# Show the concrete dataset class of both datasets, one per line.
print(type(courses), type(users), sep="\n")

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


In [16]:
# Number of interaction rows, then number of courses, one per line.
print(len(users), len(courses), sep="\n")

4271
48


In [17]:
def _select_course_name(row):
  """Reduce a course row dict to its bare course name."""
  return row['course_name']


def _select_interaction_features(row):
  """Keep only the two features the model consumes from an interaction row."""
  return {
      'user_id': row['user_id'],
      'course_name': row['course_name'],
  }


# NOTE(review): both names are rebound to the mapped datasets, so re-running
# this cell without rebuilding `courses` first will likely fail - confirm.
courses = courses.map(_select_course_name)

users = users.map(_select_interaction_features)

In [18]:
# After the map, each element is a scalar course-name string rather than a dict.
courses

<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [19]:
# Fix the seeds and disable per-iteration reshuffling so that `train` and
# `test` are drawn from one consistent ordering and never overlap.
tf.random.set_seed(1234)
shuffled = users.shuffle(len(users), seed=1234, reshuffle_each_iteration=False)

# The first `n_train` shuffled rows are used for training; everything that
# remains is the test split. Deriving the test size from the dataset length
# (instead of a hard-coded 271) keeps the split complete if the data changes.
n_train = 4_000
n_test = len(users) - n_train

train = shuffled.take(n_train)
test = shuffled.skip(n_train).take(n_test)

In [20]:
# Display the training split's repr to check its element_spec.
train

<TakeDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'course_name': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [21]:
# Batch the course names (25 per batch) so they can be gathered into a vocabulary.
course_names = courses.batch(25)
# Batch the interactions and keep only the user ids. 40_000 exceeds the 4271
# rows reported earlier in the notebook, so this yields a single batch.
user_ids = users.batch(40_000).map(lambda x: x['user_id'])

In [22]:
# Display the batched dataset repr to check its element_spec.
course_names

<BatchDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

In [23]:
# Materialise every batch as a NumPy array, flatten the batches into one array
# and deduplicate it to obtain the vocabularies for the lookup layers.
course_name_batches = [batch.numpy() for batch in course_names]
user_id_batches = [batch.numpy() for batch in user_ids]

unique_course_names = np.unique(np.concatenate(course_name_batches))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

unique_course_names[:10]

array([b'Architecting on AWS (Membangun Arsitektur Cloud di AWS)',
       b'Belajar Dasar Git dengan GitHub', b'Belajar Dasar Google Cloud',
       b'Belajar Dasar Pemrograman JavaScript',
       b'Belajar Dasar Pemrograman Web', b'Belajar Dasar UX Design',
       b'Belajar Dasar Visualisasi Data',
       b'Belajar Dasar-Dasar Azure Cloud',
       b'Belajar Fundamental Aplikasi Android',
       b'Belajar Fundamental Aplikasi Back-End'], dtype=object)

In [24]:
# Peek at the first ten entries of the user-id vocabulary.
unique_user_ids[:10]

array([b'378101', b'378104', b'378107', b'378110', b'378113', b'378116',
       b'378119', b'378122', b'378125', b'378128'], dtype=object)

In [25]:
# Vocabulary sizes: distinct users, then distinct courses, one per line.
print(len(unique_user_ids), len(unique_course_names), sep="\n")

635
48


In [26]:
# Output width of the embedding layers; the user and course models below both use it.
embedding_dimension = 32

In [27]:
# User model: raw user-id string -> vocabulary index -> embedding vector.
user_model = tf.keras.Sequential()
user_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
)
# One extra embedding row beyond the vocabulary accounts for unknown tokens.
user_model.add(
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
)

In [28]:
# Course model: raw course-name string -> vocabulary index -> embedding vector.
course_model = tf.keras.Sequential()
course_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_course_names, mask_token=None)
)
# One extra embedding row beyond the vocabulary, mirroring the user model.
course_model.add(
    tf.keras.layers.Embedding(len(unique_course_names) + 1, embedding_dimension)
)

In [29]:
# Top-K retrieval metrics. The candidate set is the embedding of every course,
# obtained by mapping `course_model` over batches of 16 course names.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=courses.batch(16).map(course_model)
)

In [30]:
# Retrieval task configured with the FactorizedTopK metrics defined in `metrics`.
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [31]:
class Model(tfrs.Model):
  """Retrieval model that embeds users and courses and feeds both to a task.

  Args:
    user_model: Keras model mapping raw user ids to embeddings.
    course_model: Keras model mapping raw course names to embeddings.
    retrieval_task: Optional task layer used to compute the loss and metrics.
      When omitted, the notebook-level `task` object is used, which is the
      behaviour this class had before the parameter existed.
  """

  def __init__(self, user_model, course_model, retrieval_task=None):
    super().__init__()
    self.course_model: tf.keras.Model = course_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly injected task; fall back to the global `task` so
    # existing `Model(user_model, course_model)` calls keep working.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the task loss for one batch of interactions.

    Args:
      features: Batch dict containing the "user_id" and "course_name" entries.
      training: Accepted for the Keras/TFRS interface; not used here.

    Returns:
      The value returned by the task for the user and course embeddings.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the course features and pass them into the course model,
    # getting embeddings back.
    positive_course_embeddings = self.course_model(features["course_name"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_course_embeddings)

In [32]:
# Assemble the retrieval model from the two embedding models and compile it
# with an Adagrad optimizer.
model = Model(user_model=user_model, course_model=course_model)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

In [33]:
# Shuffle, batch (400 rows) and cache the training split.
# NOTE(review): `.cache()` comes after `.shuffle(...)`, which presumably freezes
# the batch order after the first epoch - confirm whether per-epoch
# reshuffling is wanted.
cached_train = train.shuffle(len(users)).batch(400).cache()
# Batch (40 rows) and cache the test split; no shuffle is applied here.
cached_test = test.batch(40).cache()

In [34]:
# Train for 3 epochs on the cached, batched training split.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff658157b80>

In [35]:
# Evaluate on the test split; return_dict=True returns metrics as a name -> value dict.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.06273062527179718,
 'factorized_top_k/top_5_categorical_accuracy': 0.2804428040981293,
 'factorized_top_k/top_10_categorical_accuracy': 0.5498154759407043,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 105.45509338378906,
 'regularization_loss': 0,
 'total_loss': 105.45509338378906}

In [36]:
# Create a brute-force index that takes in raw query features (user ids), and
# recommends courses out of the entire courses dataset.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Candidates are (course name, course embedding) pairs; both sides use the
# same batch size so names and embeddings stay aligned.
index.index_from_dataset(
  tf.data.Dataset.zip((courses.batch(10), courses.batch(10).map(model.course_model)))
)

user_id = 378101
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378101: [b'Menjadi Azure Cloud Developer' b'Belajar Dasar Pemrograman Web'
 b'Belajar Fundamental Aplikasi Flutter']


In [37]:
user_id = 378104
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378104: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [38]:
user_id = 378107
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378107: [b'Belajar Membuat Aplikasi Back-End untuk Pemula'
 b'Belajar Dasar Pemrograman JavaScript'
 b'Belajar Dasar Git dengan GitHub']


In [39]:
user_id = 378110
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378110: [b'Memulai Pemrograman Dengan Java'
 b'Belajar Fundamental Aplikasi Android'
 b'Menjadi Android Developer Expert']


In [40]:
user_id = 378113
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 378113: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [41]:
# Re-inspect the first ten known user ids (the ids queried in the cells above).
unique_user_ids[:10]

array([b'378101', b'378104', b'378107', b'378110', b'378113', b'378116',
       b'378119', b'378122', b'378125', b'378128'], dtype=object)

In [48]:
# Inspect the tail of the user-id vocabulary to find the last known ids.
unique_user_ids[600:]

array([b'380981', b'380990', b'380993', b'381005', b'381011', b'381014',
       b'381023', b'381026', b'381044', b'381047', b'381056', b'381059',
       b'381062', b'381068', b'381071', b'381077', b'381080', b'381083',
       b'381086', b'381089', b'381092', b'381095', b'381098', b'381101',
       b'381104', b'381107', b'381110', b'381113', b'381116', b'381125',
       b'381128', b'381137', b'381140', b'381143', b'381146'],
      dtype=object)

In [49]:
# 381146 is the last id listed in the user-id vocabulary.
user_id = 381146
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381146: [b'Menjadi Android Developer Expert'
 b'Belajar Fundamental Aplikasi Flutter'
 b'Menjadi Front-End Web Developer Expert']


In [53]:
# NOTE(review): 381147 does not appear in the `unique_user_ids` tail printed
# in this notebook (last id is 381146), so it is presumably treated as an
# out-of-vocabulary user and receives generic recommendations - confirm.
user_id = 381147
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381147: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [50]:
# NOTE(review): 381150 does not appear in the `unique_user_ids` tail printed
# in this notebook (last id is 381146), so it is presumably treated as an
# out-of-vocabulary user and receives generic recommendations - confirm.
user_id = 381150
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381150: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [51]:
# NOTE(review): 381152 does not appear in the `unique_user_ids` tail printed
# in this notebook (last id is 381146), so it is presumably treated as an
# out-of-vocabulary user and receives generic recommendations - confirm.
user_id = 381152
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381152: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [52]:
# NOTE(review): 381155 does not appear in the `unique_user_ids` tail printed
# in this notebook (last id is 381146), so it is presumably treated as an
# out-of-vocabulary user and receives generic recommendations - confirm.
user_id = 381155
# Get the top recommendations for this user from the brute-force index.
_, course_name = index(np.array([str(user_id)]))
# The f-string is already fully formatted; a trailing .format() call would
# re-parse the rendered text and break on any literal '{' or '}' in it.
print(f"Recommendations for user {user_id}: {course_name[0, :3]}")

Recommendations for user 381155: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']
