In [1]:
import os
import pprint
import tempfile
import itertools

from typing import Dict, Text

import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.preprocessing.sequence import pad_sequences

2022-06-02 11:30:53.961151: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-02 11:30:53.961191: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [72]:
# Global vars

MAX_HISTORY = 5

# Load both dataset and convert to tfds object

In [14]:
def slices(features):
  for i in itertools.count():
    # For each feature take index `i`
    example = {name:values[i] for name, values in features.items()}
    yield example

In [16]:
courses = pd.read_csv("courses.csv")
unique_course_names = courses["name"].tolist()

courses_dict = {
  "course_name": np.array(unique_course_names)
}

for example in slices(courses_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

tfds_courses = tf.data.Dataset.from_tensor_slices(courses_dict)
tfdsmap_courses = tfds_courses.map(lambda x: x['course_name'])
tfdsmap_courses

course_name        : Belajar Fundamental Aplikasi Android


<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [32]:
# Load and clean data
useract = pd.read_csv("user_activities.csv")
useract = useract.drop(columns=['id'])
useract = useract.dropna(how='all')

def mergeall(rowdata):
    aggregated_datas = ",".join([str(s) for s in list(rowdata.values) if not pd.isna(s)])
    return aggregated_datas.split(",")

def mergecourse(rowdata):
    aggregated_datas = ",".join([str(s) for s in [rowdata.graduated_courses, rowdata.on_progress_courses] if not pd.isna(s)])
    return aggregated_datas.split(",")

merged_datas = list(useract.apply(mergeall, axis = 1))
merged_courses = list(useract.apply(mergecourse, axis = 1))

agumented_datas_dict = {
  'taken_courses': [],
  'recommendation': []
}

# for each merged course, 
#   for each course in merged course,
#     pick 1 as output, lainya jadi input
#     push as new entry
# TODO: 
#   This split is not too well-defined on merged courses with length longer than MAX_HISTORY.
#   This is because after permutating possible input, trailing courses are cut off at token padding anyway.

for i in range(len(merged_datas)):
  merged_data = merged_datas[i]
  merged_course = merged_courses[i]
    
  for course in merged_course:
    #Simpen dulu buat nanti kalo mau ditambahin ingfo non-courses
    #agumented_datas_dict['x'].append([data for data in merged_data if data != course])
    agumented_datas_dict['taken_courses'].append([data for data in merged_course if data != course])
    agumented_datas_dict['recommendation'].append(course)

for example in slices(agumented_datas_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

taken_courses      : ['Bangkit General Assessment', 'Belajar Dasar Pemrograman Web', 'Belajar Fundamental Aplikasi Android']
recommendation     : Bangkit Android Assessment


In [73]:
# Tokenize manual karena usecasenya agak aneh

def get_tokenizer_dict(train_list):
  return dict(zip(train_list, [i+1 for i in range(len(train_list))]))

def pad_tokens(input_sequences, maxlen):
  return np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))

def tokenize(tokenizer_dict, corpus, maxlen = 5):
  output = []
  for line in corpus:
    tokenizedline = [tokenizer_dict.get(entry, 0) for entry in line]
    output.append(tokenizedline)
    
  return pad_tokens(output, maxlen)


tokenizer_dict = get_tokenizer_dict(unique_course_names)
tokenized_datas = tokenize(tokenizer_dict, agumented_datas_dict["taken_courses"], MAX_HISTORY)
tokenized_datas_dict = {
  'taken_courses': tokenized_datas,
  'recommendation': agumented_datas_dict["recommendation"]
}

for example in slices(tokenized_datas_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

tfds_tokenized_data = tf.data.Dataset.from_tensor_slices(tokenized_datas_dict)
tfdsmap_tokenized_data = tfds_tokenized_data.map(lambda x: {
  'taken_courses': x['taken_courses'],
  'recommendation': x['recommendation']
})
tfdsmap_tokenized_data

taken_courses      : [0 0 0 9 1]
recommendation     : Bangkit Android Assessment


<MapDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [46]:
print(len(tfdsmap_courses))
print(len(tfdsmap_tokenized_data))

48
1719


In [48]:
tf.random.set_seed(69420)
shuffled = tfdsmap_tokenized_data.shuffle(len(tfdsmap_tokenized_data), seed=69420, reshuffle_each_iteration=False)

train_len = math.floor(len(tfdsmap_tokenized_data) * 0.9)
test_len = len(tfdsmap_tokenized_data) - train_len

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

print(train, len(train))
print(test, len(test))

<TakeDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}> 1547
<TakeDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}> 172


In [63]:
embedding_dimension = 32

#https://www.tensorflow.org/recommenders/examples/multitask/

user_properties_model = tf.keras.Sequential([
  tf.keras.layers.Embedding(len(unique_course_names)+1, 32, input_length=5),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(embedding_dimension, activation='relu')
])

course_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_course_names, mask_token=None),
  tf.keras.layers.Embedding(len(unique_course_names) + 1, embedding_dimension),
  # tf.keras.layers.GRU(embedding_dimension)
])

task = tfrs.tasks.Retrieval(
  metrics=tfrs.metrics.FactorizedTopK(
    candidates=tfdsmap_courses.batch(16).map(course_model)
  )
)

In [64]:
class Model(tfrs.Model):
  def __init__(self, user_model, course_model):
    super().__init__()
    self.course_model: tf.keras.Model = course_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[tf.Tensor, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["taken_courses"])

    # And pick out the course features and pass them into the course model,
    # getting embeddings back.
    positive_course_embeddings = self.course_model(features["recommendation"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_course_embeddings)

In [65]:
model = Model(user_properties_model, course_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(len(tfdsmap_tokenized_data)).batch(400).cache()
cached_test = test.batch(40).cache()

user_properties_model.summary()
course_model.summary()


Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 5, 32)             1568      
                                                                 
 flatten (Flatten)           (None, 160)               0         
                                                                 
 dense_12 (Dense)            (None, 100)               16100     
                                                                 
 dense_13 (Dense)            (None, 32)                3232      
                                                                 
Total params: 20,900
Trainable params: 20,900
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup_7 (StringLo

In [67]:

model.fit(cached_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

2022-06-02 14:34:42.643261: W tensorflow/core/data/root_dataset.cc:247] Optimization loop failed: CANCELLED: Operation was cancelled


Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fab1044d760>

In [68]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.06395348906517029,
 'factorized_top_k/top_5_categorical_accuracy': 0.5058139562606812,
 'factorized_top_k/top_10_categorical_accuracy': 0.6511628031730652,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 48.07685089111328,
 'regularization_loss': 0,
 'total_loss': 48.07685089111328}

# Making predicitons


In [69]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# recommends courses out of the entire courses dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((
    tfdsmap_courses.batch(10), 
    tfdsmap_courses.batch(10).map(model.course_model)
  ))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fab10619af0>

In [88]:

# Get recommendations.
taken_courses = [
  "Belajar Dasar Pemrograman Web,Meniti Karier sebagai Software Developer",
  "Belajar Membuat Augmented Reality dengan Lens Studio",
  "Belajar Dasar Pemrograman JavaScript",
  "Belajar Fundamental Front-End Web Development",
  "Belajar Membuat Aplikasi Android untuk Pemula"
]
inputdata = tokenize(tokenizer_dict, [taken_courses], MAX_HISTORY)
print(inputdata)

_, course_names = index(inputdata)
print("Recommendation:")
for course_name in course_names[0, :4]:
  tf.print(course_name)

[[ 0 38 30 15  3]]
Recommendation:
Memulai Pemrograman Dengan Java
Belajar Dasar UX Design
Belajar Membuat Aplikasi Back-End untuk Pemula dengan Google Cloud
Belajar Fundamental Aplikasi Back-End
