In [28]:
import os
import pprint
import tempfile
import itertools

from typing import Dict, Text

import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [70]:
# Global vars

MAX_HISTORY = 5

# Load both dataset and convert to tfds object

In [71]:
def slices(features):
  for i in itertools.count():
    # For each feature take index `i`
    example = {name:values[i] for name, values in features.items()}
    yield example

## Load courses list
TODO: Course list disini sepertinya rada galengkap, token dari course" bangkit jadi nol semua

Output berupa objek MapDataset

In [72]:
courses = pd.read_csv("courses.csv")
unique_course_names = courses["name"].tolist()
unique_course_set = set(unique_course_names)

courses_dict = {
  "course_name": np.array(unique_course_names)
}

for example in slices(courses_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

tfds_courses = tf.data.Dataset.from_tensor_slices(courses_dict)
tfdsmap_courses = tfds_courses.map(lambda x: x['course_name'])
tfdsmap_courses

course_name        : Belajar Fundamental Aplikasi Android


<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [73]:

def get_tokenizer_dict(train_list):
  return dict(zip(train_list, [i+1 for i in range(len(train_list))]))

get_tokenizer_dict(unique_course_names)

{'Belajar Fundamental Aplikasi Android': 1,
 'Belajar Membangun LINE Chatbot': 2,
 'Belajar Membuat Aplikasi Android untuk Pemula': 3,
 'Memulai Pemrograman Dengan Java': 4,
 'Memulai Pemrograman Dengan Kotlin': 5,
 'Menjadi Azure Cloud Developer': 6,
 'Memulai Pemrograman Dengan Python': 7,
 'Memulai Pemrograman Dengan C': 8,
 'Belajar Dasar Pemrograman Web': 9,
 'Menjadi Google Cloud Engineer': 10,
 'Belajar Dasar-Dasar Azure Cloud': 11,
 'Memulai Pemrograman Dengan Swift': 12,
 'Belajar Membangun LINE Front-end Framework (LIFF)': 13,
 'Belajar Membuat Aplikasi Flutter untuk Pemula': 14,
 'Belajar Fundamental Front-End Web Development': 15,
 'Menjadi Android Developer Expert': 16,
 'Belajar Prinsip Pemrograman SOLID': 17,
 'Belajar Membuat Aplikasi iOS untuk Pemula': 18,
 'Belajar Dasar Visualisasi Data': 19,
 'Belajar Machine Learning untuk Pemula': 20,
 'Belajar Pengembangan Machine Learning': 21,
 'Memulai Pemrograman Dengan Dart': 22,
 'Belajar Fundamental Aplikasi Flutter': 23,


In [74]:
# Load and clean data
useract = pd.read_csv("user_activities.csv")
useract = useract.drop(columns=['id'])
useract = useract.dropna(how='all')

def mergeall(rowdata):
    aggregated_datas = ",".join([str(s) for s in list(rowdata.values) if not pd.isna(s)])
    return aggregated_datas.split(",")

def mergecourse(rowdata):
    aggregated_datas = ",".join([str(s) for s in [rowdata.graduated_courses, rowdata.on_progress_courses] if not pd.isna(s)])
    return aggregated_datas.split(",")

merged_datas = list(useract.apply(mergeall, axis = 1))
merged_courses = list(useract.apply(mergecourse, axis = 1))

agumented_datas_dict = {
  'taken_courses': [],
  'recommendation': []
}

# for each merged course, 
#   for each course in merged course,
#     pick 1 as output, lainya jadi input
#     push as new entry
# TODO: 
#   This split is not too well-defined on merged courses with length longer than MAX_HISTORY.
#   This is because after permutating possible input, trailing courses are cut off at token padding anyway.

for i in range(len(merged_datas)):
  merged_data = merged_datas[i]
  merged_course = [course for course in merged_courses[i] if course in unique_course_set] 
  
  # Sliding window sebesar MAX_HISTORY + 1 untuk dimasukkan permutasi output
  for j in range(max(len(merged_course)-MAX_HISTORY+2, 1)):
    merged_course_window = merged_course[j: min(len(merged_course), MAX_HISTORY + j +1)]

    # Permutasi data dalam window sebagai output. Untuk setiap n dalam S, Input: S-exclude-n, Output: n 
    for course in merged_course_window:
      #Simpen dulu buat nanti kalo mau ditambahin ingfo non-courses
      #agumented_datas_dict['x'].append([data for data in merged_data if data != course])
      x = [data for data in merged_course_window if data != course]
      y = course
      agumented_datas_dict['taken_courses'].append(x)
      agumented_datas_dict['recommendation'].append(y)

for example in slices(agumented_datas_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

taken_courses      : ['Belajar Fundamental Aplikasi Android']
recommendation     : Belajar Dasar Pemrograman Web


## Load user activities list
User Activities merujuk pada semua course yang pernah dan sedang diambil. Asumsinya adalah kalau banyak user yang ambil bebarengan, maka seharusnya course" tsb berkaitan.

Augmentasi data dilakukan sbb:
```
Jika course yang pernah diambil adalah [a, b, c, d, e]
output akan diaugmentasi menjadi:
x             | y
--------------+----
[a, b, c, d]  | e
[a, b, c, e]  | d
[a, b, d, e]  | c
[a, c, d, e]  | b
[b, c, d, e]  | a

```

Output berupa objek MapDataset

In [75]:
# Tokenize manual karena usecasenya agak aneh

def get_tokenizer_dict(train_list):
  return dict(zip(train_list, [i+1 for i in range(len(train_list))]))

def pad_tokens(input_sequences, maxlen):
  return np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))

def tokenize(tokenizer_dict, corpus, maxlen = 5):
  output = []
  for line in corpus:
    tokenizedline = [tokenizer_dict.get(entry, 0) for entry in line]
    output.append(tokenizedline)
    
  return pad_tokens(output, maxlen)


tokenizer_dict = get_tokenizer_dict(unique_course_names)
tokenized_datas = tokenize(tokenizer_dict, agumented_datas_dict["taken_courses"], MAX_HISTORY)
tokenized_datas_dict = {
  'taken_courses': tokenized_datas,
  'recommendation': agumented_datas_dict["recommendation"]
}

for example in slices(tokenized_datas_dict):
  for name, value in example.items():
    print(f"{name:19s}: {value}")
  break

tfds_tokenized_data = tf.data.Dataset.from_tensor_slices(tokenized_datas_dict)
tfdsmap_tokenized_data = tfds_tokenized_data.map(lambda x: {
  'taken_courses': x['taken_courses'],
  'recommendation': x['recommendation']
})
tfdsmap_tokenized_data

taken_courses      : [0 0 0 0 1]
recommendation     : Belajar Dasar Pemrograman Web


<MapDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [76]:
print(len(tfdsmap_courses))
print(len(tfdsmap_tokenized_data))

48
2881


In [77]:
tf.random.set_seed(42069)
shuffled = tfdsmap_tokenized_data.shuffle(len(tfdsmap_tokenized_data), seed=42069, reshuffle_each_iteration=False)

train_len = math.floor(len(tfdsmap_tokenized_data) * 0.9)
test_len = len(tfdsmap_tokenized_data) - train_len

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

print(train, len(train))
print(test, len(test))

<TakeDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}> 2592
<TakeDataset element_spec={'taken_courses': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'recommendation': TensorSpec(shape=(), dtype=tf.string, name=None)}> 289


In [78]:
embedding_dimension = 32

#https://www.tensorflow.org/recommenders/examples/multitask/

user_properties_model = tf.keras.Sequential([
  tf.keras.layers.Embedding(len(unique_course_names)+1, 32, input_length=MAX_HISTORY),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(150, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(150, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(embedding_dimension, activation='relu')
])

course_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_course_names, mask_token=None),
  tf.keras.layers.Embedding(len(unique_course_names) + 1, embedding_dimension),
])

task = tfrs.tasks.Retrieval(
  metrics=tfrs.metrics.FactorizedTopK(
    candidates=tfdsmap_courses.batch(16).map(course_model)
  )
)

In [79]:
class Model(tfrs.Model):
  def __init__(self, user_model, course_model):
    super().__init__()
    self.course_model: tf.keras.Model = course_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[tf.Tensor, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["taken_courses"])

    # And pick out the course features and pass them into the course model,
    # getting embeddings back.
    positive_course_embeddings = self.course_model(features["recommendation"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_course_embeddings)

In [80]:
model = Model(user_properties_model, course_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.15))

cached_train = train.shuffle(len(tfdsmap_tokenized_data)).batch(400).cache()
cached_test = test.batch(40).cache()

user_properties_model.summary()
course_model.summary()


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 5, 32)             1568      
                                                                 
 flatten_6 (Flatten)         (None, 160)               0         
                                                                 
 dense_24 (Dense)            (None, 150)               24150     
                                                                 
 dropout_12 (Dropout)        (None, 150)               0         
                                                                 
 dense_25 (Dense)            (None, 150)               22650     
                                                                 
 dropout_13 (Dropout)        (None, 150)               0         
                                                                 
 dense_26 (Dense)            (None, 100)             

In [81]:
model.fit(cached_train, epochs=70)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x7ff48f086aa0>

In [82]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.15570934116840363,
 'factorized_top_k/top_5_categorical_accuracy': 0.5155709385871887,
 'factorized_top_k/top_10_categorical_accuracy': 0.6505190134048462,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 11.986360549926758,
 'regularization_loss': 0,
 'total_loss': 11.986360549926758}

# Making predicitons


In [83]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# recommends courses out of the entire courses dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((
    tfdsmap_courses.batch(10), 
    tfdsmap_courses.batch(10).map(model.course_model)
  ))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ff4cb77a950>

In [84]:
taken_courses = []
inputdata = tokenize(tokenizer_dict, [taken_courses], MAX_HISTORY)
print(inputdata)
_, course_names = index(inputdata)
print("Recommendation:")
for course_name in course_names[0, :5]:
  tf.print(course_name)

[[0 0 0 0 0]]
Recommendation:
Belajar Membuat Aplikasi Android untuk Pemula
Memulai Pemrograman Dengan Java
Belajar Fundamental Aplikasi Android
Belajar Membangun LINE Chatbot
Belajar Dasar Pemrograman Web


In [91]:

# Get recommendations.
taken_courses = [
  "Cloud Practitioner Essentials (Belajar Dasar AWS Cloud)",
  "Meniti Karier sebagai Software Developer",
  "Belajar Dasar Pemrograman Web",
  "Architecting on AWS (Membangun Arsitektur Cloud di AWS)",
  "Belajar Dasar Git dengan GitHub",
]
inputdata = tokenize(tokenizer_dict, [taken_courses], MAX_HISTORY)
print(inputdata)

_, course_names = index(inputdata)
print("Recommendation:")
for course_name in course_names[0, :5]:
  tf.print(course_name)

[[29 35  9 32 40]]
Recommendation:
Menjadi Android Developer Expert
Menjadi Flutter Developer Expert
Memulai Dasar Pemrograman untuk Menjadi Pengembang Software
Menjadi Front-End Web Developer Expert
Menjadi Azure Cloud Developer


In [18]:
tf.saved_model.save(index, "./saved_index")



INFO:tensorflow:Assets written to: ./saved_index/assets


INFO:tensorflow:Assets written to: ./saved_index/assets


In [86]:
# load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load("./saved_index")

# pass a user id in, get top predicted movie titles back.
scores, titles = loaded(inputdata)

_, course_names = loaded(inputdata)
print("Recommendation:")
for course_name in course_names[0, :5]:
  tf.print(course_name)

Recommendation:
Belajar Fundamental Aplikasi iOS
Memulai Pemrograman Dengan Swift
Belajar Fundamental Aplikasi Back-End
Memulai Pemrograman Dengan Dart
Belajar Dasar Visualisasi Data
