In [36]:
import os
import pprint
import tempfile
import itertools

from typing import Dict, Text

import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load both dataset and convert to tfds object

In [44]:
courses = pd.read_csv("courses.csv")
unique_course_names = courses["name"].tolist()

In [45]:
# Load and clean data
useract = pd.read_csv("user_activities.csv")
useract = useract.drop(columns=['id'])
useract = useract.dropna(how='all')

def mergeall(rowdata):
    aggregated_datas = ",".join([str(s) for s in list(rowdata.values) if not pd.isna(s)])
    return aggregated_datas.split(",")

def mergecourse(rowdata):
    aggregated_datas = ",".join([str(s) for s in [rowdata.graduated_courses, rowdata.on_progress_courses] if not pd.isna(s)])
    return aggregated_datas.split(",")

merged_datas = list(useract.apply(mergeall, axis = 1))
merged_courses = list(useract.apply(mergecourse, axis = 1))

agumented_datas = {
  'x': [],
  'y': []
}

for i in range(len(merged_datas)):
  merged_data = merged_datas[i]
  merged_course = merged_courses[i]
    
  for course in merged_course:
    #Simpen dulu buat nanti kalo mau ditambahin ingfo non-courses
    #agumented_datas['x'].append([data for data in merged_data if data != course])
    agumented_datas['x'].append([data for data in merged_course if data != course])
    agumented_datas['y'].append(course)

agumented_datas

{'x': [['Bangkit General Assessment',
   'Belajar Dasar Pemrograman Web',
   'Belajar Fundamental Aplikasi Android'],
  ['Bangkit Android Assessment',
   'Belajar Dasar Pemrograman Web',
   'Belajar Fundamental Aplikasi Android'],
  ['Bangkit Android Assessment',
   'Bangkit General Assessment',
   'Belajar Fundamental Aplikasi Android'],
  ['Bangkit Android Assessment',
   'Bangkit General Assessment',
   'Belajar Dasar Pemrograman Web'],
  [],
  ['Belajar Membuat Aplikasi Android untuk Pemula',
   'Belajar Membuat Game untuk Pemula',
   'Membangun Progressive Web Apps',
   'Belajar Dasar Pemrograman Web',
   'Belajar Fundamental Front-End Web Development',
   'Menjadi Front-End Web Developer Expert',
   'Cloud Practitioner Essentials (Belajar Dasar AWS Cloud)',
   'Belajar Dasar Pemrograman JavaScript',
   'Belajar Membuat Aplikasi Back-End untuk Pemula',
   'Belajar Membuat Augmented Reality dengan Lens Studio'],
  ['Menjadi Game Developer Expert',
   'Belajar Membuat Game untuk Pem

In [46]:
# Tokenize manual karena usecasenya agak aneh

def get_tokenizer_dict(train_list):
    return dict(zip(train_list, [i+1 for i in range(len(train_list))]))

def tokenize(tokenizer_dict, corpus):
    output = []
    for line in corpus:
        tokenizedline = [tokenizer_dict.get(entry, 0) for entry in line]
        output.append(tokenizedline)
    
    return output

def pad_tokens(input_sequences, maxlen):
    return np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))

tokenizer_dict = get_tokenizer_dict(coursesname)
agumented_datas['x'] = pad_tokens(tokenize(tokenizer_dict, agumented_datas["x"]), 5)
agumented_datas['x']

array([[ 0,  0,  0,  9,  1],
       [ 0,  0,  0,  9,  1],
       [ 0,  0,  0,  0,  1],
       ...,
       [ 0, 30, 31,  0,  7],
       [ 0, 30, 31,  0, 29],
       [ 0,  0,  0,  0,  0]], dtype=int32)

In [47]:
print(len(agumented_datas['x']))
print(len(agumented_datas['y']))

1719
1719


In [48]:
ds = tf.data.Dataset.from_tensor_slices(agumented_datas)
ds

<TensorSliceDataset element_spec={'x': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'y': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [49]:


tf.random.set_seed(69420)
shuffled = ds.shuffle(len(ds), seed=69420, reshuffle_each_iteration=False)

train_len = math.floor(len(ds) * 0.9)
test_len = len(ds) - train_len

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

print(train, len(train))
print(test, len(test))

<TakeDataset element_spec={'x': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'y': TensorSpec(shape=(), dtype=tf.string, name=None)}> 1547
<TakeDataset element_spec={'x': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'y': TensorSpec(shape=(), dtype=tf.string, name=None)}> 172


In [51]:
embedding_dimension = 32

#https://www.tensorflow.org/recommenders/examples/multitask/

user_properties_model = tf.keras.Sequential([
  tf.keras.layers.Embedding(len(unique_course_names)+1, 32, input_length=5),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(embedding_dimension, activation='relu')
])

course_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_course_names, mask_token=None),
  tf.keras.layers.Embedding(len(unique_course_names) + 1, embedding_dimension),
  # tf.keras.layers.GRU(embedding_dimension)
])

task = tfrs.tasks.Retrieval(
  metrics=tfrs.metrics.FactorizedTopK(
    candidates=courses.batch(16).map(course_model)
  )
)

AttributeError: 'DataFrame' object has no attribute 'batch'

In [24]:
class Model(tfrs.Model):
  def __init__(self, user_model, course_model):
    super().__init__()
    self.course_model: tf.keras.Model = course_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])

    # And pick out the course features and pass them into the course model,
    # getting embeddings back.
    positive_course_embeddings = self.course_model(features["course_name"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_course_embeddings)

In [29]:
model = Model(user_model, course_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(len(users)).batch(400).cache()
cached_test = test.batch(40).cache()

model.fit(cached_train, epochs=3)

In [38]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.04428044334053993,
 'factorized_top_k/top_5_categorical_accuracy': 0.31734317541122437,
 'factorized_top_k/top_10_categorical_accuracy': 0.6236162185668945,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 110.62801361083984,
 'regularization_loss': 0,
 'total_loss': 110.62801361083984}

# Making predicitons


In [46]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# recommends courses out of the entire courses dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((courses.batch(10), courses.batch(10).map(model.course_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fd436047040>

In [62]:
inputdata = np.array([str(378101)])
print(model.user_model.predict(inputdata))
print(model.course_model.predict(np.array(["Belajar Fundamental Aplikasi Android"])))


[[-0.29238877  0.1896909   0.24672015 -0.18970405  0.31245732  0.48803166
   0.19648258 -0.63713247  0.32725298 -0.12039973  0.18609333  0.13593087
   0.57613707  0.01277212 -0.07386976  0.0501121  -0.10746235 -0.1267919
  -0.0871425   0.02395872  0.53917813  0.0156215   0.16900739 -0.21539979
  -0.38051724 -0.3160193   0.57798487 -0.05177205  0.09948194 -0.01651384
  -0.08244672 -0.44000137]]
[[-1.317824    0.1821487   0.11608754  0.8123598   1.0295552   0.75508887
  -0.29398802 -0.46622536  0.34544238  0.4443783   0.11546618 -0.4720895
   0.7208403  -0.6527897   0.5723107  -0.35173348 -0.2510147  -0.6380948
  -0.09940079  0.7515651   0.5717442   0.5541475  -0.5287965  -0.866028
  -0.815331    0.06582449  0.555884    0.90853626 -0.102878    0.5781376
  -0.02559072 -1.4059564 ]]


In [None]:
user_id = 378101
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

In [None]:
user_id = 378104

_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 378104: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [None]:
user_id = 378107
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 378107: [b'Belajar Membuat Aplikasi Back-End untuk Pemula'
 b'Belajar Dasar Pemrograman JavaScript'
 b'Belajar Dasar Git dengan GitHub']


In [None]:
user_id = 378110
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 378110: [b'Memulai Pemrograman Dengan Java'
 b'Belajar Fundamental Aplikasi Android'
 b'Menjadi Android Developer Expert']


In [None]:
user_id = 378113
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 378113: [b'Belajar Membuat Aplikasi Android untuk Pemula'
 b'Belajar Fundamental Aplikasi Android' b'Belajar Dasar UX Design']


In [None]:
unique_user_ids[:10]

array([b'378101', b'378104', b'378107', b'378110', b'378113', b'378116',
       b'378119', b'378122', b'378125', b'378128'], dtype=object)

In [None]:
unique_user_ids[600:]

array([b'380981', b'380990', b'380993', b'381005', b'381011', b'381014',
       b'381023', b'381026', b'381044', b'381047', b'381056', b'381059',
       b'381062', b'381068', b'381071', b'381077', b'381080', b'381083',
       b'381086', b'381089', b'381092', b'381095', b'381098', b'381101',
       b'381104', b'381107', b'381110', b'381113', b'381116', b'381125',
       b'381128', b'381137', b'381140', b'381143', b'381146'],
      dtype=object)

In [None]:
user_id = 381146
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 381146: [b'Menjadi Android Developer Expert'
 b'Belajar Fundamental Aplikasi Flutter'
 b'Menjadi Front-End Web Developer Expert']


In [None]:
user_id = 381147
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 381147: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381150
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 381150: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381152
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 381152: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']


In [None]:
user_id = 381155
# Get recommendations.
_, course_name = index(np.array([str(user_id)]))
print(f"Recommendations for user {user_id}: {course_name[0, :3]}".format(user_id))

Recommendations for user 381155: [b'Menjadi Android Developer Expert' b'Belajar Membangun LINE Chatbot'
 b'Belajar Pengembangan Machine Learning']
