In [None]:
!wget https://tmpfiles.org/1545912/tourism.csv

In [1]:
!pip install shortuuid
!pip install schedule



In [2]:
import pandas as pd
import numpy as np
import json
import random
import shortuuid

def generate_dataset(file_name, n_users=5, clicks_per_user=5, seed=None):
    """Build a synthetic user-item click dataset from an item catalogue CSV.

    Every synthetic user ("user1", "user2", ...) is paired with every catalogue
    row; the pair is labelled 1.0 if the item was among that user's randomly
    drawn clicks and 0.0 otherwise.

    Args:
        file_name: Path to a CSV containing at least the columns
            'itemId', 'description', 'category' and 'city'.
        n_users: Number of synthetic users to generate.
        clicks_per_user: Number of distinct items each user clicks. Capped at
            the number of distinct item ids in the catalogue.
        seed: Optional seed for a private random generator. When None the
            module-level ``random`` state is used, so a prior ``random.seed``
            call still controls the result.

    Returns:
        DataFrame with columns
        ['userId', 'itemId', 'description', 'category', 'city', 'clicked'].
    """
    catalog = pd.read_csv(file_name)[['itemId', 'description', 'category', 'city']]

    rng = random if seed is None else random.Random(seed)

    # Draw clicks WITHOUT replacement from distinct ids. Sampling with
    # replacement can pick the same item twice for one user, which duplicates
    # that (user, item) row after the merge below.
    candidate_ids = catalog['itemId'].drop_duplicates().tolist()
    n_clicks = min(clicks_per_user, len(candidate_ids))

    user_ids = [f"user{i}" for i in range(1, n_users + 1)]
    click_rows = [
        (user_id, item_id, 1)
        for user_id in user_ids
        for item_id in rng.sample(candidate_ids, n_clicks)
    ]
    df_user_clicked = pd.DataFrame(click_rows, columns=['userId', 'itemId', 'clicked'])

    # All (user, item) pairs, users in outer order and items in catalogue order.
    pair_rows = [
        (user_id, item_id)
        for user_id in user_ids
        for item_id in catalog['itemId']
    ]
    df_all = pd.DataFrame(pair_rows, columns=['userId', 'itemId'])

    # Attach the item attributes to every pair.
    df_all = pd.merge(df_all, catalog, on='itemId', how='left')

    # Attach the click label; pairs without a click come back as NaN.
    df_final = pd.merge(df_all, df_user_clicked, how='left', on=['userId', 'itemId'])

    # Assign the result back instead of a chained inplace fillna, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    df_final['clicked'] = df_final['clicked'].fillna(0).astype(float)

    return df_final

# Seed the stdlib RNG that generate_dataset draws the synthetic clicks from,
# so a fresh Restart & Run All rebuilds the same dataset (the train/validation
# split further down is already seeded).
random.seed(42)
df = generate_dataset('../data/tourism.csv')

# Bare last expression: rendered by the notebook as a (truncated) table.
df

Unnamed: 0,userId,itemId,description,category,city,clicked
0,user1,7EzW8sTh9gaKg9UoBFacXX,Air terjun Gitgit adalah air terjun yang terle...,3,39,0.0
1,user1,V5xEUjngBkMc5tTdiVKeGh,Air terjun Tegenungan adalah air terjun yang t...,3,46,0.0
2,user1,G6nkGoXxWC95tkhcQn2i4Y,Alun-Alun Purworejo adalah sebuah alun-alun at...,5,140,0.0
3,user1,GLXR7hBpCKsZUk3FgEqDvQ,Bali Safari & Marine Park (BSMP) merupakan tem...,5,49,0.0
4,user1,SEpL8LSMzbH6kXZPsRykkW,Batu Secret Zoo merupakan tempat wisata dan ke...,3,22,0.0
...,...,...,...,...,...,...
4490,user5,daLxypzZZiYtPpH5Cdytwi,"Sejak diresmikan pada bulan Desember 2017, Atl...",5,159,0.0
4491,user5,EfZgg4m3Zn95W6icFjsaUy,Taman Hiburan Rakyat atau THR tentunya sudah t...,5,159,0.0
4492,user5,NhhwKnhv9ZY6YYCx9NUUXv,Air mancur menari atau dancing fountain juga a...,5,159,0.0
4493,user5,4oKKjyHRdfSxNyn9AWSk6Z,Taman Flora adalah salah satu taman kota di Su...,5,159,0.0


In [3]:
import pandas as pd
import random
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Split the data into a training set and a validation set
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Vocabulary cap for the tokenizer (most frequent words kept).
# Note: the same value is also reused below as the padded sequence length.
max_words = 500

# Tokenizer is fitted on the training descriptions only; unseen words map to '<OOV>'
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['description'])

# Convert the texts to sequences
description_sequences_train = tokenizer.texts_to_sequences(df_train['description'])
description_sequences_val = tokenizer.texts_to_sequences(df_val['description'])

# Pad the sequences so they are all the same length
description_padded_train = pad_sequences(description_sequences_train, maxlen=max_words)
description_padded_val = pad_sequences(description_sequences_val, maxlen=max_words)

# Label encoding for user_id and item_id.
# The encoders are fitted on the FULL frame, not just the training split:
# LabelEncoder.transform raises ValueError on labels it has not seen, so an id
# whose rows all landed in the validation split would otherwise crash here
# (and again at inference, where every catalogue item is encoded).
# Ids are identifiers, not features learned from labels, so this is not leakage.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_encoder.fit(df['userId'])
item_encoder.fit(df['itemId'])

encoded_user_ids_train = user_encoder.transform(df_train['userId'])
encoded_item_ids_train = item_encoder.transform(df_train['itemId'])

encoded_user_ids_val = user_encoder.transform(df_val['userId'])
encoded_item_ids_val = item_encoder.transform(df_val['itemId'])

labels_train = df_train['clicked']
labels_val = df_val['clicked']

# Build the model: three inputs (encoded user id, encoded item id, padded
# description token ids) are embedded, concatenated and fed to a small MLP
# that outputs a click probability.
user_input = layers.Input(shape=(1,), name='user')
item_input = layers.Input(shape=(1,), name='item')
description_input = layers.Input(shape=(max_words,), name='description')

# One 50-dimensional embedding table per input; table sizes come from the
# fitted encoders and from the tokenizer's vocabulary cap.
user_embedding = layers.Embedding(input_dim=len(user_encoder.classes_), output_dim=50)(user_input)
item_embedding = layers.Embedding(input_dim=len(item_encoder.classes_), output_dim=50)(item_input)
description_embedding = layers.Embedding(input_dim=max_words, output_dim=50)(description_input)

# Collapse each branch to a single vector per example: the id branches have
# one position to flatten; the description branch is averaged over positions.
user_embedding = layers.Flatten()(user_embedding)
item_embedding = layers.Flatten()(item_embedding)
description_embedding = layers.GlobalAveragePooling1D()(description_embedding)

concatenated = layers.Concatenate()([user_embedding, item_embedding, description_embedding])

# Two ReLU layers followed by a sigmoid unit (pairs with binary cross-entropy below).
dense1 = layers.Dense(128, activation='relu')(concatenated)
dense2 = layers.Dense(64, activation='relu')(dense1)
out = layers.Dense(1, activation='sigmoid')(dense2)

model = tf.keras.Model(inputs=[user_input, item_input, description_input], outputs=out)

# Compile the model
# NOTE(review): if clicked rows are rare in df, accuracy will look high even
# for a model that never predicts a click — consider also tracking AUC.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; inputs are passed in the same order as the model's `inputs` list
model.fit([encoded_user_ids_train, encoded_item_ids_train, description_padded_train], labels_train, epochs=10, validation_data=([encoded_user_ids_val, encoded_item_ids_val, description_padded_val], labels_val))

# Save the model, label encoders, and tokenizers for future use.
# These four files are the artifacts the Recommender cell loads back by path.
model.save('recommendation_model.h5')
np.save('user_encoder_classes.npy', user_encoder.classes_)
np.save('item_encoder_classes.npy', item_encoder.classes_)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [4]:
import schedule
import time
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

class Recommender(tf.keras.Model):
    """Ranks catalogue items for a user with a previously trained click model.

    The trained Keras model, the user/item label-encoder classes and the text
    tokenizer are loaded from disk on construction and can be refreshed at any
    time with :meth:`reload_model`.

    Args:
        model_path: Path of the saved Keras model.
        user_encoder_path: Path of the ``.npy`` file holding the user encoder classes.
        item_encoder_path: Path of the ``.npy`` file holding the item encoder classes.
        tokenizer_path: Path of the pickled tokenizer.
        df: DataFrame providing the candidate items; must contain the columns
            'itemId' and 'description'.
        max_words: Length the description sequences are padded/truncated to.

    NOTE(review): ``predict`` shadows ``tf.keras.Model.predict`` with a
    different signature; kept as-is because callers rely on it.
    """

    def __init__(self, model_path, user_encoder_path, item_encoder_path, tokenizer_path, df, max_words):
        super().__init__()
        self.model_path = model_path
        self.user_encoder_path = user_encoder_path
        self.item_encoder_path = item_encoder_path
        self.tokenizer_path = tokenizer_path
        self.df = df
        self.max_words = max_words
        self.reload_model()

    def reload_model(self):
        """(Re)load the model, both label encoders and the tokenizer from disk."""
        self.model = tf.keras.models.load_model(self.model_path)

        # SECURITY: allow_pickle=True and pickle.load below can execute
        # arbitrary code from the file — only point these at trusted artifacts.
        self.user_encoder = LabelEncoder()
        self.user_encoder.classes_ = np.load(self.user_encoder_path, allow_pickle=True)

        self.item_encoder = LabelEncoder()
        self.item_encoder.classes_ = np.load(self.item_encoder_path, allow_pickle=True)

        with open(self.tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)

    # NOTE(review): this traces `predict`, which runs Python-side logic
    # (numpy membership test, LabelEncoder, pandas, Keras predict) on the
    # tensor argument; that likely cannot be traced by tf.function and nothing
    # is returned — confirm before relying on it as a serving signature.
    @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.string)])
    def serving_default(self, new_user_id):
        self.predict(new_user_id)

    def predict(self, new_user_id, top_k=10):
        """Print the ``top_k`` items with the highest predicted click probability.

        Args:
            new_user_id: User id to score for. If it is not among the user
                encoder's classes, a random known user is substituted.
            top_k: Number of recommendations to print (default 10).

        Returns:
            None. Results are printed.
        """
        if new_user_id not in self.user_encoder.classes_:
            print("New user detected. Assigning random existing user for prediction.")
            new_user_id = np.random.choice(self.user_encoder.classes_)

        # One row per distinct item (first occurrence, in order of appearance).
        # A single pass replaces re-filtering the whole frame once per item.
        items = self.df.drop_duplicates(subset='itemId')[['itemId', 'description']]

        # LabelEncoder.transform raises on unseen labels, so items the encoder
        # does not know cannot be scored; skip them instead of crashing.
        known = items['itemId'].isin(self.item_encoder.classes_)
        n_unknown = int((~known).sum())
        if n_unknown:
            print(f"Skipping {n_unknown} item(s) unknown to the item encoder.")
        items = items[known]

        all_item_ids = items['itemId'].tolist()
        all_descriptions = items['description'].tolist()

        if not all_item_ids:
            print("No items available for prediction.")
            return

        encoded_new_user_id = self.user_encoder.transform([new_user_id] * len(all_item_ids))
        encoded_all_item_ids = self.item_encoder.transform(all_item_ids)

        description_sequences = self.tokenizer.texts_to_sequences(all_descriptions)
        description_padded = pad_sequences(description_sequences, maxlen=self.max_words)

        predictions = self.model.predict([encoded_new_user_id, encoded_all_item_ids, description_padded])

        # Indices sorted by descending score, truncated to top_k.
        top_indices = np.argsort(predictions[:, 0])[::-1][:top_k]

        print(f"Top {top_k} recommendations for", new_user_id, "are:")
        for index in top_indices:
            print(f'Item: {all_item_ids[index]}, predicted click probability: {predictions[index][0]}')

# Build the recommender from the artifacts written by the training cell.
rec = Recommender(
    model_path='recommendation_model.h5',
    user_encoder_path='user_encoder_classes.npy',
    item_encoder_path='item_encoder_classes.npy',
    tokenizer_path='tokenizer.pickle',
    df=df,
    max_words=500,
)


def job():
    """Scheduled task: reload the recommender's model, encoders and tokenizer."""
    rec.reload_model()


# Register the reload to run every day at 12am.
schedule.every().day.at("00:00").do(job)

# The registration above only fires while the scheduler is being polled.
# The polling loop is left disabled so this cell returns; enable it in a
# long-running process to activate the daily reload:
#while True:
#    schedule.run_pending()
#    time.sleep(1)

# Smoke test with a user id the encoder has not seen.
rec.predict("budiman")

New user detected. Assigning random existing user for prediction.
Top 10 recommendations for user4 are:
Item: 7EBnBHULL5hzJHJtwhT3u8, predicted click probability: 0.39547663927078247
Item: 7XJoM9gxMCkQgtoorP96Lu, predicted click probability: 0.39051663875579834
Item: c2psmmoKXutCkAqQy2n7yH, predicted click probability: 0.3859640657901764
Item: RJPfATFovFZ2TMtc3mgzRS, predicted click probability: 0.38472145795822144
Item: CjDtzMwR5cpMQbPZvvMLSn, predicted click probability: 0.3813520073890686
Item: gxpzMkYDfFhFVzTg2EmdxS, predicted click probability: 0.37987014651298523
Item: 5ps3MEyuw8fPo7w4tXuGHD, predicted click probability: 0.3602381944656372
Item: gLRuRSc7fZJh2fLpdwN3Yu, predicted click probability: 0.3513978123664856
Item: aFYP6hxe6P42obyRVkcinR, predicted click probability: 0.3478638231754303
Item: NEMGgEeMEkJFtyCdp5hdJX, predicted click probability: 0.346917986869812
