In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Prepare The Data

In [2]:
# Hotel catalogue: one row per hotel (hotel_id, hotel_name, 0/1 feature flags).
df_hotel = pd.read_csv("../data/listhotel.csv")
# Dummy user activity log: one row per (user, hotel) interaction.
# Other dummy logs tried during development: user_history_dummy_5000.csv,
# user_history_dummy_5000_2.csv and user-history-dummy.csv.
df_user = pd.read_csv("../data/user_history_dummy_3000_1500.csv")

In [3]:
# Check the hotel catalogue schema: two id/name columns plus the feature flags.
df_hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   hotel_id      23 non-null     object
 1   hotel_name    23 non-null     object
 2   cheap         23 non-null     int64 
 3   luxurious     23 non-null     int64 
 4   clean         23 non-null     int64 
 5   cozy          23 non-null     int64 
 6   good service  23 non-null     int64 
 7   nice view     23 non-null     int64 
 8   parking       23 non-null     int64 
 9   pool          23 non-null     int64 
 10  spa           23 non-null     int64 
 11  gym           23 non-null     int64 
 12  wifi          23 non-null     int64 
 13  strategic     23 non-null     int64 
 14  delicious     23 non-null     int64 
 15  breakfast     23 non-null     int64 
 16  safety        23 non-null     int64 
 17  family        23 non-null     int64 
 18  pet           23 non-null     int64 
 19  aesthetic 

In [4]:
# Check the raw activity log schema (ids plus is_bookmark / is_watching flags).
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   activity_id  3000 non-null   object
 1   user_id      3000 non-null   object
 2   hotel_id     3000 non-null   object
 3   is_bookmark  3000 non-null   int64 
 4   is_watching  3000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 117.3+ KB


In [5]:
# List the hotel columns; everything after hotel_name is a feature flag.
df_hotel.columns

Index(['hotel_id', 'hotel_name', 'cheap', 'luxurious', 'clean', 'cozy',
       'good service', 'nice view', 'parking', 'pool', 'spa', 'gym', 'wifi',
       'strategic', 'delicious', 'breakfast', 'safety', 'family', 'pet',
       'aesthetic', 'disability', 'laundry'],
      dtype='object')

In [6]:
def generate_training_data(df_user, df_hotel):
  """Build the (user, hotel, score) interaction table used as training labels.

  Args:
    df_user: activity log with at least `user_id`, `hotel_id` and `is_bookmark`.
    df_hotel: hotel catalogue; only its `hotel_id` column is used.

  Returns:
    DataFrame with columns `user_id`, `hotel_id`, `score`, one row per activity
    whose hotel exists in `df_hotel`. `score` is the `is_bookmark` flag.
  """
  # Only the hotel ids are needed here: the inner join drops activities that
  # point at hotels missing from the catalogue. Joining on the id column alone
  # avoids carrying the feature columns around and works whether or not
  # `hotel_name` is still present in `df_hotel`.
  df_training = pd.merge(
      df_user[['user_id', 'hotel_id', 'is_bookmark']],
      df_hotel[['hotel_id']],
      on='hotel_id',
      how='inner',
  )

  # The score is currently just the bookmark flag; `is_watching` is not used.
  return df_training.rename(columns={'is_bookmark': 'score'})

In [7]:
# Build the training labels from the raw activity log. This has to run before
# `df_user` is replaced by the aggregated per-user profiles in cell In [9].
df_training = generate_training_data(df_user, df_hotel)
df_training.head()

Unnamed: 0,user_id,hotel_id,score
0,u1197,h1013,1
1,u1386,h1019,0
2,u0941,h1002,1
3,u0089,h1016,0
4,u0389,h1021,1


In [8]:
# Names of the hotel feature columns, in the same order as they appear in
# df_hotel (after hotel_id / hotel_name).
labels = ['cheap', 'luxurious', 'clean', 'cozy',
       'good service', 'nice view', 'parking', 'pool', 'spa', 'gym', 'wifi',
       'strategic', 'delicious', 'breakfast', 'safety', 'family', 'pet',
       'aesthetic', 'disability', 'laundry']

def generate_user_score(df_user, df_hotel, user_id, labels):
  """Compute the preference profile of a single user.

  The profile is the mean of the feature values of every hotel the user
  interacted with, regardless of the interaction type.

  Args:
    df_user: activity log with at least `user_id` and `hotel_id`.
    df_hotel: hotel catalogue with `hotel_id` and the columns named in `labels`.
    user_id: id of the user to profile.
    labels: feature column names to average; they define the output columns
      and their order.

  Returns:
    DataFrame indexed by `user_id` with one column per label. It has a single
    row, or no rows when the user has no activity on a known hotel.
  """
  feature_cols = list(labels)

  # Keep only the join key and the grouping key of this user's activities.
  df_user_history = df_user.loc[df_user['user_id'] == user_id, ['user_id', 'hotel_id']]

  # Select the feature columns explicitly through `labels` so that extra
  # catalogue columns (e.g. `hotel_name`) never leak into the profile and the
  # function works whether or not they are present.
  df_user_history = pd.merge(
      df_user_history,
      df_hotel[['hotel_id'] + feature_cols],
      on='hotel_id',
      how='inner',
  )

  return df_user_history.groupby('user_id')[feature_cols].mean()

def generate_user_score_for_all_user(df_user, df_hotel, labels):
  """Compute the preference profile of every user in the activity log.

  Each profile is the mean of the feature values of the hotels the user
  interacted with, regardless of the interaction type.

  Args:
    df_user: activity log with at least `user_id` and `hotel_id`.
    df_hotel: hotel catalogue with `hotel_id` and the columns named in `labels`.
    labels: feature column names to average; they define the output columns
      and their order.

  Returns:
    DataFrame indexed by `user_id` (sorted) with one column per label. Users
    whose activities only reference unknown hotels do not appear.
  """
  feature_cols = list(labels)

  # Select the feature columns explicitly through `labels` so that extra
  # catalogue columns (e.g. `hotel_name`) never leak into the profiles and the
  # function works whether or not they are present.
  df_user_history = pd.merge(
      df_user[['user_id', 'hotel_id']],
      df_hotel[['hotel_id'] + feature_cols],
      on='hotel_id',
      how='inner',
  )

  return df_user_history.groupby('user_id')[feature_cols].mean()


In [9]:
# Collapse the raw activity log into one preference profile per user.
# The guard keeps this cell re-runnable: after the first run `df_user` holds the
# profiles and no longer has a `hotel_id` column, so aggregating it a second
# time would fail inside the merge.
if 'hotel_id' in df_user.columns:
  df_user = generate_user_score_for_all_user(df_user, df_hotel, labels)
df_user.head(10)

Unnamed: 0_level_0,cheap,luxurious,clean,cozy,good service,nice view,parking,pool,spa,gym,wifi,strategic,delicious,breakfast,safety,family,pet,aesthetic,disability,laundry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
u0001,0.333333,0.666667,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.333333,0.333333,0.0,1.0,1.0
u0002,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.5,0.5,0.5,1.0
u0004,1.0,0.0,0.5,1.0,1.0,0.0,1.0,1.0,0.5,0.5,1.0,0.5,0.5,1.0,0.5,0.5,0.0,0.0,0.5,1.0
u0005,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,0.5,1.0
u0007,0.333333,0.666667,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,0.666667,1.0,1.0,1.0,1.0,0.333333,0.0,0.666667,0.0,1.0,1.0
u0008,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
u0009,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5,1.0,0.0,0.5,1.0,1.0
u0010,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
u0013,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,1.0,1.0,0.666667,1.0,0.333333,1.0,0.0,0.333333,1.0,0.666667
u0015,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [10]:
# Check the aggregated profiles: one row per user, one float column per feature.
df_user.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1280 entries, u0001 to u1500
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cheap         1280 non-null   float64
 1   luxurious     1280 non-null   float64
 2   clean         1280 non-null   float64
 3   cozy          1280 non-null   float64
 4   good service  1280 non-null   float64
 5   nice view     1280 non-null   float64
 6   parking       1280 non-null   float64
 7   pool          1280 non-null   float64
 8   spa           1280 non-null   float64
 9   gym           1280 non-null   float64
 10  wifi          1280 non-null   float64
 11  strategic     1280 non-null   float64
 12  delicious     1280 non-null   float64
 13  breakfast     1280 non-null   float64
 14  safety        1280 non-null   float64
 15  family        1280 non-null   float64
 16  pet           1280 non-null   float64
 17  aesthetic     1280 non-null   float64
 18  disability    1280 non-null 

In [11]:
# Re-check the hotel catalogue before modeling (same schema as in cell In [3]).
df_hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   hotel_id      23 non-null     object
 1   hotel_name    23 non-null     object
 2   cheap         23 non-null     int64 
 3   luxurious     23 non-null     int64 
 4   clean         23 non-null     int64 
 5   cozy          23 non-null     int64 
 6   good service  23 non-null     int64 
 7   nice view     23 non-null     int64 
 8   parking       23 non-null     int64 
 9   pool          23 non-null     int64 
 10  spa           23 non-null     int64 
 11  gym           23 non-null     int64 
 12  wifi          23 non-null     int64 
 13  strategic     23 non-null     int64 
 14  delicious     23 non-null     int64 
 15  breakfast     23 non-null     int64 
 16  safety        23 non-null     int64 
 17  family        23 non-null     int64 
 18  pet           23 non-null     int64 
 19  aesthetic 

## Modeling

In [12]:
# Keep only hotel_id plus the feature flags for modeling. `errors='ignore'`
# makes the cell safe to re-run once hotel_name has already been dropped.
df_hotel = df_hotel.drop(columns=['hotel_name'], errors='ignore')
df_hotel.head()

Unnamed: 0,hotel_id,cheap,luxurious,clean,cozy,good service,nice view,parking,pool,spa,...,wifi,strategic,delicious,breakfast,safety,family,pet,aesthetic,disability,laundry
0,h1001,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,0
1,h1002,0,1,1,1,1,1,1,1,0,...,1,1,1,1,0,1,0,1,1,1
2,h1003,1,0,1,1,1,0,1,1,1,...,1,1,1,1,1,1,0,1,1,1
3,h1004,0,1,1,1,1,1,1,1,1,...,1,1,0,1,0,1,0,0,1,1
4,h1005,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,1,1,0


In [13]:
# Turn the `user_id` index produced by the groupby back into a regular column so
# the profiles can be merged onto df_training. The guard prevents a re-run from
# adding a stray `index` column.
if 'user_id' not in df_user.columns:
  df_user = df_user.reset_index()
df_user.head()

Unnamed: 0,user_id,cheap,luxurious,clean,cozy,good service,nice view,parking,pool,spa,...,wifi,strategic,delicious,breakfast,safety,family,pet,aesthetic,disability,laundry
0,u0001,0.333333,0.666667,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,...,1.0,1.0,0.666667,1.0,0.666667,0.333333,0.333333,0.0,1.0,1.0
1,u0002,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.5,0.5,0.5,1.0
2,u0004,1.0,0.0,0.5,1.0,1.0,0.0,1.0,1.0,0.5,...,1.0,0.5,0.5,1.0,0.5,0.5,0.0,0.0,0.5,1.0
3,u0005,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,0.5,1.0
4,u0007,0.333333,0.666667,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,...,1.0,1.0,1.0,1.0,0.333333,0.0,0.666667,0.0,1.0,1.0


In [14]:
# Confirm user_id is a regular column again, followed by the feature columns.
df_user.columns

Index(['user_id', 'cheap', 'luxurious', 'clean', 'cozy', 'good service',
       'nice view', 'parking', 'pool', 'spa', 'gym', 'wifi', 'strategic',
       'delicious', 'breakfast', 'safety', 'family', 'pet', 'aesthetic',
       'disability', 'laundry'],
      dtype='object')

In [15]:
# Preview the (user_id, hotel_id, score) training table.
df_training

Unnamed: 0,user_id,hotel_id,score
0,u1197,h1013,1
1,u1386,h1019,0
2,u0941,h1002,1
3,u0089,h1016,0
4,u0389,h1021,1
...,...,...,...
2995,u1318,h1010,0
2996,u0164,h1013,0
2997,u1036,h1002,1
2998,u1295,h1015,0


In [16]:
# Hotel-side model input: for each training row, look up the feature flags of
# its hotel and keep only those feature columns.
df_hotel_train = (
    df_training
    .merge(df_hotel, on='hotel_id', how='left')
    .drop(columns=['user_id', 'hotel_id', 'score'])
)
df_hotel_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   cheap         3000 non-null   int64
 1   luxurious     3000 non-null   int64
 2   clean         3000 non-null   int64
 3   cozy          3000 non-null   int64
 4   good service  3000 non-null   int64
 5   nice view     3000 non-null   int64
 6   parking       3000 non-null   int64
 7   pool          3000 non-null   int64
 8   spa           3000 non-null   int64
 9   gym           3000 non-null   int64
 10  wifi          3000 non-null   int64
 11  strategic     3000 non-null   int64
 12  delicious     3000 non-null   int64
 13  breakfast     3000 non-null   int64
 14  safety        3000 non-null   int64
 15  family        3000 non-null   int64
 16  pet           3000 non-null   int64
 17  aesthetic     3000 non-null   int64
 18  disability    3000 non-null   int64
 19  laundry       3000 non-null

In [17]:
# User-side model input: for each training row, look up the preference profile
# of its user and keep only those feature columns.
df_user_train = (
    df_training
    .merge(df_user, on='user_id', how='left')
    .drop(columns=['user_id', 'hotel_id', 'score'])
)
df_user_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cheap         3000 non-null   float64
 1   luxurious     3000 non-null   float64
 2   clean         3000 non-null   float64
 3   cozy          3000 non-null   float64
 4   good service  3000 non-null   float64
 5   nice view     3000 non-null   float64
 6   parking       3000 non-null   float64
 7   pool          3000 non-null   float64
 8   spa           3000 non-null   float64
 9   gym           3000 non-null   float64
 10  wifi          3000 non-null   float64
 11  strategic     3000 non-null   float64
 12  delicious     3000 non-null   float64
 13  breakfast     3000 non-null   float64
 14  safety        3000 non-null   float64
 15  family        3000 non-null   float64
 16  pet           3000 non-null   float64
 17  aesthetic     3000 non-null   float64
 18  disability    3000 non-null 

In [18]:
# Training target: the 0/1 score of each row in df_training, as a numpy array.
y_train = df_training['score'].to_numpy()
y_train

array([1, 0, 1, ..., 1, 0, 1], dtype=int64)

In [19]:
# # (disabled, kept for reference) scale training data -- the hotel flags and user profiles are already in [0, 1]
# item_train_unscaled = hotel_train
# user_train_unscaled = user_train
# y_train_unscaled    = y_train

# scalerItem = StandardScaler()
# scalerItem.fit(item_train)
# item_train = scalerItem.transform(item_train)

# scalerUser = StandardScaler()
# scalerUser.fit(user_train)
# user_train = scalerUser.transform(user_train)

# scalerTarget = MinMaxScaler((-1, 1))
# scalerTarget.fit(y_train.reshape(-1, 1))
# y_train = scalerTarget.transform(y_train.reshape(-1, 1))
# #ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

# # print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
# # print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dot

# Fix TensorFlow's global seed so the weight initialisation is repeatable when
# the notebook is re-run from a fresh kernel.
tf.random.set_seed(42)

# Input widths come from the prepared training matrices rather than a
# hard-coded 20, so the model follows the feature list if it changes.
n_hotel_features = df_hotel_train.shape[1]
n_user_features = df_user_train.shape[1]

# Hotel tower: feature flags -> 256-d embedding
hotel_input = Input(shape=(n_hotel_features,), name="hotel_features")
hotel_embedding = Dense(512, activation="relu", name="hotel_embedding1")(hotel_input)
hotel_embedding = Dense(256, activation="relu", name="hotel_embedding")(hotel_embedding)

# User tower: preference profile -> 256-d embedding
user_input = Input(shape=(n_user_features,), name="user_features")
user_embedding = Dense(512, activation="relu", name="user_embedding1")(user_input)
user_embedding = Dense(256, activation="relu", name="user_embedding")(user_embedding)

# Similarity of the two embeddings (normalize=True L2-normalises both sides
# before the dot product)
similarity = Dot(axes=1, normalize=True, name="similarity")([hotel_embedding, user_embedding])

# Output layer: maps the scalar similarity to a bookmark probability
output = Dense(1, activation="sigmoid", name="output")(similarity)

# Build model
model = Model(inputs=[hotel_input, user_input], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 hotel_features (InputLayer)    [(None, 20)]         0           []                               
                                                                                                  
 user_features (InputLayer)     [(None, 20)]         0           []                               
                                                                                                  
 hotel_embedding1 (Dense)       (None, 512)          10752       ['hotel_features[0][0]']         
                                                                                                  
 user_embedding1 (Dense)        (None, 512)          10752       ['user_features[0][0]']          
                                                                                              

In [21]:
# Convert both model inputs to plain numpy arrays. np.asarray accepts a
# DataFrame as well as an ndarray, so re-running this cell is harmless
# (calling .to_numpy() on an already-converted array would raise).
df_hotel_train = np.asarray(df_hotel_train)
df_user_train = np.asarray(df_user_train)

In [22]:
# Sanity check: both inputs and the target must have the same number of rows.
print(df_hotel_train.shape)  # (n_samples, n_hotel_features), here (3000, 20)
print(df_user_train.shape)   # (n_samples, n_user_features), here (3000, 20)
print(y_train.shape)      # (n_samples,), here (3000,)

(3000, 20)
(3000, 20)
(3000,)


In [23]:
# Train the two-tower model; the inputs are passed in the same order as the
# model's `inputs` list (hotel features first, then user features).
history = model.fit(
    x=[df_hotel_train, df_user_train],
    y=y_train,
    epochs=25,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## Test model

In [24]:
# Hand-written activity log used to smoke-test the model.
# Every row has its own activity_id (the original reused "a0005" seven times).
data = [
    {"activity_id": "a0001", "user_id": "u0001", "hotel_id": "h1001", "is_bookmark": 1, "is_watching": 0},
    {"activity_id": "a0002", "user_id": "u0002", "hotel_id": "h1002", "is_bookmark": 0, "is_watching": 1},
    {"activity_id": "a0003", "user_id": "u0003", "hotel_id": "h1003", "is_bookmark": 0, "is_watching": 1},
    {"activity_id": "a0004", "user_id": "u0004", "hotel_id": "h1004", "is_bookmark": 1, "is_watching": 0},
    {"activity_id": "a0005", "user_id": "u0005", "hotel_id": "h1010", "is_bookmark": 0, "is_watching": 1},
    {"activity_id": "a0006", "user_id": "u0005", "hotel_id": "h1011", "is_bookmark": 0, "is_watching": 1},
    {"activity_id": "a0007", "user_id": "u0005", "hotel_id": "h1002", "is_bookmark": 0, "is_watching": 1},
    {"activity_id": "a0008", "user_id": "u0005", "hotel_id": "h1005", "is_bookmark": 1, "is_watching": 0},
    {"activity_id": "a0009", "user_id": "u0005", "hotel_id": "h1003", "is_bookmark": 1, "is_watching": 0},
    {"activity_id": "a0010", "user_id": "u0005", "hotel_id": "h1007", "is_bookmark": 1, "is_watching": 0},
    {"activity_id": "a0011", "user_id": "u0005", "hotel_id": "h1006", "is_bookmark": 1, "is_watching": 0},
]

# Build the DataFrame
df_manual = pd.DataFrame(data)

# Reload the full catalogue: `df_hotel` had its hotel_name column dropped in the
# Modeling section.
df_hotel1 = pd.read_csv("../data/listhotel.csv")

# Preference profile of user u0005, averaged over the seven hotels above.
user5 = generate_user_score(df_manual, df_hotel1, "u0005", labels)
user5

Unnamed: 0_level_0,cheap,luxurious,clean,cozy,good service,nice view,parking,pool,spa,gym,wifi,strategic,delicious,breakfast,safety,family,pet,aesthetic,disability,laundry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
u0005,0.285714,0.714286,1.0,1.0,1.0,0.714286,1.0,1.0,0.857143,1.0,1.0,0.857143,1.0,0.857143,0.428571,0.857143,0.142857,0.571429,0.857143,0.857143


In [25]:
# Score every hotel in the catalogue against one user profile and print the
# best matches.

# An empty profile (user with no activity on known hotels) would reshape to a
# zero-width vector and only fail later inside model.predict; fail early instead.
if user5.empty:
    raise ValueError("user profile is empty: no activity on known hotels")

user_vector = user5.to_numpy().reshape(1, -1)
hotel_ids = df_hotel['hotel_id'].to_numpy()
hotel_vectors = df_hotel.drop(['hotel_id'],axis = 1).to_numpy()

# Repeat the single user profile once per hotel so both model inputs have the
# same number of rows. Named `user_vectors` so it does not overwrite the Keras
# `user_input` tensor defined with the model.
user_vectors = np.tile(user_vector, (hotel_vectors.shape[0], 1))

# Combine inputs for prediction (hotel features first, then user features)
prediction_input = [hotel_vectors, user_vectors]

# Predict scores
predicted_scores = model.predict(prediction_input).flatten()

# Rank hotels by predicted scores
recommendations = sorted(zip(hotel_ids, predicted_scores), key=lambda x: x[1], reverse=True)

# Display the top N recommendations
top_n = 10
print(f"Top {top_n} recommendations for user:")
for hotel_id, score in recommendations[:top_n]:
    print(f"Hotel ID: {hotel_id}, Score: {score:.2f}")

Top 10 recommendations for user:
Hotel ID: h1018, Score: 0.34
Hotel ID: h1010, Score: 0.33
Hotel ID: h1002, Score: 0.32
Hotel ID: h1020, Score: 0.32
Hotel ID: h1016, Score: 0.32
Hotel ID: h1001, Score: 0.32
Hotel ID: h1021, Score: 0.32
Hotel ID: h1008, Score: 0.32
Hotel ID: h1006, Score: 0.32
Hotel ID: h1005, Score: 0.32


## Save Model

In [None]:
# Change the model file name below before saving so an existing model is not overwritten.
# model.save("../model/model2.h5")