In [None]:
!pip install keras_tuner



In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import keras_tuner as kt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# 1. Load Dataset
df = pd.read_csv("/content/cleaned_data_hotel.csv")

# Normalise the descriptions to lower case; these strings are what later gets
# tokenised and embedded.
df['Description'] = df['Description'].str.lower()

print("Dataset info:")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         90 non-null     object
 1   Description  90 non-null     object
 2   Lokasi       88 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB
None


In [None]:
# 2. Load the pretrained IndoBERT checkpoint (TensorFlow weights) together
#    with its matching tokenizer. Both are fetched by the same checkpoint name
#    so the vocabulary and the encoder stay in sync.
model_name = "indobenchmark/indobert-base-p2"
tokenizer, bert_model = (
    AutoTokenizer.from_pretrained(model_name),
    TFAutoModel.from_pretrained(model_name),
)

Some layers from the model checkpoint at indobenchmark/indobert-base-p2 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# 3. Generate embeddings from text - returns numpy array (n_texts, hidden_size)
def get_bert_embeddings(texts, batch_size=16):
    """Encode text with IndoBERT and return the [CLS] token embedding per text.

    Args:
        texts: A single string or a list of strings.
        batch_size: Number of texts sent through the encoder per forward pass.
            Processing in chunks keeps peak memory bounded instead of pushing
            the whole corpus through BERT at once.

    Returns:
        A numpy array with one row per input text and ``hidden_size`` columns.
        A single string yields an array with exactly one row; an empty list
        yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Nothing to encode: return an empty (0, hidden_size) array rather than
    # handing an empty batch to the tokenizer.
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad to the longest text in this batch; truncate anything beyond 512 tokens.
        inputs = tokenizer(batch, return_tensors='tf', padding=True, truncation=True, max_length=512)
        outputs = bert_model(inputs)
        # Position 0 of the sequence axis is the [CLS] token.
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].numpy())

    return np.concatenate(cls_chunks, axis=0)

In [None]:
# 4. Prepare Training Data for Keras model
# Generate embeddings for all hotel descriptions. This is computed exactly
# once: the dataframe is not modified between here and the split below, so a
# second BERT pass would only repeat the same (expensive) work.
hotel_desc_embeddings = get_bert_embeddings(df['Description'].tolist())

# Encode hotel names as integer class labels for the classification task
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Name'])

# NOTE(review): the target is the hotel name itself. If names are (nearly)
# unique per row, most validation rows belong to classes that never appear in
# the training split, so validation accuracy/loss would carry little signal.
# TODO confirm against the data and reconsider the training objective.

# Split into train and validation set
X_train, X_val, y_train, y_val = train_test_split(hotel_desc_embeddings, labels, test_size=0.2, random_state=42)


In [None]:
# 5. Build tunable keras model function for hyperparameter tuning
def build_model(hp):
    """Build and compile a dense classifier whose shape is driven by ``hp``.

    Search space registered on ``hp``:
        * ``num_layers``    - 1 to 3 hidden blocks
        * ``units_{i}``     - 64 to 512 units (step 64) for hidden block ``i``
        * ``dropout_{i}``   - 0.1 to 0.5 (step 0.1) dropout after block ``i``
        * ``learning_rate`` - one of 1e-2, 1e-3, 1e-4 for Adam

    The input width is taken from ``hotel_desc_embeddings`` and the number of
    output classes from ``label_encoder``.

    Returns:
        A compiled ``tf.keras.Model`` with a softmax output, trained with
        sparse categorical cross-entropy and reporting accuracy.
    """
    embedding_dim = hotel_desc_embeddings.shape[1]
    model_input = tf.keras.Input(shape=(embedding_dim,))

    # Stack Dense -> Dropout blocks; the depth itself is a hyperparameter.
    hidden = model_input
    depth = hp.Int('num_layers', 1, 3)
    for layer_idx in range(depth):
        hidden = tf.keras.layers.Dense(
            hp.Int(f'units_{layer_idx}', min_value=64, max_value=512, step=64),
            activation='relu',
        )(hidden)
        hidden = tf.keras.layers.Dropout(
            hp.Float(f'dropout_{layer_idx}', 0.1, 0.5, step=0.1)
        )(hidden)

    # One softmax unit per encoded class label.
    n_classes = len(label_encoder.classes_)
    class_probs = tf.keras.layers.Dense(n_classes, activation='softmax')(hidden)

    classifier = tf.keras.Model(inputs=model_input, outputs=class_probs)

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    )
    classifier.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# 6. Run hyperparameter tuning with Keras Tuner Hyperband
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    directory='kt_tuner_dir',
    project_name='hotel_nlp_recommendation',
    # Start every run from a clean slate. Without this, results left in
    # kt_tuner_dir by an earlier session are reloaded and the search below
    # returns immediately with those stale trials, even if the data or
    # build_model has changed since. Set to False only to deliberately resume.
    overwrite=True,
)

# Early stopping callback to reduce overfitting
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

print("Starting hyperparameter search...")
tuner.search(X_train, y_train, epochs=20, validation_data=(X_val, y_val), callbacks=[stop_early])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters found:")
print(f"Number of layers: {best_hps.get('num_layers')}")
# Only the first `num_layers` per-layer entries are used by the chosen model.
for i in range(best_hps.get('num_layers')):
    print(f"Layer {i} units: {best_hps.get(f'units_{i}')}, dropout: {best_hps.get(f'dropout_{i}')}")

print(f"Learning rate: {best_hps.get('learning_rate')}")

Reloading Tuner from kt_tuner_dir/hotel_nlp_recommendation/tuner0.json
Starting hyperparameter search...
Best hyperparameters found:
Number of layers: 2
Layer 0 units: 512, dropout: 0.5
Layer 1 units: 64, dropout: 0.1
Learning rate: 0.0001


In [None]:
# 7. Build the best model and train fully
# Instantiate a new network from the winning hyperparameters of the search.
model = tuner.hypermodel.build(best_hps)

# Training settings for the final fit: up to 30 epochs, monitored on the
# validation split, with the same early-stopping callback used in the search.
final_fit_options = {
    'epochs': 30,
    'validation_data': (X_val, y_val),
    'callbacks': [stop_early],
}
history = model.fit(X_train, y_train, **final_fit_options)

Epoch 1/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4s/step - accuracy: 0.0109 - loss: 4.9049 - val_accuracy: 0.0556 - val_loss: 4.7097
Epoch 2/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 90ms/step - accuracy: 0.0000e+00 - loss: 4.8556 - val_accuracy: 0.0556 - val_loss: 4.7460
Epoch 3/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.0109 - loss: 4.7994 - val_accuracy: 0.0556 - val_loss: 4.7810
Epoch 4/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 0.0069 - loss: 4.6832 - val_accuracy: 0.0000e+00 - val_loss: 4.8232
Epoch 5/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.0109 - loss: 4.5706 - val_accuracy: 0.0000e+00 - val_loss: 4.8532
Epoch 6/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.0443 - loss: 4.4896 - val_accuracy: 0.0000e+00 - val_loss: 4.8617


In [None]:
# 8. Helper function to recommend hotels based on similarity of embeddings refined by trained Keras model
def recommend_hotels(user_prompt, location, top_n=5):
    """Rank hotels in ``location`` by similarity to a free-text prompt.

    The prompt and the hotel descriptions are both embedded with IndoBERT and
    then projected through the trained Keras model up to its second-to-last
    layer; hotels are ranked by cosine similarity in that projected space.

    Args:
        user_prompt: Free-text description of what the user is looking for.
        location: Value to match against the ``Lokasi`` column
            (case-insensitive exact match).
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(hotel_name, similarity)`` tuples sorted by descending
        similarity. The list is empty when no hotel matches ``location``.
    """
    # Rows with a missing Lokasi compare as False and are therefore excluded.
    location_mask = (df['Lokasi'].str.lower() == location.lower()).to_numpy()

    # Use row *positions*, not index labels: hotel_desc_embeddings is a plain
    # array aligned with the row order of df, so labels are only valid
    # offsets when df happens to carry a default RangeIndex.
    row_positions = np.flatnonzero(location_mask)
    if row_positions.size == 0:
        # No candidates; cosine_similarity would raise on an empty matrix.
        return []

    # Sub-model that stops at the layer just before the softmax classifier.
    intermediate_layer_model = tf.keras.Model(
        inputs=model.input,
        outputs=model.layers[-2].output
    )

    user_embedding = get_bert_embeddings(user_prompt)
    user_refined_embedding = intermediate_layer_model(user_embedding).numpy()

    # Project all candidate hotels in one batched call instead of row by row.
    filtered_embeddings = intermediate_layer_model(hotel_desc_embeddings[row_positions]).numpy()
    filtered_names = df['Name'].to_numpy()[row_positions]

    sims = cosine_similarity(user_refined_embedding, filtered_embeddings)[0]
    top_indices = sims.argsort()[::-1][:top_n]
    return [(filtered_names[i], sims[i]) for i in top_indices]

In [None]:
# 9. Example usage: query the recommender once tuning and training are done
if __name__ == "__main__":
    user_prompt, location = "hotel dengan kolam renang", "Kota Malang"

    # Header first, then one line per recommendation with a 4-decimal score.
    print(f"Rekomendasi hotel di {location} untuk prompt '{user_prompt}':")
    recs = recommend_hotels(user_prompt, location)
    for hotel_name, similarity in recs:
        print(f"- {hotel_name}: {similarity:.4f}")


Rekomendasi hotel di Kota Malang untuk prompt 'hotel dengan kolam renang':
- Front One Budget Malang by Azana: 0.6831
- The Alana Hotel Malang: 0.6426
- Sans Hotel La Vida Malang: 0.6183
- Sweet Garden Boutique Guest House: 0.5581
- Atria Hotel Malang: 0.5085


In [None]:
# Check the distinct values present in the Lokasi column (missing values
# show up as nan in the result).
lokasi_values = df['Lokasi'].unique()
print(lokasi_values)


['Banyuwangi' 'Mojopanggung' nan 'Blitar' 'Jember' 'Kota Kediri' 'Ngasem'
 'Mojoroto' 'Sidoharjo' 'Lamongan' 'Rejosari' 'Plosowahyu'
 'Tumenggung Baru' 'Banaran' 'Paciran' 'Kota Madiun' 'Kaibon'
 'Kota Malang' 'Surabaya' 'Kota Batu']


In [None]:
# Inspect hotels whose name was merged into the "Other" class.
# NOTE(review): no visible cell in this notebook performs such a merge, so
# this is expected to print an empty frame - confirm whether the merge step
# was removed intentionally.
is_other = df['Name'] == 'Other'
print(df[is_other])


Empty DataFrame
Columns: [Name, Description, Lokasi]
Index: []
