In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [7]:
# Download NLTK stop words
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Load your data
df = pd.read_excel('indonesia_tourism.xlsx', sheet_name="indonesia_tourism")

In [9]:
df['Description']

0      Monumen Nasional atau yang populer disingkat d...
1      Kota tua di Jakarta, yang juga bernama Kota Tu...
2      Dunia Fantasi atau disebut juga Dufan adalah t...
3      Taman Mini Indonesia Indah merupakan suatu kaw...
4      Atlantis Water Adventure atau dikenal dengan A...
                             ...                        
432    Museum Negeri Mpu Tantular adalah sebuah museu...
433    Taman Bungkul adalah taman wisata kota yang te...
434    Air mancur menari atau dancing fountain juga a...
435    Taman Flora adalah salah satu taman kota di Su...
436    Gereja Katolik Kelahiran Santa Perawan Maria m...
Name: Description, Length: 437, dtype: object

In [10]:
# Advanced text preprocessing
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words if w.isalpha()]  # Remove punctuation and numbers
    words = [w for w in words if not w in stop_words]  # Remove stop words
    return ' '.join(words)
description_stopwords = pd.DataFrame()
description_stopwords['description'] = df['Description'].apply(preprocess_text)

In [11]:
description_stopwords

Unnamed: 0,description
0,monumen nasional populer disingkat monas tugu ...
1,kota tua jakarta bernama kota tua berpusat fat...
2,dunia fantasi dufan hiburan terletak kawasan t...
3,taman mini indonesia indah kawasan taman wisat...
4,atlantis water adventure dikenal atlantis anco...
...,...
432,museum negeri mpu tantular museum negeri berlo...
433,taman bungkul taman wisata kota terletak pusat...
434,air mancur menari dancing fountain kawasan jem...
435,taman flora salah taman kota surabaya fasilita...


In [12]:
# Concatenate place names with reviews
description_stopwords['combined_data'] = df['Place_Name'] + ' ' + description_stopwords['description']

In [13]:
# Prepare the dataset for TensorFlow
texts = description_stopwords['combined_data'].values
labels = df['Place_Name']

In [14]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=200)

In [15]:
# Load GloVe embeddings
def load_glove_embeddings(glove_path, word_index, embedding_dim=100):
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [16]:
glove_path = 'glove.6B.100d.txt'  # Ensure you have the GloVe embeddings file
embedding_dim = 100
embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim)

In [17]:
# Improved Model Architecture
inputs = tf.keras.Input(shape=(200,))
x = tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(embedding_dim)(x)

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='mse')


In [18]:
# Train the Model
text_embeddings = model.predict(data)



In [19]:
similarity_scores_dict = {}

In [20]:
review_df= pd.read_excel('indonesia_tourism_reviews.xlsx')

In [21]:
df

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,...,Jam Operasional (Selasa),Jam Operasional (Rabu),Jam Operasional (Kamis),Jam Operasional (Kamis).1,Jam Operasional (Jumat),Jam Operasional (Sabtu),Jam Operasional (Minggu),"Preferensi user (solo trip, family) cadangan",Jarak,Review
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{""lat"": -6.1753924, ""lng"": 106.8271528}",-6.175392,...,08:00-12:00,08:00-12:00,08:00-12:00,08:00-12:00,08:00-12:00,08:00-12:00,08:00-12:00,,,
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{""lat"": -6.137644799999999, ""lng"": 106.8171245}",-6.137645,...,,,,,,,,,,
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{""lat"": -6.125312399999999, ""lng"": 106.8335377}",-6.125312,...,,,,,,,,,,
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,240.0,"{""lat"": -6.302445899999999, ""lng"": 106.8951559}",-6.302446,...,,,,,,,,,,
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{""lat"": -6.12419, ""lng"": 106.839134}",-6.124190,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,433,Museum Mpu Tantular,Museum Negeri Mpu Tantular adalah sebuah museu...,Budaya,Surabaya,2000,4.4,45.0,"{""lat"": -7.4338593, ""lng"": 112.7199058}",-7.433859,...,,,,,,,,,,
433,434,Taman Bungkul,Taman Bungkul adalah taman wisata kota yang te...,Taman Hiburan,Surabaya,0,4.6,,"{""lat"": -7.291346799999999, ""lng"": 112.7398218}",-7.291347,...,,,,,,,,,,
434,435,Taman Air Mancur Menari Kenjeran,Air mancur menari atau dancing fountain juga a...,Taman Hiburan,Surabaya,0,4.4,45.0,"{""lat"": -7.2752955, ""lng"": 112.7549381}",-7.275296,...,,,,,,,,,,
435,436,Taman Flora Bratang Surabaya,Taman Flora adalah salah satu taman kota di Su...,Taman Hiburan,Surabaya,0,4.6,,"{""lat"": -7.294330299999999, ""lng"": 112.7617534}",-7.294330,...,,,,,,,,,,


In [22]:
# Recommendations
def get_similarity_scores(selected_text_embedding, text_embeddings):
    similarity_scores = cosine_similarity(selected_text_embedding.reshape(1, -1), text_embeddings)
    return similarity_scores

In [23]:
print('Museum Pendidikan Nasional' in df['Place_Name'].values)

True


In [24]:
import pandas as pd

# Initialize an empty DataFrame
similarities_df = pd.DataFrame(columns=['Place', 'Similar Place', 'Score'])

# Iterate through each destination
for selected_place in df['Place_Name']:
    selected_place_index = df.index[df['Place_Name'] == selected_place].tolist()[0]
    selected_place_embedding = text_embeddings[selected_place_index]
    
    similarity_scores = get_similarity_scores(selected_place_embedding, text_embeddings)
    sorted_similarities = sorted(list(enumerate(similarity_scores[0])), key=lambda x: x[1], reverse=True)
    
    # Accumulate the top similar places in a list of dictionaries
    rows_to_add = []
    for idx, score in sorted_similarities:
        if idx != selected_place_index:  # Exclude the place itself from its similarity list
            similar_place = df.loc[idx, 'Place_Name']
            rows_to_add.append({'Place': selected_place, 'Similar Place': similar_place, 'Score': score})
    
    # Convert the list of dictionaries to a DataFrame and concatenate
    similarities_df = pd.concat([similarities_df, pd.DataFrame(rows_to_add)], ignore_index=True)

# Display the DataFrame
print(similarities_df)

# Optionally, save to a CSV file
similarities_df.to_csv('similarities.csv', index=False)


  similarities_df = pd.concat([similarities_df, pd.DataFrame(rows_to_add)], ignore_index=True)


                                            Place  \
0                                Monumen Nasional   
1                                Monumen Nasional   
2                                Monumen Nasional   
3                                Monumen Nasional   
4                                Monumen Nasional   
...                                           ...   
190527  Gereja Perawan Maria Tak Berdosa Surabaya   
190528  Gereja Perawan Maria Tak Berdosa Surabaya   
190529  Gereja Perawan Maria Tak Berdosa Surabaya   
190530  Gereja Perawan Maria Tak Berdosa Surabaya   
190531  Gereja Perawan Maria Tak Berdosa Surabaya   

                               Similar Place     Score  
0                        Jembatan Kota Intan  0.822475  
1       Monumen Perjuangan Rakyat Jawa Barat  0.818451  
2                 Sindu Kusuma Edupark (SKE)  0.813571  
3           Istana Negara Republik Indonesia  0.808312  
4                        Kawasan Kuliner BSM  0.803506  
...                  

In [25]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Enable Select TensorFlow ops
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # TFLite built-in ops.
    tf.lite.OpsSet.SELECT_TF_OPS     # Select TensorFlow ops.
]

# Disable experimental lowering of TensorList ops
converter._experimental_lower_tensor_list_ops = False

# Convert the model
try:
    tflite_model = converter.convert()
    # Save the TFLite model
    with open('model.tflite', 'wb') as f:
        f.write(tflite_model)
    print("Model conversion successful!")
except Exception as e:
    print("Model conversion failed:", e)
# tflite_model = converter.convert()

INFO:tensorflow:Assets written to: C:\Users\asus\AppData\Local\Temp\tmpd4atpuga\assets


INFO:tensorflow:Assets written to: C:\Users\asus\AppData\Local\Temp\tmpd4atpuga\assets


Model conversion successful!


In [27]:
# Save the tokenizer
import json

tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as f:
    f.write(tokenizer_json)