In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import TFAutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import os

In [None]:
class TFSentenceTransformer(tf.keras.Model):
    """Sentence encoder built from a Hugging Face TF model and its tokenizer.

    A sentence vector is the attention-mask-weighted mean of the token
    embeddings, scaled to unit L2 norm.
    """

    def __init__(self, base_model, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.base_model = base_model

    @classmethod
    def from_pretrained(cls, model_name):
        """Create an instance from a model name (e.g. a Hugging Face Hub id)."""
        return cls(
            tokenizer=AutoTokenizer.from_pretrained(model_name),
            base_model=TFAutoModel.from_pretrained(model_name),
        )

    @classmethod
    def from_local(cls, model_path):
        """Create an instance from a local directory, never hitting the network."""
        return cls(
            tokenizer=AutoTokenizer.from_pretrained(model_path, local_files_only=True),
            base_model=TFAutoModel.from_pretrained(model_path, local_files_only=True),
        )

    def save(self, save_path):
        """Write the base model and the tokenizer into ``save_path``."""
        self.base_model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"Model disimpan di: {save_path}")

    def mean_pooling(self, model_output, attention_mask):
        """Average ``last_hidden_state`` over axis 1, ignoring masked positions."""
        mask = tf.cast(tf.expand_dims(attention_mask, -1), tf.float32)
        masked_sum = tf.reduce_sum(model_output.last_hidden_state * mask, axis=1)
        # The floor keeps the division finite when a row's mask sums to zero.
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1e-9)
        return masked_sum / token_count

    def encode(self, sentences):
        """Return L2-normalised sentence embeddings for ``sentences`` as a NumPy array."""
        batch = self.tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='tf')
        mask = batch['attention_mask']
        # Plain forward pass in inference mode; no gradient tape is needed here.
        outputs = self.base_model(
            input_ids=batch['input_ids'],
            attention_mask=mask,
            training=False,
        )
        pooled = self.mean_pooling(outputs, mask)
        return tf.nn.l2_normalize(pooled, axis=1).numpy()

In [None]:
class TravelRecommendationChatbot:
    """Recommend travel locations by embedding similarity to a text query.

    Typical use: ``load_travel_data`` -> ``generate_embeddings`` -> ``chat``.
    ``model`` must expose ``encode(list_of_str)`` returning a 2-D array with
    one row per input sentence.
    """

    def __init__(self, model):
        self.model = model
        self.travel_data = None          # DataFrame, set by load_travel_data
        self.location_embeddings = None  # 2-D array, set by generate_embeddings

    def load_travel_data(self, csv_path):
        """Read the CSV, drop rows with missing values and build ``combined_text``.

        The CSV must contain the columns ``Name``, ``Description``,
        ``Categories`` and ``Lokasi``.
        """
        print("Loading travel data...")
        # reset_index keeps positional (iloc) and label indexing aligned after dropna.
        self.travel_data = pd.read_csv(csv_path).dropna().reset_index(drop=True)
        self.travel_data['combined_text'] = (
            self.travel_data['Name'] + ". " +
            self.travel_data['Description'] + ". " +
            self.travel_data['Categories'] + ". " +
            self.travel_data['Lokasi']
        )
        print(f"Loaded {len(self.travel_data)} travel locations")

    def generate_embeddings(self, batch_size=16):
        """Encode every ``combined_text`` in chunks of ``batch_size`` rows.

        Encoding in batches bounds peak memory instead of pushing the whole
        corpus through the model in a single forward pass.

        Raises:
            RuntimeError: if ``load_travel_data`` has not been called.
            ValueError: if ``batch_size`` is not positive or there is no data.
        """
        if self.travel_data is None:
            raise RuntimeError("Travel data not loaded; call load_travel_data() first.")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        print("Generating embeddings for travel locations...")
        combined_texts = self.travel_data['combined_text'].tolist()
        if not combined_texts:
            raise ValueError("No travel locations available to embed.")

        batches = [
            np.asarray(self.model.encode(combined_texts[start:start + batch_size]))
            for start in range(0, len(combined_texts), batch_size)
        ]
        self.location_embeddings = np.vstack(batches)
        print("Embeddings generated successfully!")

    def get_recommendations(self, query, top_n=5):
        """Return the ``top_n`` locations most similar to ``query``, best first.

        Each result is a dict with keys ``name``, ``description``,
        ``category``, ``location`` and ``similarity_score``.

        Raises:
            RuntimeError: if ``generate_embeddings`` has not been called.
        """
        if self.travel_data is None or self.location_embeddings is None:
            raise RuntimeError(
                "Embeddings not available; call load_travel_data() and "
                "generate_embeddings() first."
            )
        processed_query = query.lower().strip()
        query_embedding = self.model.encode([processed_query])
        similarities = cosine_similarity(query_embedding, self.location_embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_n]

        recommendations = []
        for idx in top_indices:
            row = self.travel_data.iloc[idx]  # one row lookup per hit
            recommendations.append({
                'name': row['Name'],
                'description': row['Description'],
                'category': row['Categories'],
                'location': row['Lokasi'],
                'similarity_score': similarities[idx]
            })
        return recommendations

    def format_response(self, recommendations):
        """Render recommendations as a numbered, human-readable text block."""
        if not recommendations:
            return "Maaf, saya tidak menemukan rekomendasi yang sesuai."
        response = "Berikut rekomendasi tempat wisata:\n\n"
        for i, rec in enumerate(recommendations, 1):
            # Descriptions longer than 200 characters are cut and marked with an ellipsis.
            desc = rec['description'][:200] + ('...' if len(rec['description']) > 200 else '')
            response += (
                f"{i}. **{rec['name']}** ({rec['location']})\n"
                f"   Kategori: {rec['category']}\n"
                f"   {desc}\n"
                f"   Skor Kesesuaian: {rec['similarity_score']:.3f}\n\n"
            )
        return response

    def chat(self, user_input):
        """Answer ``user_input`` with formatted recommendations.

        Any exception is reported as text rather than raised, so an
        interactive session keeps running.
        """
        try:
            recommendations = self.get_recommendations(user_input)
            return self.format_response(recommendations)
        except Exception as e:
            return f"Terjadi kesalahan: {str(e)}"

In [None]:
def fine_tune_model(model, training_data, epochs=3, batch_size=8, learning_rate=2e-5):
    """Fine-tune ``model.base_model`` so cosine similarity matches target scores.

    Args:
        model: object exposing ``tokenizer``, ``base_model`` and
            ``mean_pooling(model_output, attention_mask)``.
        training_data: iterable of ``(query, location_text, score)`` triples.
        epochs: number of passes over ``training_data``.
        batch_size: number of triples per gradient step.
        learning_rate: Adam learning rate.

    Raises:
        ValueError: if ``training_data`` is empty or ``batch_size`` < 1.
    """
    training_data = list(training_data)  # also accepts generators
    if not training_data:
        raise ValueError(
            "training_data must contain at least one (query, location, score) triple"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Fine-tuning model...")
    queries, locations, scores = zip(*training_data)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.MeanSquaredError()

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for i in range(0, len(queries), batch_size):
            batch_q = list(queries[i:i+batch_size])
            batch_l = list(locations[i:i+batch_size])
            batch_s = tf.constant(scores[i:i+batch_size], dtype=tf.float32)

            # max_length=512 matches the truncation used by encode() at inference time.
            q_inputs = model.tokenizer(
                batch_q, padding=True, truncation=True, max_length=512, return_tensors="tf"
            )
            l_inputs = model.tokenizer(
                batch_l, padding=True, truncation=True, max_length=512, return_tensors="tf"
            )

            with tf.GradientTape() as tape:
                q_emb = model.mean_pooling(model.base_model(**q_inputs), q_inputs['attention_mask'])
                l_emb = model.mean_pooling(model.base_model(**l_inputs), l_inputs['attention_mask'])
                q_emb = tf.nn.l2_normalize(q_emb, axis=1)
                l_emb = tf.nn.l2_normalize(l_emb, axis=1)
                # Dot product of unit vectors is the cosine similarity.
                similarity = tf.reduce_sum(q_emb * l_emb, axis=1)
                loss = loss_fn(batch_s, similarity)

            variables = model.base_model.trainable_variables
            grads = tape.gradient(loss, variables)
            # Variables not connected to the loss yield None gradients;
            # drop them instead of passing them to the optimizer.
            grads_and_vars = [(g, v) for g, v in zip(grads, variables) if g is not None]
            optimizer.apply_gradients(grads_and_vars)
            print(f"Batch {i//batch_size + 1}, Loss: {loss.numpy():.4f}")
    print("Fine-tuning completed.")


In [None]:
def train_and_save_model(
    base_model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    data_path="cleaned_data_wisata.csv",
    output_dir="./fine_tuned_model",
    epochs=5,
):
    """Fine-tune the base sentence model on a few example pairs and save it.

    Args:
        base_model_name: model identifier passed to
            ``TFSentenceTransformer.from_pretrained``.
        data_path: CSV with ``Name``, ``Description``, ``Categories`` and
            ``Lokasi`` columns.
        output_dir: directory the fine-tuned model and tokenizer are written to.
        epochs: number of fine-tuning epochs.

    Returns:
        ``output_dir``, so the caller can reload the model from disk.

    Raises:
        ValueError: if the CSV has fewer usable rows than example queries.
    """
    print("--- Memulai Sesi Pelatihan ---")

    # 1. Read the data first so a missing or short CSV fails before the model is loaded.
    travel_df = pd.read_csv(data_path).dropna().reset_index(drop=True)
    combined_text = (
        travel_df['Name'] + ". " +
        travel_df['Description'] + ". " +
        travel_df['Categories'] + ". " +
        travel_df['Lokasi']
    )

    # Query i is paired with row i of the cleaned CSV.
    # NOTE(review): assumes the first rows really match these queries; confirm against the data.
    example_queries = [
        ("tempat wisata yang sejuk dan alami", 0.95),
        ("pantai yang indah dan bersih", 0.92),
        ("tempat bersejarah dan edukatif", 0.89),
        ("wisata religi menarik", 0.93),
        ("air terjun yang menyejukkan", 0.94),
    ]
    if len(combined_text) < len(example_queries):
        raise ValueError(
            f"{data_path} has {len(combined_text)} usable rows; "
            f"at least {len(example_queries)} are required"
        )
    training_data = [
        (query, combined_text.iloc[position], score)
        for position, (query, score) in enumerate(example_queries)
    ]

    # 2. Load the base model and tokenizer.
    model = TFSentenceTransformer.from_pretrained(base_model_name)

    # 3. Fine-tune.
    fine_tune_model(model, training_data, epochs=epochs)

    # 4. Save model and tokenizer.
    model.save(output_dir)
    print("--- Sesi Pelatihan Selesai ---")
    return output_dir

In [None]:
def run_chatbot_from_local(model_path, data_path="cleaned_data_wisata.csv"):
    """Load a saved model from ``model_path`` and run an interactive chat loop.

    Args:
        model_path: directory containing the saved model and tokenizer.
        data_path: CSV of travel locations to recommend from.

    The loop ends when the user types ``keluar``, ``exit`` or ``quit``, or
    when input is interrupted (Ctrl-C / kernel interrupt) or reaches EOF.
    """
    print("\n--- Memulai Sesi Chatbot (Inferensi) ---")

    # 1. Load the fine-tuned model and tokenizer from disk.
    print(f"Memuat model dari: {model_path}")
    model = TFSentenceTransformer.from_local(model_path)

    # 2. Build the chatbot around the loaded model.
    chatbot = TravelRecommendationChatbot(model)

    # 3. Load the travel data and pre-compute its embeddings.
    chatbot.load_travel_data(data_path)
    chatbot.generate_embeddings()

    # 4. Interactive loop.
    exit_commands = {'keluar', 'exit', 'quit'}
    print("\n🏝️ Travel Chatbot Siap! (Ketik 'keluar' untuk berhenti)\n")
    try:
        while True:
            user_input = input("Anda: ")
            command = user_input.strip()
            if not command:
                # Nothing to search for; prompt again.
                continue
            if command.lower() in exit_commands:
                print("Terima kasih! Sampai jumpa 🌴")
                break
            print("\nChatbot:", chatbot.chat(user_input))
            print("-" * 50)
    except (KeyboardInterrupt, EOFError):
        # An interrupted or closed input stream ends the session without a traceback.
        print("\nTerima kasih! Sampai jumpa 🌴")

In [None]:
# Train and persist the model, then start the chatbot from the saved copy.
fine_tuned_model_path = train_and_save_model()
run_chatbot_from_local(model_path=fine_tuned_model_path)

--- Memulai Sesi Pelatihan ---


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Fine-tuning model...
Epoch 1/5




Batch 1, Loss: 0.3975
Epoch 2/5
Batch 1, Loss: 0.2335
Epoch 3/5
Batch 1, Loss: 0.1298
Epoch 4/5
Batch 1, Loss: 0.0721
Epoch 5/5
Batch 1, Loss: 0.0358
Fine-tuning completed.
Model disimpan di: ./fine_tuned_model
--- Sesi Pelatihan Selesai ---

--- Memulai Sesi Chatbot (Inferensi) ---
Memuat model dari: ./fine_tuned_model


All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at ./fine_tuned_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Loading travel data...
Loaded 50 travel locations
Generating embeddings for travel locations...
Embeddings generated successfully!

🏝️ Travel Chatbot Siap! (Ketik 'keluar' untuk berhenti)

Anda: rekomendasi tempat wisata gunung

Chatbot: Berikut rekomendasi tempat wisata:

1. **Kawah Ijen** (Banyuwangi)
   Kategori: Alam
   Kawah Ijen terkenal dengan fenomena "blue fire"-nya yang langka dan danau asam berwarna toska yang menakjubkan. Pendakian malam menjadi favorit bagi para petualang. Terletak di perbatasan Banyuwangi d...
   Skor Kesesuaian: 0.818

2. **Coban Rondo** (Kabupaten Malang)
   Kategori: Air Terjun
   Air terjun alami yang mudah diakses dengan pemandangan tropis yang asri dan menenangkan. Terletak di kawasan Pujon, air terjun ini dilengkapi dengan area duduk dan bangku yang nyaman, cocok untuk pikn...
   Skor Kesesuaian: 0.780

3. **Coban Pelangi** (Gubukklakah)
   Kategori: Air Terjun
   Air terjun besar yang dapat dicapai dengan jalur hiking melalui hutan yang rindang da

KeyboardInterrupt: Interrupted by user