# Import libraries

In [None]:
#pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.0-cp312-cp312-win_amd64.whl.metadata (29 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.31.2-py3-none-any.whl.metadata (13 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-4.1.0

In [1]:
import matplotlib.pyplot as plt
import numpy as np

# chuyển văn bản sang vector
from sentence_transformers import SentenceTransformer, util

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# giảm chiều vector để trực quan hóa
from sklearn.decomposition import PCA
from unidecode import unidecode
import re
from datetime import datetime

# tính độ tương đồng cosine
from sklearn.metrics.pairwise import linear_kernel

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Read data

In [None]:
# Read the input data (hotel and tourist CSV files; paths are relative to the working directory).
new_hotels = pd.read_csv('../hotels_data_final.csv')
new_tourists = pd.read_csv('../tourist_dataset_10k.csv')

# BERT (nhập vào giá trị mong muốn)
Nhận các giá trị đầu vào: location, input_text, checkin_range, checkout_range, min_budget, max_budget.

### Các hàm xử lý

In [3]:
# Text normalisation
def preprocess_text(text):
    """Return ``text`` lower-cased and passed through ``unidecode``; null values become ''."""
    if not pd.isnull(text):
        # Lower-case first, then strip diacritics via unidecode.
        lowered = text.lower()
        return unidecode(lowered)
    return ''

# Hotel price parsing
def parse_price(price_str):
    """Convert a price string such as ``"4.171.758 VND"`` to an integer amount.

    Dots are treated as thousands separators and the ``VND`` suffix is removed
    whether or not a space precedes it.

    Args:
        price_str: Price text. Non-string values (``None``, NaN, ...) are tolerated.

    Returns:
        The price as ``int``, or ``None`` when the value cannot be parsed.
    """
    try:
        digits = price_str.replace(".", "").replace("VND", "").strip()
        return int(digits)
    except (AttributeError, TypeError, ValueError):
        # Non-string input (no .replace) or non-numeric text means "no usable price".
        # Deliberately narrower than a bare except so KeyboardInterrupt still propagates.
        return None

# Tourist budget-range parsing
def parse_budget_range(budget_str):
    """Split a budget string such as ``"500.000 - 1.000.000 VND"`` into two integers.

    Dots are treated as thousands separators; ``VND`` suffixes are removed with
    or without a preceding space. Only the first two dash-separated parts are used.

    Args:
        budget_str: Budget text. Non-string values (``None``, NaN, ...) are tolerated.

    Returns:
        ``(min_budget, max_budget)`` as ints, or ``(None, None)`` when the value
        cannot be parsed (wrong type, missing bound, non-numeric text).
    """
    try:
        # Unpacking raises ValueError when the upper bound is missing.
        low_text, high_text = budget_str.replace("VND", "").split("-")[:2]
        min_budget = int(low_text.replace(".", "").strip())
        max_budget = int(high_text.replace(".", "").strip())
        return min_budget, max_budget
    except (AttributeError, TypeError, ValueError):
        # Deliberately narrower than a bare except so KeyboardInterrupt still propagates.
        return None, None

# Time-window normalisation
def parse_time_range(text):
    """Turn a check-in/check-out description into an ``(start_hour, end_hour)`` tuple.

    Only the hour part of each ``HH:MM`` time is kept (minutes are dropped).
    Rules, applied to the lower-cased text:

    * contains 'phục vụ 24h'                 -> ``(0, 24)``
    * one time with 'từ' but not 'đến'       -> ``(hour, 24)``
    * one time with 'đến' but not 'từ'       -> ``(0, hour)``
    * two times                              -> ``(first, second)``, or ``(0, 24)``
      when both hours are equal
    * anything else                          -> ``(0, 24)``

    An end time of ``00:00`` is read as midnight at the end of the day (24),
    otherwise the resulting range would be empty.

    Args:
        text: Description text. Missing values (``None``, NaN) and other
            non-strings are treated as "no restriction".

    Returns:
        Tuple ``(start_hour, end_hour)`` with hours in ``0..24``.
    """
    full_day = (0, 24)

    # Missing cells come through as NaN/None; there is nothing to restrict.
    if not isinstance(text, str):
        return full_day

    text = text.lower().strip()

    if 'phục vụ 24h' in text:
        return full_day

    hours = []
    for hour_text, minute_text in re.findall(r'(\d{1,2}):(\d{2})', text):
        hour, minute = int(hour_text), int(minute_text)
        # Keep real clock times plus "24:00"; skip impossible ones instead of raising.
        if (hour <= 23 and minute <= 59) or (hour == 24 and minute == 0):
            hours.append(hour)

    has_from = 'từ' in text
    has_until = 'đến' in text

    if len(hours) == 1:
        only_hour = hours[0]
        if has_from and not has_until:
            return (only_hour, 24)
        if has_until and not has_from:
            # "until 00:00" means until the end of the day.
            return (0, only_hour or 24)
        return full_day

    if len(hours) == 2:
        start_hour, end_hour = hours
        if end_hour == 0:
            end_hour = 24
        if start_hour == end_hour:
            return full_day
        return (start_hour, end_hour)

    return full_day

# Time-window comparison
def time_ranges_overlap(range1, range2):
    """Tell whether two ``(start, end)`` ranges share a span of non-zero length.

    Ranges that merely touch (one ends exactly where the other starts) do not count.
    """
    (first_start, first_end), (second_start, second_end) = range1, range2
    latest_start = max(first_start, second_start)
    earliest_end = min(first_end, second_end)
    return latest_start < earliest_end

## Address & Popular Facilities & Rating

In [4]:
import os
import torch

# Multilingual sentence-embedding model, loaded once and reused by the cells below.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Run this cell only once.
def encode_hotels(new_hotels, model):
    """Build one descriptive sentence per hotel and encode them all with ``model``.

    Args:
        new_hotels: DataFrame with the columns 'Province', 'Hotel Name',
            'Address', 'Popular Facilities' and 'Overall Rating'.
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``
            (a SentenceTransformer in this notebook).

    Returns:
        Whatever ``model.encode`` returns for the sentences, one entry per row
        in the frame's row order.
    """
    def generate_hotel_text(row):
        """Join the key text fields with a verbal description of the rating."""
        rating = row['Overall Rating']
        # A NaN rating fails every comparison and falls through to the last label.
        if rating >= 9:
            rating_desc = 'đánh giá xuất sắc'
        elif rating >= 8:
            rating_desc = 'đánh giá tốt'
        elif rating >= 6:
            rating_desc = 'đánh giá trung bình'
        else:
            rating_desc = 'đánh giá kém'

        return (
            f"{row['Province']} {row['Hotel Name']} {row['Address']} "
            f"{row['Popular Facilities']} có {rating_desc} {rating}"
        )

    # Hand the encoder a plain list in row order. A pandas Series is indexed by
    # label, so positional access inside the encoder is only correct for a
    # default RangeIndex; a list is also well-defined for an empty frame.
    texts = [generate_hotel_text(row) for _, row in new_hotels.iterrows()]

    return model.encode(texts, convert_to_tensor=True)

# Encode every hotel row with the loaded model.
hotel_embeddings = encode_hotels(new_hotels, model)

def save_embeddings(file_path='hotel_embeddings.npy', embeddings=None):
    """Write embeddings to ``file_path`` in NumPy ``.npy`` format.

    Args:
        file_path: Destination path for ``np.save``.
        embeddings: Tensor or array-like to store. When omitted, the
            notebook-level ``hotel_embeddings`` variable is used, which keeps
            existing one-argument calls working.
    """
    if embeddings is None:
        # Backward-compatible default: fall back to the notebook global.
        embeddings = hotel_embeddings
    if hasattr(embeddings, 'detach'):
        # Tensor-like input: detach and move to host memory before converting.
        embeddings = embeddings.detach().cpu().numpy()
    np.save(file_path, np.asarray(embeddings))

# Store the embeddings in 'hotel_embeddings.npy' so they can be reloaded from disk later.
save_embeddings('hotel_embeddings.npy')

In [None]:
def load_embeddings(file_path='hotel_embeddings.npy'):
    """Return hotel embeddings, reading the cache at ``file_path`` when it exists.

    On a cache miss the embeddings are computed from the notebook-level
    ``new_hotels`` and ``model``, written to ``file_path``, and returned.
    """
    if not os.path.exists(file_path):
        # Cache miss: compute, store for the next run, and hand back the fresh tensor.
        fresh = encode_hotels(new_hotels, model)
        np.save(file_path, fresh.cpu().numpy())
        return fresh

    cached_array = np.load(file_path)
    return torch.tensor(cached_array)

# Load the embeddings from the cache file, computing and caching them first if the file is missing.
hotel_embeddings = load_embeddings('hotel_embeddings.npy')

In [None]:
def get_recommendations_bert(input_text, location, checkin_range, checkout_range,
                             min_budget, max_budget, hotel_embeddings,
                             hotels=None, encoder=None, min_similarity=0.5):
    """Recommend hotels by semantic similarity plus hard filters.

    A hotel is kept when its similarity to ``input_text`` is strictly above
    ``min_similarity`` AND its province equals ``location`` (case-insensitive),
    its price lies within the budget, and its check-in/check-out windows
    overlap the requested ones.

    Args:
        input_text: Free-text request. Empty/blank/``None`` disables the
            similarity ranking (every hotel gets a score of 1).
        location: Province name to match against the 'Province' column.
        checkin_range: ``(start_hour, end_hour)`` the guest can check in.
        checkout_range: ``(start_hour, end_hour)`` the guest can check out.
        min_budget: Lowest acceptable price (inclusive).
        max_budget: Highest acceptable price (inclusive).
        hotel_embeddings: One embedding per hotel, in the same row order as
            ``hotels``. Not used when ``input_text`` is blank.
        hotels: DataFrame with 'Province', 'Overview Price', 'Checkin Time'
            and 'Checkout Time' columns. Defaults to the notebook-level
            ``new_hotels``.
        encoder: Object with ``encode(text, convert_to_tensor=True)``.
            Defaults to the notebook-level ``model``.
        min_similarity: Hotels scoring at or below this value are dropped.

    Returns:
        DataFrame of matching hotel rows with an extra 'similarity_score'
        column, sorted by that score in descending order. When nothing
        matches, an empty DataFrame with the same columns.

    Raises:
        ValueError: If the number of similarity scores differs from the number
            of hotel rows (embeddings and data are out of sync).
    """
    if hotels is None:
        hotels = new_hotels

    if input_text and input_text.strip():
        if encoder is None:
            encoder = model
        query_embedding = encoder.encode(input_text, convert_to_tensor=True)
        # Embeddings reloaded from disk live on the CPU while the query may be
        # encoded on another device; align them before the matrix product.
        target_device = getattr(hotel_embeddings, 'device', None)
        if target_device is not None and hasattr(query_embedding, 'to'):
            query_embedding = query_embedding.to(target_device)
        scores = util.pytorch_cos_sim(query_embedding, hotel_embeddings)[0].cpu().numpy()
    else:
        # No query text: treat every hotel as a perfect match.
        scores = np.ones(len(hotels))

    if len(scores) != len(hotels):
        raise ValueError(
            f"hotel_embeddings has {len(scores)} rows but the hotel table has "
            f"{len(hotels)}; recompute the embeddings for this data."
        )

    wanted_location = location.strip().lower()
    matched_hotels = []

    # Scores are positional, so pair them with rows by position, not by index label.
    for position, (_, row) in enumerate(hotels.iterrows()):
        similarity = scores[position]
        if similarity <= min_similarity:
            continue  # drop weakly related hotels

        # ----- hard filters -----

        # Location (rows with a missing province can never match).
        province = row['Province']
        if not isinstance(province, str) or province.strip().lower() != wanted_location:
            continue

        # Price
        price = parse_price(row['Overview Price'])
        if price is None or not (min_budget <= price <= max_budget):
            continue

        # Check-in window
        hotel_checkin = parse_time_range(row['Checkin Time'])
        if not time_ranges_overlap(hotel_checkin, checkin_range):
            continue

        # Check-out window
        hotel_checkout = parse_time_range(row['Checkout Time'])
        if not time_ranges_overlap(hotel_checkout, checkout_range):
            continue

        # ----- keep the row -----
        kept = row.copy()
        kept['similarity_score'] = similarity
        matched_hotels.append(kept)

    if not matched_hotels:
        return pd.DataFrame(columns=list(hotels.columns) + ['similarity_score'])

    result_df = pd.DataFrame(matched_hotels)
    return result_df.sort_values(by='similarity_score', ascending=False)

In [8]:
# Sample call to get_recommendations_bert()
input_text = "Tôi cần tìm khách sạn ở quận 1 có đồ ăn sáng và hồ bơi, điểm đánh giá tốt 8.0"
location = 'hồ chí minh'
checkin_range = (0, 24)        # any check-in hour; e.g. (17, 20) would mean 17:00 to 20:00
checkout_range = (0, 24)       # any check-out hour; e.g. (10, 12) would mean 10:00 to 12:00
min_budget = 500_000
max_budget = 50_000_000

df = get_recommendations_bert(
    input_text,
    location,
    checkin_range,
    checkout_range,
    min_budget,
    max_budget,
    hotel_embeddings,
)
df

Unnamed: 0,Hotel URL,Hotel Name,Overview Price,Address,Overall Rating,Staff,Facilities,Cleanliness,Comfort,Value for Money,Location,Free Wifi,Popular Facilities,Checkin Time,Checkout Time,Province,similarity_score
6971,https://www.booking.com/hotel/vn/rex.vi.html,Rex Hotel,4.171.758 VND,"141 Nguyen Hue Blvd, Quận 1, TP. Hồ Chí Minh...",8.1,8.7,8.1,8.6,8.7,7.9,9.6,8.3,"Hồ bơi ngoài trời, Xe đưa đón sân bay, Phòng...",Từ 14:00,Đến 12:00,Hồ Chí Minh,0.689248
6715,https://www.booking.com/hotel/vn/huong-sen.vi....,Huong Sen Hotel,1.457.938 VND,"66-68-70 Dong Khoi, Ben Nghe Ward, Quận 1, TP....",8.5,9.2,8.4,8.8,8.8,8.7,9.7,7.6,"Hồ bơi ngoài trời, Xe đưa đón sân bay, Phòng...",Từ 14:00,Đến 12:00,Hồ Chí Minh,0.677044
7169,https://www.booking.com/hotel/vn/kin-thi-sach....,Kin Hotel Thi Sach,2.461.070 VND,"11a Thi Sách, Quận 1, TP. Hồ Chí Minh, Việt...",8.6,9.4,8.5,8.9,9.1,8.4,9.2,8.1,"Hồ bơi trong nhà, Xe đưa đón sân bay, Phòng ...",Từ 14:00 - 23:30,Từ 00:30 - 12:00,Hồ Chí Minh,0.667037
7237,https://www.booking.com/hotel/vn/silverland-ch...,Au Lac Charner Hotel,2.837.698 VND,"87-89-91 Ho Tung Mau Street, Ben Nghe Ward, Qu...",8.8,9.4,8.8,9.2,9.2,8.6,9.5,9.3,"Hồ bơi ngoài trời, Xe đưa đón sân bay, Phòng...",Từ 14:00,Từ 11:30 - 12:00,Hồ Chí Minh,0.665479
6999,https://www.booking.com/hotel/vn/aem-corner-sa...,A&EM Saigon Hotel,2.713.288 VND,"39-41 Thu Khoa Huan Street, Ben Thanh Ward, Qu...",8.0,8.9,7.8,8.4,8.4,8.1,9.4,8.8,"Hồ bơi ngoài trời, Xe đưa đón sân bay, Phòng...",Từ 14:00,Đến 12:00,Hồ Chí Minh,0.663641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6476,https://www.booking.com/hotel/vn/new-land-apar...,NEW LAND Apartment 2 - Phu My Hung,572.670 VND,"35 Cao Triều Phát, Quận 7, TP. Hồ Chí Minh, ...",7.8,8.4,7.6,7.8,8.2,8.2,8.8,8.3,"Phòng không hút thuốc, WiFi miễn phí",Từ 14:00 - 22:00,Từ 06:00 - 12:00,Hồ Chí Minh,0.303108
7073,https://www.booking.com/hotel/vn/nice-village-...,NICE VILLAGE Nguyen Thi Minh Khai D1,675.000 VND,"Đường Nguyễn Thị Minh Khai, Quận 1, TP. Hồ Ch...",6.9,7.8,7.4,7.8,7.6,7.6,7.2,7.5,"WiFi miễn phí, Điều hòa nhiệt độ",Từ 14:00 - 23:00,Từ 05:00 - 12:00,Hồ Chí Minh,0.302943
7180,https://www.booking.com/hotel/vn/sweet-home-bi...,SwEEt HomE,1.069.462 VND,"31/57 Ung Văn Khiêm, Quận Bình Thạnh, 70000 TP...",6.8,7.9,7.5,7.9,8.3,7.9,7.9,8.0,"WiFi miễn phí, Chỗ đỗ xe miễn phí, Điều ...",Từ 12:00 - 23:00,Từ 11:00 - 12:00,Hồ Chí Minh,0.302364
6853,https://www.booking.com/hotel/vn/sai-gon-pavil...,Sai Gon Pavillon Bà Huyện Thanh Quan Quận 3,1.800.000 VND,"53 Bà Huyện Thanh Quan, Quận 3, TP. Hồ Chí M...",9.0,9.4,9.2,9.3,9.3,9.1,9.5,9.5,"Xe đưa đón sân bay, Phòng không hút thuốc,...",Từ 15:00 - 23:00,Từ 00:00 - 12:00,Hồ Chí Minh,0.300263


In [None]:
# Save the recommendation table to CSV without the index column ('utf-8-sig' is UTF-8 with a BOM).
df.to_csv('recommendations_bert_1.csv', index=False, encoding='utf-8-sig')