In [40]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sortedcontainers import SortedList
from sklearn.model_selection import train_test_split
import math
from datetime import datetime
import re



In [41]:
df = pd.read_csv('../dataset/test.tours.csv')
df = shuffle(df)

In [42]:
for i in range(len(df)):
        period = df['period'][i]
        try:
            days = int(period.split('ngày')[0])
        except:
            days = 0
        try:
            nights = int(period.split('đêm')[0])
        except:
            nights = 0
        total_hours = (days + nights) * 12
        df.at[i, 'period'] = total_hours

In [43]:
df.head(1)

Unnamed: 0,_id,name,description,price,departureLocation,period,images,destination,departureDay,createdAt,__v,numLikes
34,65db0bd40b1aa37831a6b49b,Du lịch Đông Bắc mùa Xuân - Hà Nội - Hà Giang ...,Hành trình Đông Bắc: Hà Giang - Quản Bạ - Đồng...,8199000,Hồ Chí Minh,48,//dulichviet.com.vn/images/bandidau/NOI-DIA/So...,Hà Giang - Quản Bạ - Đồng Văn - Lũng Cú - Mèo ...,2024-05-22T00:00:00.000Z,2024-02-25T09:43:48.446Z,0.0,


In [44]:
tours = {}
for row in df.values:
  _id, name,description, price, departureLocation, period, images,destination,departureDay,createdAt,__v,numLikes = row
  tours[_id] = {
      'id': _id,
      'name': name,
      'description': description,
      'price': price,
      'departureLocation': departureLocation,
      'period': period,
      'images': images,
      'destination':destination,
      'departureDay': departureDay,
      'createdAt':createdAt,
      '__v': __v,
      'numLikes':numLikes
  }

In [45]:
def get_sim_name(tour_id_i,tour_id_j):
    name_i = tours[tour_id_i]['name']
    name_j = tours[tour_id_j]['name']

   
    if(name_i == name_j):
        return 0.2

    return 0

def get_sim_departureLocation(tour_id_i,tour_id_j):
    departureLocation_i = set(tours[tour_id_i]['departureLocation'])
    departureLocation_j = set(tours[tour_id_j]['departureLocation'])
    if(departureLocation_i == departureLocation_j):
        return 0.2
    return  0

def get_sim_price(tour_id_i,tour_id_j):
    price_i = tours[tour_id_i]['price']
    price_j = tours[tour_id_j]['price']
    if price_i == 0 or price_j == 0:
            return 0
    max_price = max(price_i, price_j)

    min_price = min(price_i, price_j)

    similarity = abs(min_price / max_price)
    return similarity * 0.2

def get_sim_period(tour_id_i,tour_id_j):
    period_i = tours[tour_id_i]['period']
    period_j = tours[tour_id_j]['period']
    diff = abs(period_i - period_j)

    if diff <= 12:
        return 0.2
    elif diff <= 24:
        return 0.2 * 0.8
    elif diff <= 48:
        return 0.2 * 0.6
    return 0

def get_sim_departureDay(tour_id_i,tour_id_j):
    departure_day_i = datetime.fromisoformat(tours[tour_id_i]['departureDay'].split('T')[0])
    departure_day_j = datetime.fromisoformat(tours[tour_id_j]['departureDay'].split('T')[0])

    similarity = 1 / (1 + abs((departure_day_i - departure_day_j).days))

    return similarity * 0.2

def get_tour_similarities(tour_id_i,tour_id_j):
    sim_name = get_sim_name(tour_id_i,tour_id_j)
    sim_departureLocation = get_sim_departureLocation(tour_id_i,tour_id_j)
    sim_price = get_sim_price(tour_id_i,tour_id_j)
    sim_period = get_sim_period(tour_id_i,tour_id_j)
    sim_departureDay = get_sim_departureDay(tour_id_i,tour_id_j)

    return sim_name + sim_departureLocation + sim_price + sim_period + sim_departureDay

In [51]:
# get tour similarities15680000
tour_id = '65cc76cf4da78f7cdfcc4f5b'
k = 10
list = SortedList()
for id in tours.keys():
  if(id == tour_id):
    continue
  sim = get_tour_similarities(tour_id,id)
  list.add((sim, tours[id]['id']))
  if len(list) > k:
    del list[0]
sorted_list = sorted(list, reverse=True)  # Sắp xếp danh sách theo độ tương tự giảm dần
top_k = sorted_list[:k]  # Lấy ra k phần tử đầu tiên
top_k

[(0.5658163265306122, '65ba0f8acfbe79273d91e0dc'),
 (0.5501167582417583, '65db0bda0b1aa37831a6b4bd'),
 (0.5455565252416756, '65db0bfc0b1aa37831a6b585'),
 (0.5388833487940631, '65db0bf60b1aa37831a6b563'),
 (0.48997764699939383, '65db0bf20b1aa37831a6b547'),
 (0.4887021367953122, '65db0bfb0b1aa37831a6b57d'),
 (0.4848756061830673, '65db0bf90b1aa37831a6b573'),
 (0.4834434013605442, '65db0bd00b1aa37831a6b485'),
 (0.48232458577490406, '65db0bf80b1aa37831a6b56d'),
 (0.48147683397683405, '65db0bdf0b1aa37831a6b4db')]