# Import libraries

In [1]:
import os
import random

import numpy as np
import pandas as pd

!pip install "pymongo[srv]"
from pymongo import MongoClient

# The connection string embeds the database username and password, so it must not
# live in the notebook source; it is read from the MONGODB_URI environment variable.
# NOTE(review): the credentials that used to be hardcoded here should be treated as
# leaked and rotated.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "(mongodb+srv://<user>:<password>@<host>/...) before running this notebook."
    )
client = MongoClient(connection_string)
db_name = 'AppData'

Collecting pymongo[srv]
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


# Define functions

In [7]:
def get_collection(collection_name: str) -> list[dict]:
    """
    Fetch every document of one collection as a list.

    The database is looked up on the module-level ``client`` under ``db_name``;
    the cursor returned by ``find()`` is fully materialised.
    """
    database = client[db_name]
    collection = database[collection_name]
    return list(collection.find())

def load_location_dict() -> dict:
    """
    Build the location lookup from the 'LOCATION' collection.

    Keys are the stringified ``_id`` of each document; each value is a dict with
    the location's 'district' (str), 'tag' (str) and 'price' (int).
    """
    return {
        str(document['_id']): {
            'district': str(document['LOC_DISTRICT']),
            'tag': str(document['LOC_TAG']),
            'price': int(document['LOC_PRICE']),
        }
        for document in get_collection('LOCATION')
    }

def load_tag_dict() -> dict:
    """
    Build the tag lookup from the 'TAG' collection.

    Maps the stringified ``_id`` of each tag document to its 'TAG_TYPE' as a string.
    """
    return {
        str(document['_id']): str(document['TAG_TYPE'])
        for document in get_collection('TAG')
    }

def load_avg_spending_time_dict() -> dict:
    """
    Build the average-spending-time lookup from the 'AVERAGE_SPENDING_TIME' collection.

    Maps each 'TYPE_NAME' (str) to its 'TYPE_AVGHOUR' converted to float.
    """
    return {
        str(document['TYPE_NAME']): float(document['TYPE_AVGHOUR'])
        for document in get_collection('AVERAGE_SPENDING_TIME')
    }

def load_plan_dict() -> dict:
    """
    Build the plan lookup from the 'PLAN' collection.

    Keys are the stringified ``_id`` of each plan document; each value is a dict
    of the plan's district, budget, start/end time and the four tag strings.
    """
    # (output key, document field, converter) in the order the output dict is built.
    # 'PLAN_ACTIVITES' is spelled exactly as the field is read from the collection.
    field_specs = (
        ('district', 'PLAN_DISTRICT', str),
        ('budget', 'PLAN_MAXBUDGET', int),
        ('start_time', 'PLAN_STARTTIME', float),
        ('end_time', 'PLAN_ENDTIME', float),
        ('cuisines', 'PLAN_CUISINES', str),
        ('mcourses', 'PLAN_MCOURSES', str),
        ('desserts', 'PLAN_DESSERTS', str),
        ('activities', 'PLAN_ACTIVITES', str),
    )
    plans = {}
    for document in get_collection('PLAN'):
        plans[str(document['_id'])] = {
            out_key: convert(document[field])
            for out_key, field, convert in field_specs
        }
    return plans

''''''
def checkin_time(hours: float, starttime: float, endtime: float) -> bool:
    """
    Tell whether ``hours`` lies inside the closed interval [starttime, endtime].
    """
    return starttime <= hours <= endtime

def checkin_timerange(starthours: float, endhours: float, starttime: float, endtime: float, eps:float=0.1) -> bool:
    """
    Check whether the range [starthours, endhours] overlaps the window (starttime, endtime).

    The range overlaps when it starts inside the window (at least ``eps`` before
    its end), ends inside the window (at least ``eps`` after its start), or
    completely spans the window. The spanning case used to be reported as
    "no overlap" (e.g. 10-14 against an 11-13 window).

    Returns True on overlap, False otherwise.
    """
    starts_inside = starttime <= starthours <= endtime - eps
    ends_inside = starttime + eps <= endhours <= endtime
    spans_window = starthours <= starttime and endhours >= endtime
    return starts_inside or ends_inside or spans_window

def checkin_lunch_time(time: float, start_lunch_time: int=11, end_lunch_time: int=13) -> bool:
    """
    Tell whether ``time`` falls within the lunch window (11h - 13h unless overridden).
    """
    return checkin_time(hours=time, starttime=start_lunch_time, endtime=end_lunch_time)

def checkin_dinner_time(time: float, start_dinner_time: int=17, end_dinner_time: int=19) -> bool:
    """
    Tell whether ``time`` falls within the dinner window (17h - 19h unless overridden).
    """
    return checkin_time(hours=time, starttime=start_dinner_time, endtime=end_dinner_time)

def split_string(string, length=7) -> list[str]:
    """
    Split a long string into consecutive chunks of ``length`` characters.

    The last chunk is shorter when the string length is not a multiple of ``length``.
    """
    chunks = []
    for offset in range(0, len(string), length):
        chunks.append(string[offset:offset + length])
    return chunks

''''''
def create_location_dataset() -> dict:
    """
    Join the location, tag and average-spending-time dictionaries into one dataset.

    Each location keeps its district, tag and price, and gains the 'type' its tag
    maps to plus the 'avg_spending_time' recorded for that type.
    """
    locations = load_location_dict()
    type_by_tag = load_tag_dict()
    avg_hours_by_type = load_avg_spending_time_dict()

    dataset = {}
    for loc_id, info in locations.items():
        loc_type = type_by_tag[info['tag']]
        dataset[loc_id] = {
            'district': info['district'],
            'tag': info['tag'],
            'price': info['price'],
            'type': loc_type,
            'avg_spending_time': avg_hours_by_type[loc_type],
        }
    return dataset

def create_plan_dataset(location_dataset: dict, avg_time_per_location: float=1.5) -> dict:
    """
    Create plan dataset combined by plan and location dictionaries.

    For every stored plan this derives whether it touches lunch/dinner time, how
    many non-restaurant stops fit, and which locations of the plan's district are
    candidate restaurants or candidate "other" stops.

    Parameters:
        location_dataset: dataset produced by create_location_dataset().
        avg_time_per_location: hours assumed per stop when estimating how many
            locations fit between the plan's start and end time.

    Returns:
        dict keyed by plan id with 'budget', 'start_time', 'end_time',
        'lunch_time', 'dinner_time', 'max_others_num', 'restaurants', 'others'.
    """
    plan_dict = load_plan_dict()
    plan_dataset = {}
    for plan_id, plan_info in plan_dict.items():
        start_time, end_time = plan_info['start_time'], plan_info['end_time']

        # NOTE(review): only the two endpoints are tested, so a plan that starts
        # before and ends after a meal window is not flagged for that meal - confirm
        # whether that is intended.
        lunch_time = checkin_lunch_time(start_time) or checkin_lunch_time(end_time)
        dinner_time = checkin_dinner_time(start_time) or checkin_dinner_time(end_time)

        max_locs_num = int((end_time - start_time) / avg_time_per_location)
        restaurants_num = lunch_time + dinner_time  # two bools add up to 0, 1 or 2

        district_locations = [loc for loc, info in location_dataset.items() if info['district'] == plan_info['district']]

        # The tag fields are concatenations of fixed-width tag ids.
        cuisines = split_string(plan_info['cuisines'], len('CUIS-01'))
        mcourses = split_string(plan_info['mcourses'], len('MCOU-01'))
        desserts = split_string(plan_info['desserts'], len('DEDR-01'))
        activities = split_string(plan_info['activities'], len('ACTI-01'))
        meal_tags = set(cuisines) | set(mcourses)
        other_tags = set(desserts) | set(activities)

        restaurants = [loc for loc in district_locations if location_dataset[loc]['tag'] in meal_tags]
        # pick more other restaurants if not enough, skipping the ones already listed
        # so that no location can be sampled twice into the same plan
        if len(restaurants) < restaurants_num:
            already_listed = set(restaurants)
            for loc in district_locations:
                if location_dataset[loc]['type'] == 'Restaurant' and loc not in already_listed:
                    restaurants.append(loc)
                    already_listed.add(loc)

        others = [loc for loc in district_locations if location_dataset[loc]['tag'] in other_tags]

        plan_dataset[plan_id] = {
            'budget': plan_info['budget'],
            'start_time': start_time,
            'end_time': end_time,
            'lunch_time': lunch_time,
            'dinner_time': dinner_time,
            # Short plans can have fewer slots than meals; never report a negative
            # count, which would make the later random sampling raise.
            'max_others_num': max(0, max_locs_num - restaurants_num),
            'restaurants': restaurants,
            'others': others,
        }
    return plan_dataset

''''''
def calculate_budget_distance(plan_locs: list[str], input_budget: int, location_dataset: dict, prob_threshold: float=0.2) -> float:
    """
    Calculate budget distance due to input budget and list of location ids of a plan.

    The distance is the relative deviation |budget - total price| / budget, where
    the total price sums the 'price' of every id found in ``location_dataset``
    (unknown ids are ignored).

    Returns:
        The relative deviation when it is within ``prob_threshold``; -1 when it
        exceeds the threshold or when ``input_budget`` is not positive (the
        ratio is undefined for a zero budget).
    """
    if input_budget <= 0:
        return -1

    total_price = sum(location_dataset[loc]['price'] for loc in plan_locs if loc in location_dataset)
    prob_budget_distance = abs(input_budget - total_price) / input_budget

    # assign -1 if over the threshold
    return prob_budget_distance if prob_budget_distance <= prob_threshold else -1

def calculate_time_distance(plan_locs: list[str], input_hours_sum: float, location_dataset: dict, threshold: float=0) -> float:
    """
    Calculate time distance due to input hours sum and list of location ids of a plan.

    The spare time is ``input_hours_sum`` minus the summed 'avg_spending_time' of
    every id found in ``location_dataset`` (unknown ids are ignored).

    Returns:
        spare time / ``input_hours_sum`` when the spare time is at least
        ``threshold``; -1 when it is below the threshold or when
        ``input_hours_sum`` is not positive (the ratio is undefined then).
    """
    if input_hours_sum <= 0:
        return -1

    total_hours = sum(location_dataset[loc]['avg_spending_time'] for loc in plan_locs if loc in location_dataset)
    time_distance = input_hours_sum - total_hours

    # assign -1 if over the threshold
    return time_distance / input_hours_sum if time_distance >= threshold else -1

def calculate_overall_distance(plan_locs: list[str], input_budget: int, input_hours_sum: float, location_dataset: dict, budget_time_ratio: float=0.5, budget_prob_threshold: float=0.2, time_threshold: float=0) -> float:
    """
    Blend the budget distance and the time distance of a plan into one score.

    Returns -1 as soon as either partial distance is -1; otherwise the weighted
    sum with weight ``budget_time_ratio`` on the budget part and
    ``1 - budget_time_ratio`` on the time part.
    """
    budget_part = calculate_budget_distance(plan_locs, input_budget, location_dataset, budget_prob_threshold)
    time_part = calculate_time_distance(plan_locs, input_hours_sum, location_dataset, time_threshold)

    infeasible = budget_part == -1 or time_part == -1
    if infeasible:
        return -1

    time_weight = 1 - budget_time_ratio
    return budget_part * budget_time_ratio + time_part * time_weight

''''''
def pick_randoms(lst: list[str], elements_num: int) -> list[str]:
    """
    Randomly pick some unique elements in a string list (id list).

    ``elements_num`` is clamped to the range [0, len(lst)], so asking for more
    elements than available returns all of them (in random order) and a negative
    request returns an empty list instead of raising ValueError.
    """
    sample_size = max(0, min(elements_num, len(lst)))
    return random.sample(lst, sample_size)

def softmax(scores_lst: np.ndarray, lamda: float=-1.0) -> np.ndarray:
    """
    Compute softmax values for each score in a list with lamda.
    x -> softmax(x) = exp(lamda * x) / sum(exp(lamda * x))

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps exp() from overflowing
    or underflowing to 0/0 (NaN) for large-magnitude inputs.

    Returns an array of probabilities summing to 1; an empty input yields an
    empty array.
    """
    scaled = lamda * np.asarray(scores_lst, dtype=float)
    if scaled.size == 0:
        return scaled

    exp_shifted = np.exp(scaled - np.max(scaled))
    return exp_shifted / np.sum(exp_shifted)

def generate_plan_pool(plan_id: str, plan_dataset: dict, location_dataset: dict, max_pool_size: int=10, budget_time_ratio: float=0.5, budget_prob_threshold: float=0.2, time_threshold: int=0) -> list[tuple[list, np.ndarray]]:
    """
    Generate plan pool.

    Makes ``max_pool_size`` attempts. Each attempt samples restaurants and
    "other" locations for the plan and scores the combination with
    calculate_overall_distance(). When the score is negative, progressively
    smaller subsets of the sampled "others" are retried, and finally the
    restaurants alone. Attempts that end with a non-negative distance are kept.

    Returns:
        None when ``plan_id`` is not in ``plan_dataset``; otherwise a list of
        (location ids, probability) pairs, where the probabilities are the
        softmax of the kept distances. The list is empty when no attempt was kept.
    """
    if plan_id not in plan_dataset:
        return None
    plan_info = plan_dataset[plan_id]

    # Values that do not change between attempts.
    restaurants_num = plan_info['lunch_time'] + plan_info['dinner_time']
    num_restaurants_to_pick = min(restaurants_num, len(plan_info['restaurants']))
    # A negative count would make the random sampling raise; treat it as "no others".
    max_others_num = max(0, plan_info['max_others_num'])
    max_iterations = max_others_num
    hours_sum = plan_info['end_time'] - plan_info['start_time']

    def distance_of(locs: list) -> float:
        # Score one candidate list of location ids against this plan's budget and duration.
        return calculate_overall_distance(locs, plan_info['budget'], hours_sum, location_dataset, budget_time_ratio, budget_prob_threshold, time_threshold)

    plan_locs_lst = []
    overall_distance_lst = []
    for _attempt in range(max_pool_size):
        picked_restaurants = pick_randoms(plan_info['restaurants'], num_restaurants_to_pick)
        others = pick_randoms(plan_info['others'], min(max_others_num, len(plan_info['others'])))

        plan_locs = picked_restaurants + others
        overall_distance = distance_of(plan_locs)

        if overall_distance < 0:
            # Retry with fewer "others", drawn from the ones sampled for this attempt.
            for others_num in range(max_others_num - 1, 0, -1):
                for _retry in range(max_iterations):
                    picked_others = pick_randoms(others, min(others_num, len(others)))

                    plan_locs = picked_restaurants + picked_others
                    overall_distance = distance_of(plan_locs)

                    if overall_distance >= 0:
                        break
                if overall_distance >= 0:
                    break
            # Last resort: the restaurants on their own.
            if overall_distance < 0:
                plan_locs = picked_restaurants
                overall_distance = distance_of(plan_locs)

        if overall_distance >= 0:
            plan_locs_lst.append(plan_locs)
            overall_distance_lst.append(overall_distance)

    if not plan_locs_lst:
        return []

    # Calculate probability of each plan by softmax function
    plan_probabilities = softmax(np.array(overall_distance_lst))

    # Create plan pool contains pairs (locations, probability) of a picked plan
    return list(zip(plan_locs_lst, plan_probabilities))

''''''
def weighted_randomly_pick(elements_with_probabilities: list):
    """
    Draw one element from a list of (element, probability) pairs.

    A uniform random number is compared against the running sum of the
    probabilities; the first element whose cumulative probability reaches it is
    returned. If none does, the last element is returned as a fallback.
    """
    draw = random.random()
    running_total = 0
    for candidate, weight in elements_with_probabilities:
        running_total += weight
        if running_total >= draw:
            break
    else:
        # Cumulative probability never reached the draw: fall back to the last element.
        candidate = elements_with_probabilities[-1][0]
    return candidate

''''''
def fill_zeros_str(number_str: str, n_digits: int) -> str:
    """
    Left-pad a string with zeros up to n_digits characters (e.g. "9", n_digits = 2 -> "09").

    Strings that are already n_digits long or longer are returned unchanged.
    """
    return number_str.rjust(n_digits, '0')

def convert_float_hours_to_str(float_hours: float) -> str:
    """
    Convert float hours (e.g. 17.5) to str hours (e.g. "17:30").

    The value is rounded to the nearest whole minute before being split into
    hours and minutes. Truncating the fractional part instead loses a minute to
    floating-point error (9.2 used to become "09:11").

    Returns the time as zero-padded "HH:MM".
    """
    MINS_PER_HOUR = 60
    total_minutes = int(round(float(float_hours) * MINS_PER_HOUR))
    hours, minutes = divmod(total_minutes, MINS_PER_HOUR)
    return f"{hours:02d}:{minutes:02d}"

def convert_str_time_to_float(str_time: str) -> float:
    """
    Turn an "HH:MM" string (e.g. "17:30") into fractional hours (e.g. 17.5).

    Only the first two ':'-separated fields are used.
    """
    MINS_PER_HOUR = 60
    fields = str_time.split(':')
    hours_text = fields[0]
    minutes_text = fields[1]
    return int(hours_text) + int(minutes_text) / MINS_PER_HOUR

''''''
def generate_timepoints(starttime: float, endtime: float, get_end: bool=False, step: float=0.5) -> list[float]:
    """
    List the timepoints from starttime towards endtime, spaced ``step`` hours apart.

    The number of points is the truncated quotient (endtime - starttime) / step,
    plus one extra point when ``get_end`` is true.
    """
    count = int((endtime - starttime) / step) + get_end
    timepoints = []
    for index in range(count):
        timepoints.append(starttime + index * step)
    return timepoints

def mark_timeline(loc: str, time: float, timeline: dict) -> dict:
    """
    Write a location (id) into the timeline slot of one timepoint.

    Returns the same timeline (modified in place) when the slot was free, or
    None when the slot already holds a location.
    """
    slot = convert_float_hours_to_str(time)
    if timeline[slot] is None:
        timeline[slot] = loc
        return timeline
    return None

def fill_timeline(loc: str, fromtime: float, timeline: dict, location_dataset: dict) -> dict:
    """
    Fill a location (id) in a timeline.

    The location occupies every half-hour slot from ``fromtime`` for the duration
    of its 'avg_spending_time'. The given timeline is never modified: slots are
    marked on a copy, so a failed attempt leaves no partial marks behind for the
    caller's next attempt.

    Returns:
        A new timeline dict with the location filled in, or None when the
        timeline is empty, the visit would run past the end of the timeline, or
        any required slot is missing or already occupied.
    """
    if not timeline:
        return None

    hours_step = 0.5  # width of one timeline slot, in hours
    totime = fromtime + location_dataset[loc]['avg_spending_time']
    latest_time = list(timeline.keys())[-1]

    # The last slot starts at latest_time and lasts one step.
    if totime > convert_str_time_to_float(latest_time) + hours_step:
        return None

    filled = dict(timeline)
    for timepoint in generate_timepoints(fromtime, totime, get_end=False, step=hours_step):
        slot = convert_float_hours_to_str(timepoint)
        # A slot that is off the timeline's grid, or already taken, makes the placement impossible.
        if slot not in filled or filled[slot] is not None:
            return None
        filled[slot] = loc
    return filled

''''''
def pick_a_loc(locs: list, location_dataset: dict, loctype: str=None) -> str:
    """
    Pick a location (id) in a list of locations.

    The candidates are visited in a random order and the first one whose 'type'
    equals ``loctype`` is returned (any location when ``loctype`` is None). The
    caller's list is left untouched: the random order is built on a copy.

    Returns the picked location id, or None when ``locs`` is empty or holds no
    location of the requested type.
    """
    if len(locs) == 0:
        return None

    # random.sample with the full length yields a shuffled copy.
    shuffled = random.sample(locs, len(locs))
    if loctype is None:
        return shuffled[0]

    for loc in shuffled:
        if location_dataset[loc]['type'] == loctype:
            return loc
    return None

def limit_timerange(starttime: float, endtime: float, timerange: list[float]) -> list[float]:
    """
    Limit timerange to [starttime, endtime].

    Timepoints outside the closed interval are dropped. The list is updated in
    place and also returned. The kept values are computed first and assigned in
    one step, because removing items while iterating the same list skips the
    element after each removal and lets out-of-range values survive.
    """
    timerange[:] = [time for time in timerange if starttime <= time <= endtime]
    return timerange

''''''
def schedule_plan(plan_locs: list[str], timeline: dict, is_lunch_time: bool, is_dinner_time: bool, starttime: float, endtime: float, location_dataset: dict) -> dict:
    """
    Schedule a plan in a timeline.

    Recursive backtracking placement. While a meal is still pending, one
    restaurant is picked and tried at each start time of the lunch window
    (11h-13h) and/or dinner window (17h-19h). The remaining locations are then
    placed one at a time, each tried at every free slot.

    Neither ``plan_locs`` nor ``timeline`` is modified: the function works on a
    private copy of the location list, and every placement attempt gets its own
    copy of the timeline, so a failed branch cannot leak into the next attempt.

    Returns the filled timeline, or None when no placement was found.
    """
    # The list is shuffled and consumed below; keep the caller's list intact.
    plan_locs = list(plan_locs)

    if is_lunch_time or is_dinner_time:
        # Pick a restaurant
        restaurant = pick_a_loc(plan_locs, location_dataset, 'Restaurant')
        if restaurant is None:
            return None

        tmp_plan_locs = plan_locs.copy()
        tmp_plan_locs.remove(restaurant)

        # Latest start time that still lets the restaurant visit end by endtime.
        latest_start = endtime - location_dataset[restaurant]['avg_spending_time']

        if is_lunch_time:
            lunch_timerange = limit_timerange(starttime, latest_start, generate_timepoints(starttime=11, endtime=13, get_end=True))
            for lunch_time in lunch_timerange:
                # fill_timeline may mark slots in place before failing, so give it a scratch copy.
                tmp_timeline = fill_timeline(restaurant, lunch_time, timeline.copy(), location_dataset)
                if tmp_timeline is not None:
                    tmp_timeline = schedule_plan(tmp_plan_locs, tmp_timeline, False, is_dinner_time, starttime, endtime, location_dataset)
                    if tmp_timeline is not None:
                        return tmp_timeline

        if is_dinner_time:
            dinner_timerange = limit_timerange(starttime, latest_start, generate_timepoints(starttime=17, endtime=19, get_end=True))
            for dinner_time in dinner_timerange:
                tmp_timeline = fill_timeline(restaurant, dinner_time, timeline.copy(), location_dataset)
                if tmp_timeline is not None:
                    tmp_timeline = schedule_plan(tmp_plan_locs, tmp_timeline, is_lunch_time, False, starttime, endtime, location_dataset)
                    if tmp_timeline is not None:
                        return tmp_timeline

        # NOTE(review): when no meal slot works, control falls through and the
        # locations are placed at any free time with the meal flags unchanged -
        # confirm this fallback is intended.

    # Pick a location
    location = pick_a_loc(plan_locs, location_dataset, None)
    if location is None:
        # Nothing left to place: the timeline is complete.
        return timeline

    plan_locs.remove(location)

    free_timerange = [convert_str_time_to_float(time) for time in timeline.keys() if timeline[time] is None]
    for free_time in free_timerange:
        tmp_timeline = fill_timeline(location, free_time, timeline.copy(), location_dataset)
        if tmp_timeline is not None:
            tmp_timeline = schedule_plan(plan_locs, tmp_timeline, is_lunch_time, is_dinner_time, starttime, endtime, location_dataset)
            if tmp_timeline is not None:
                return tmp_timeline
    return None

def remove_duplicate_locs(timeline: dict) -> dict:
    """
    Keep only the first timepoint of each distinct 'loc' value in a timeline.

    The order of the surviving timepoints follows the original timeline.
    """
    first_time_by_loc = {}
    for time, loc in timeline.items():
        # setdefault only records the time the first time a loc is met.
        first_time_by_loc.setdefault(loc, time)
    return {time: loc for loc, time in first_time_by_loc.items()}

def generate_plan_detail(plan_id: str, plan_pool: list[tuple[list, np.ndarray]], plan_dataset: dict, location_dataset: dict) -> list[dict]:
    """
    Generate a plan detail.

    Picks one candidate list of locations from the pool (weighted by its
    probability), schedules it on a half-hour timeline spanning the plan's start
    and end time, and reports the first timepoint of every scheduled location.

    Returns:
        A list of {'DETAIL_ID', 'DETAIL_TIME', 'DETAIL_LOC'} dicts in timeline
        order. The list is empty when the pool is empty or when no valid
        schedule could be found.
    """
    if not plan_pool:
        return []

    # Step 1: Pick weighted random a list of locations in plan pool
    # (copied, so scheduling can never alter the pool entry and re-generation stays possible)
    plan_locs = list(weighted_randomly_pick(plan_pool))

    # Step 2: Fill locations into timeline
    plan_info = plan_dataset[plan_id]
    starttime, endtime = plan_info['start_time'], plan_info['end_time']
    timeline = {convert_float_hours_to_str(time): None for time in generate_timepoints(starttime, endtime)}

    timeline = schedule_plan(plan_locs, timeline, plan_info['lunch_time'], plan_info['dinner_time'], starttime, endtime, location_dataset)
    if timeline is None:
        # The scheduler found no way to place every location.
        return []

    # Step 3: Convert into list[dict] plan detail
    return [
        {
            'DETAIL_ID': plan_id,
            'DETAIL_TIME': time,
            'DETAIL_LOC': loc
        }
        for time, loc in remove_duplicate_locs(timeline).items()
        if loc is not None
    ]

# Parameters

In [3]:
# Hours assumed per stop when estimating how many locations fit into a plan.
avg_time_per_location = 1.5

# Number of candidate plans attempted when building a plan pool.
max_pool_size = 10
# Weight of the budget distance in the overall distance (the time distance gets 1 - ratio).
budget_time_ratio = 0.5
# Largest accepted relative deviation between the plan budget and the total price.
budget_prob_threshold = 0.2
# Smallest accepted spare time (hours) between the plan duration and the summed visit times.
time_threshold = 0

# Load dataset

In [4]:
# Build the location dataset (locations joined with their tag type and average
# spending time) and print it in full.
location_dataset = create_location_dataset()
print(location_dataset)

# Build the plan dataset on top of the location dataset, using the configured
# average time per location, and print it in full.
plan_dataset = create_plan_dataset(location_dataset, avg_time_per_location)
print(plan_dataset)

{'BCH-001': {'district': 'Binh Chanh', 'tag': 'ACTI-05', 'price': 200, 'type': 'Sport', 'avg_spending_time': 3.0}, 'BTH-001': {'district': 'Binh Thanh', 'tag': 'DEDR-02', 'price': 400, 'type': 'Bar', 'avg_spending_time': 2.0}, 'BTH-013': {'district': 'Binh Thanh', 'tag': 'ACTI-07', 'price': 0, 'type': 'Religion', 'avg_spending_time': 1.5}, 'BTH-014': {'district': 'Binh Thanh', 'tag': 'ACTI-08', 'price': 0, 'type': 'Library', 'avg_spending_time': 1.0}, 'CGI-003': {'district': 'Can Gio', 'tag': 'DEDR-01', 'price': 400, 'type': 'Cafe', 'avg_spending_time': 1.0}, 'CGI-013': {'district': 'Can Gio', 'tag': 'ACTI-03', 'price': 200, 'type': 'Outdoor', 'avg_spending_time': 2.0}, 'CGI-018': {'district': 'Can Gio', 'tag': 'ACTI-09', 'price': 400, 'type': 'Shopping', 'avg_spending_time': 1.0}, 'CGI-028': {'district': 'Can Gio', 'tag': 'MCOU-12', 'price': 400, 'type': 'Restaurant', 'avg_spending_time': 1.5}, 'CCH-009': {'district': 'Cu Chi', 'tag': 'CUIS-06', 'price': 400, 'type': 'Restaurant', 'av

# Generate plan pool

In [5]:
# Use the first plan of the dataset as the example plan.
plan_id = list(plan_dataset.keys())[0]
print(plan_id)
# Build the pool of candidate (locations, probability) pairs for that plan.
plan_pool = generate_plan_pool(plan_id, plan_dataset, location_dataset, max_pool_size, budget_time_ratio, budget_prob_threshold, time_threshold)
print(plan_pool)

241112-001030
[(['D01-102', 'D01-027'], 0.10857360321578116), (['D01-108', 'D01-025'], 0.11999238874476595), (['D01-136', 'D01-009'], 0.10857360321578116), (['D01-193', 'D01-015'], 0.10857360321578116), (['D01-189', 'D01-009'], 0.10857360321578116), (['D01-186', 'D01-017'], 0.10857360321578116), (['D01-182', 'D01-037'], 0.10857360321578116), (['D01-124', 'D01-006'], 0.11999238874476595), (['D01-122', 'D01-044'], 0.10857360321578116)]


# Generate/Re-generate plan detail

In [8]:
# Draw one candidate from the pool and schedule it; re-running this cell draws
# again, since both the pick and the scheduling use the random module.
plan_detail = generate_plan_detail(plan_id, plan_pool, plan_dataset, location_dataset)
print(plan_detail)

[{'DETAIL_ID': '241112-001030', 'DETAIL_TIME': '17:30', 'DETAIL_LOC': 'D01-102'}, {'DETAIL_ID': '241112-001030', 'DETAIL_TIME': '19:00', 'DETAIL_LOC': 'D01-027'}]
