In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import pycountry
from datetime import datetime

import tkinter as tk
from tkinter import messagebox

In [None]:
current_date = datetime.now()
current_date.date()

In [20]:
# Construct City_country dataframe

# Create a mapping function for country abbreviations
def get_country_abbreviation(country_name):
    try:
         # use pycountry to get the country abbreviation
        country =  pycountry.countries.lookup(country_name)
        return country.alpha_2
    except LookupError:
         # If not found, return NaN
            # If not found, return NaN and print the country name
        #print(f"Country not found: {country_name}")
        return np.nan

# Get city_country dataframe
def city_country_dataframe():
    # load city, country dataframe
    city_country_df = pd.read_csv("worldcities.csv")

    # only select four colums:
    city_country_df = city_country_df[["city","lat","lng","country"]]

    # renmae column name
    city_country_df.rename(columns={"lng":"lon"},inplace=True)


    #apply(get_country_abbreviation) will pass each country name to the get_country_abbreviation function, which will return the 2-letter country code (if available).
    city_country_df["country_abbriev"] = city_country_df["country"].apply(get_country_abbreviation)
    # USA, UK
    city_country_df["country_abbriev"] = city_country_df["country_abbriev"].replace("US","USA") 
    city_country_df["country_abbriev"] = city_country_df["country_abbriev"].replace("GB","UK")      

    city_country_df.to_csv("city_country.csv")
    return city_country_df

In [26]:

# Function to handle form submission
def submit(city_country_df):
    city = city_entry.get()
    country = country_entry.get()
    destination = f"({city}, {country})"
    
    try:
       # validate city 
        if city.lower() not in city_country_df["city"].str.lower().to_list():
            raise ValueError("City not found! Please input a valid city!")
        
        # validate country
        country_match = (country.lower() in city_country_df["country"].str.lower().to_list()) or (country.upper() in city_country_df["country_abbriev"].str.upper().to_list())
        if not country_match:
            raise ValueError("Country not found! Please input a valid country!")
        
        # validate the combination of city and country
        # Filter for the specific city and country
        fil_location = (
        (city_country_df["city"].str.lower() == city.lower()) & 
        (
            (city_country_df["country"].str.lower() == country.lower()) | 
            (city_country_df["country_abbriev"].str.upper() == country.upper())
         )
        )

        # Extract latitude and longitude as scalar values
        if not fil_location.any():  # Check if the city-country combination exists
            raise ValueError(f"No data found for the location: {city}, {country}!")
        else:
            location_data = city_country_df.loc[fil_location].iloc[0]
            lat = location_data["lat"]  # Extract value
            lon = location_data["lon"]  # Extract value
            city = location_data["city"]
            country = location_data["country"]
    
        
        # validate date
        current_date = datetime.now()
        check_in_date_str = check_in_entry.get()
        check_out_date_str = check_out_entry.get()

        check_in_date = datetime.strptime(check_in_date_str, "%Y-%m-%d")
        check_out_date = datetime.strptime(check_out_date_str, "%Y-%m-%d")
        
        # Validate check-in date
        if check_in_date.date() < current_date.date():
            raise ValueError("Check-in date cannot be earlier than today. Please try again!")

        # Validate dates
        if check_in_date.date() >= check_out_date.date():
            raise ValueError("Check-out date must be later than check-in date. Please try again!")
        
        # Number of adults and children
        adult_num = int(adults_entry.get())
        if adult_num <= 0:
            raise ValueError("Number of adults must be at least 1!")
    
        
        children_num = int(children_entry.get())
        if children_num < 0:
            raise ValueError("Number of adults must be non-negative!")
        
        room_num = int(room_entry.get())
        if room_num <= 0:
            raise ValueError("Number of adults must be at least 1!")
        
        # Show confirmation message
        messagebox.showinfo("Booking Details are as follows:\n", 
                            f"The valid city: {city}!\n"
                            f"The valid country: {country}!\n"
                            f"The destination: {destination} --> Latitude and longitude: ({lat}, {lon})\n"
                            f"The check-in date: {check_in_date.date()}\n"
                            f"The check-out date: {check_out_date.date()}\n"
                            f"The number of adults: {adult_num}\n"
                            f"The number of children: {children_num}\n"
                            f"The number of rooms is: {room_num}\n")
        global location,period,num
        location = (city,country,lat,lon)
        period = (check_in_date,check_out_date)
        num = (adult_num,room_num,children_num)

        # Automatically shut down the GUI
        root.destroy()  # Close the GUI window
    
    except ValueError as e:
        messagebox.showerror("Input Error", str(e))



#period = (check_in_date,check_out_date)
#num = (adult_num,room_num,children_num)

In [22]:
# now start to change the variable in url

def get_url(location,period,num):
    # room location
    city = location[0]
    country = location[1]
    
    # room time
    check_in_date = period[0]
    check_out_date = period[1]
 
    # room num
    adult_num = num[0]
    romm_num = num[1]
    children_num = num[2]

    url = f"https://www.booking.com/searchresults.en-gb.html?ss={city}%2C+Community+of+{city}%2C+{country}&efdco=1&label=gen173bo-1DCAEoggI46AdICVgDaMgBiAEBmAEJuAEXyAEM2AED6AEB-AEDiAIBmAICqAIDuAKNxYG6BsACAdICJDc4YzI3MmQ4LWM1NzMtNDE4MS04MTk0LTZlMTdkZWNjMWQwZdgCBOACAQ&sid=badcbe9c6b390c4f5668ded61093febe&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=index&dest_id=-390625&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=d4b24c866c100605&ac_meta=GhBkNGIyNGM4NjZjMTAwNjA1IAAoATICZW46BE1hZHJAAEoAUAA%3D&checkin={check_in_date}&checkout={check_out_date}&group_adults={adult_num}&no_rooms={romm_num}&group_children={children_num}&order=price"

    return url


def get_container(url,headers):
    # request to the website
    response = requests.get(url,headers=headers)
    html= response.content
    soup = BeautifulSoup(html)

    # choose container
    hotel_container = soup.select("div > div.c624d7469d.a0e60936ad.a3214e5942.b0db0e8ada")

    container = []

    # filter out ad
    for con in hotel_container:
        # Check for ad markers
        if con.find("span", class_="b30f8eb2d6", string="Ad"):
            print("Ad found, skipping...")
            continue
        container.append(con)

    return container


In [23]:
# loop container to extract hote name, price, rating, vote

# extract hote data
def selector_extract(item, selector):
    parse = item.select(selector)
    if parse:
        return parse[0].text
    else:
        return np.nan
    

def get_selector():
    selector = []
    
    name_selector = "a > div.f6431b446c.a15b38c233"
    selector.append(name_selector)

    price_selector = "div > span.f6431b446c.fbfd7c1165.e84eb96b1f"
    selector.append(price_selector)

    rating_selector = "div > div.ac4a7896c7"
    selector.append(rating_selector)

    vote_selector = "div > div.abf093bdfe.f45d8e4c32.d935416c47"
    selector.append(vote_selector)

    distance_selector = 'span > span[data-testid="distance"]'
    selector.append(distance_selector)

    return selector


def extract_hotel_web(container,selector,location):
    # selector 
    name_selector =  selector[0]
    price_selector =  selector[1]
    rating_selector =  selector[2]
    vote_selector =  selector[3]
    distance_selector =  selector[4]
   
    name_lst =[]
    price_lst = []
    rating_lst = []
    vote_lst = []
    distance_lst = []

    #loop container
    for con in container:
        #name
        name = selector_extract(con, name_selector)
        name_lst.append(name)

        #price
        price = selector_extract(con, price_selector)
        price_lst.append(price)

        #rating
        rating = selector_extract(con, rating_selector)
        vote = selector_extract(con, vote_selector)
        if (not "Scored" in rating) and (con.find("span", class_="b30f8eb2d6", string="New to Booking.com")):
            rating = np.nan
            vote = np.nan
        rating_lst.append(rating)
        vote_lst.append(vote)

        #distance
        distance = selector_extract(con, distance_selector)
        distance_lst.append(distance)

#    print(name_lst)
#    print(price_lst)
#    print(rating_lst)
#    print(vote_lst)
#    print(distance_lst)

    dic_hotel = {}

    city = location[0]
    country = location[1]

    dic_hotel = {"city":city,"country":country,"name":name_lst,"price":price_lst,"rating":rating_lst,"vote":vote_lst,"distance":distance_lst}

    return dic_hotel



In [27]:
# Main functon:


#-----------------------------------------------------------------------GUI--------------------------------------------
# Set up the main window
root = tk.Tk()
root.title("Travel Booking Form")
root.geometry("700x600")
root.config(bg="#f5f5f5")

# Title
title_label = tk.Label(root, text="Travel Booking Form", font=("Arial", 16, "bold"), bg="#f5f5f5")
title_label.pack(pady=10)

# City Entry
city_label = tk.Label(root, text="City", bg="#f5f5f5")
city_label.pack()
city_entry = tk.Entry(root)
city_entry.pack(pady=5)

# Country Entry
country_label = tk.Label(root, text="Country", bg="#f5f5f5")
country_label.pack()
country_entry = tk.Entry(root)
country_entry.pack(pady=5)

# Check-in Date Entry
check_in_label = tk.Label(root, text="Check-in Date (YYYY-MM-DD)", bg="#f5f5f5")
check_in_label.pack()
check_in_entry = tk.Entry(root)
check_in_entry.pack(pady=5)

# Check-out Date Entry
check_out_label = tk.Label(root, text="Check-out Date (YYYY-MM-DD)", bg="#f5f5f5")
check_out_label.pack()
check_out_entry = tk.Entry(root)
check_out_entry.pack(pady=5)

# Adults Entry
adults_label = tk.Label(root, text="Number of Adults", bg="#f5f5f5")
adults_label.pack()
adults_entry = tk.Entry(root)
adults_entry.pack(pady=5)

# Children Entry
children_label = tk.Label(root, text="Number of Children", bg="#f5f5f5")
children_label.pack()
children_entry = tk.Entry(root)
children_entry.pack(pady=5)

# Children Entry
room_label = tk.Label(root, text="Number of rooms", bg="#f5f5f5")
room_label.pack()
room_entry = tk.Entry(root)
room_entry.pack(pady=5)

city_country_df = city_country_dataframe()

# Declare global variables at the top (before the functions)
location = None
period = None
num = None


# Submit Button
submit_button = tk.Button(root, text="Submit", bg="#4CAF50", fg="white", command=lambda:submit(city_country_df))
submit_button.pack(pady=20)

# Run the GUI
root.mainloop()


# -----------------------------------------Extract hotel data-------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

selector = []
selector = get_selector()


url = get_url(location,period,num)
print(f"The url of booking website is: {url}\n")

container = get_container(url,headers)

dic_hotel = extract_hotel_web(container,selector,location)

df = pd.DataFrame(dic_hotel)

# Save the DataFrame to a CSV file
df.to_csv("hotel_data_country.csv", index=False)
print("The hotel dataframe is: \n")
df

The url of booking website is: https://www.booking.com/searchresults.en-gb.html?ss=Madrid%2C+Community+of+Madrid%2C+Spain&efdco=1&label=gen173bo-1DCAEoggI46AdICVgDaMgBiAEBmAEJuAEXyAEM2AED6AEB-AEDiAIBmAICqAIDuAKNxYG6BsACAdICJDc4YzI3MmQ4LWM1NzMtNDE4MS04MTk0LTZlMTdkZWNjMWQwZdgCBOACAQ&sid=badcbe9c6b390c4f5668ded61093febe&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=index&dest_id=-390625&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=d4b24c866c100605&ac_meta=GhBkNGIyNGM4NjZjMTAwNjA1IAAoATICZW46BE1hZHJAAEoAUAA%3D&checkin=2024-11-26 00:00:00&checkout=2024-12-12 00:00:00&group_adults=1&no_rooms=1&group_children=0&order=price

The hotel dataframe is: 



Unnamed: 0,city,country,name,price,rating,vote,distance
0,Madrid,Spain,Hostel Thirty One,"SEK 3,543",Scored 5.3,"2,188 reviews",3.9 km from centre
1,Madrid,Spain,Hostel SOFÍA,"SEK 3,681",Scored 4.8,601 reviews,6.1 km from centre
2,Madrid,Spain,Hostel sofía,"SEK 3,773",Scored 5.5,"2,504 reviews",6.1 km from centre
3,Madrid,Spain,Hostel OASIS,"SEK 3,819",Scored 6.4,131 reviews,4.3 km from centre
4,Madrid,Spain,Hostel Vida Verde,"SEK 4,613",Scored 5.6,201 reviews,7.7 km from centre
5,Madrid,Spain,Hostel Estrella 20,"SEK 4,808",Scored 7.8,196 reviews,0.7 km from centre
6,Madrid,Spain,Fit Hostel Madrid,"SEK 4,813",Scored 5.7,203 reviews,1.4 km from centre
7,Madrid,Spain,Hostel MYD La Latina,"SEK 4,913",Scored 6.2,"2,674 reviews",1.1 km from centre
8,Madrid,Spain,SabiaNatura - boutiqueMadrid,"SEK 5,227",Scored 7.9,"1,804 reviews",400 m from centre
9,Madrid,Spain,Arc House Pop Art,"SEK 5,234",Scored 6.3,546 reviews,1.1 km from centre


In [None]:
df.dropna(inplace=True)
df

In [None]:
# test code:

name = container[0].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[0].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[0].select("div > div.ac4a7896c7")[0].text
vote = container[0].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[0].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)


name = container[1].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[1].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[1].select("div > div.ac4a7896c7")[0].text
vote = container[1].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[1].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)


name = container[2].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[2].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[2].select("div > div.ac4a7896c7")[0].text
vote = container[2].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[2].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)