In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import pycountry
from datetime import datetime

In [None]:
current_date = datetime.now()
current_date.date()

In [None]:
# Construct City_country dataframe

# Create a mapping function for country abbreviations
def get_country_abbreviation(country_name):
    try:
         # use pycountry to get the country abbreviation
        country =  pycountry.countries.lookup(country_name)
        return country.alpha_2
    except LookupError:
         # If not found, return NaN
            # If not found, return NaN and print the country name
        #print(f"Country not found: {country_name}")
        return np.nan

# Get city_country dataframe
def city_country_dataframe():
    # load city, country dataframe
    city_country_df = pd.read_csv("worldcities.csv")

    # only select four colums:
    city_country_df = city_country_df[["city","lat","lng","country"]]

    # renmae column name
    city_country_df.rename(columns={"lng":"lon"},inplace=True)


    #apply(get_country_abbreviation) will pass each country name to the get_country_abbreviation function, which will return the 2-letter country code (if available).
    city_country_df["country_abbriev"] = city_country_df["country"].apply(get_country_abbreviation)
    # USA, UK
    city_country_df["country_abbriev"] = city_country_df["country_abbriev"].replace("US","USA") 
    city_country_df["country_abbriev"] = city_country_df["country_abbriev"].replace("GB","UK")      

    city_country_df.to_csv("city_country.csv")
    return city_country_df


# Input period and numbers of room
def input_check_room():
   
    current_date = datetime.now()

    # Get a valid check-in date
    while True:
        try:
            check_in_date = input("Please input the check-in date (YYYY-MM-DD): ")
            # used datetime.strptime to parse and validate check-in input
            check_in_date_obj = datetime.strptime(check_in_date,"%Y-%m-%d")
            # judge if check_in_date is latter than today
            # converted input strings to datetime objects for proper comparison
            if check_in_date_obj.date() < current_date.date():
                print("Check-in date cannot be earlier than today. Please try again")
            else:
                print(f"The check-in date is: {check_in_date_obj.date()}")
                break
        # judege if the check-in date is valid format or not
        except ValueError:
            print("Invalid date format. Please enter YYYY-MM-DD!")


    # Get a valid check-out date
    while True:
        try:
            check_out_date= input("Please input the check-out date (YYYY-MM-DD):")
            check_out_date_obj = datetime.strptime(check_out_date,"%Y-%m-%d")

            if (check_out_date_obj.date() <= check_in_date_obj.date()):
                print("Check-out date must be later than the check-in date! Please input check-out date again!")
            else:
                print(f"The check-out date is: {check_out_date_obj.date()}")
                break
        # judege if the check-in date is valid format or not
        except ValueError:
            print("Invalid date format. Please enter YYYY-MM-DD1")    

    
    # Get valid adult_num
    while True:
        try:
            adult_num = int(input("How many adults? "))
            if adult_num <= 0:
                print("Number of adults must be at least 1!")
            else:
                print(f"The number of adults is: {adult_num}")
                break
        # judege if the check-in date is valid format or not
        except ValueError:
            print("Invalid adult number. Please enter a positive integer1") 


    # Get valid room_num
    while True:
        try:
            room_num = int(input("How many rooms? "))
            if room_num <= 0:
                print("Number of rooms must be at least 1!")
            else:
                print(f"The number of room is: {room_num}")
                break
        # judege if the check-in date is valid format or not
        except ValueError:
            print("Invalid adult number. Please enter a positive integer!") 


    # Get valid children_num
    while True:
        try:
            children_num = int(input("How many chilrdern? "))
            if children_num < 0:
                print("Number of children cannot be negative!")
            else:
                print(f"The number of children is: {children_num}")
                break
                # judege if the check-in date is valid format or not
        except ValueError:
            print("Invalid children number. Please enter a non-negative integer!") 


    period = (check_in_date,check_out_date)
    num = (adult_num,room_num,children_num)

    return period,num

In [25]:
# now start to change the variable in url

def get_url(location,period,num):
    # room location
    city = location[0]
    country = location[1]
    
    # room time
    check_in_date = period[0]
    check_out_date = period[1]
 
    # room num
    adult_num = num[0]
    romm_num = num[1]
    children_num = num[2]

    url = f"https://www.booking.com/searchresults.en-gb.html?ss={city}%2C+Community+of+{city}%2C+{country}&efdco=1&label=gen173bo-1DCAEoggI46AdICVgDaMgBiAEBmAEJuAEXyAEM2AED6AEB-AEDiAIBmAICqAIDuAKNxYG6BsACAdICJDc4YzI3MmQ4LWM1NzMtNDE4MS04MTk0LTZlMTdkZWNjMWQwZdgCBOACAQ&sid=badcbe9c6b390c4f5668ded61093febe&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=index&dest_id=-390625&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=d4b24c866c100605&ac_meta=GhBkNGIyNGM4NjZjMTAwNjA1IAAoATICZW46BE1hZHJAAEoAUAA%3D&checkin={check_in_date}&checkout={check_out_date}&group_adults={adult_num}&no_rooms={romm_num}&group_children={children_num}&order=price"

    return url


def get_container(url,headers):
    # request to the website
    response = requests.get(url,headers=headers)
    html= response.content
    soup = BeautifulSoup(html)

    # choose container
    hotel_container = soup.select("div > div.c624d7469d.a0e60936ad.a3214e5942.b0db0e8ada")

    container = []

    # filter out ad
    for con in hotel_container:
        # Check for ad markers
        if con.find("span", class_="b30f8eb2d6", string="Ad"):
            print("Ad found, skipping...")
            continue
        container.append(con)

    return container


In [26]:
# loop container to extract hote name, price, rating, vote

# extract hote data
def selector_extract(item, selector):
    parse = item.select(selector)
    if parse:
        return parse[0].text
    else:
        return np.nan
    

def get_selector():
    selector = []
    
    name_selector = "a > div.f6431b446c.a15b38c233"
    selector.append(name_selector)

    price_selector = "div > span.f6431b446c.fbfd7c1165.e84eb96b1f"
    selector.append(price_selector)

    rating_selector = "div > div.ac4a7896c7"
    selector.append(rating_selector)

    vote_selector = "div > div.abf093bdfe.f45d8e4c32.d935416c47"
    selector.append(vote_selector)

    distance_selector = 'span > span[data-testid="distance"]'
    selector.append(distance_selector)

    return selector


def extract_hotel_web(container,selector,location):
    # selector 
    name_selector =  selector[0]
    price_selector =  selector[1]
    rating_selector =  selector[2]
    vote_selector =  selector[3]
    distance_selector =  selector[4]
   
    name_lst =[]
    price_lst = []
    rating_lst = []
    vote_lst = []
    distance_lst = []

    #loop container
    for con in container:
        #name
        name = selector_extract(con, name_selector)
        name_lst.append(name)

        #price
        price = selector_extract(con, price_selector)
        price_lst.append(price)

        #rating
        rating = selector_extract(con, rating_selector)
        vote = selector_extract(con, vote_selector)
        if (not "Scored" in rating) and (con.find("span", class_="b30f8eb2d6", string="New to Booking.com")):
            rating = np.nan
            vote = np.nan
        rating_lst.append(rating)
        vote_lst.append(vote)

        #distance
        distance = selector_extract(con, distance_selector)
        distance_lst.append(distance)

#    print(name_lst)
#    print(price_lst)
#    print(rating_lst)
#    print(vote_lst)
#    print(distance_lst)

    dic_hotel = {}

    city = location[0]
    country = location[1]

    dic_hotel = {"city":city,"country":country,"name":name_lst,"price":price_lst,"rating":rating_lst,"vote":vote_lst,"distance":distance_lst}

    return dic_hotel



In [27]:
# Main functon:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

selector = []
selector = get_selector()

# load city, country dataframe

# get city_country_df
city_country_df = city_country_dataframe()
# input city, country
location = input_city_country(city_country_df)
period,num = input_check_room()

url = get_url(location,period,num)
print(f"The url of booking website is: {url}\n")

container = get_container(url,headers)

dic_hotel = extract_hotel_web(container,selector,location)

df = pd.DataFrame(dic_hotel)

# Save the DataFrame to a CSV file
df.to_csv("hotel_data_country.csv", index=False)
print("The hotel dataframe is: \n")
df

City not found! Please input a valid city!
The valid city is: Madrid!
Country not found! Please input a valid country!
Country not found! Please input a valid country!
The valid city is: Spain!
The destination is: (Madrid, Spain) --> Latitude and longitude: (40.4169,-3.7033)
The check-in date is: 2024-11-26
The check-out date is: 2024-12-12
The number of adults is: 1
The number of room is: 1
The number of children is: 0
The url of booking website is: https://www.booking.com/searchresults.en-gb.html?ss=Madrid%2C+Community+of+Madrid%2C+Spain&efdco=1&label=gen173bo-1DCAEoggI46AdICVgDaMgBiAEBmAEJuAEXyAEM2AED6AEB-AEDiAIBmAICqAIDuAKNxYG6BsACAdICJDc4YzI3MmQ4LWM1NzMtNDE4MS04MTk0LTZlMTdkZWNjMWQwZdgCBOACAQ&sid=badcbe9c6b390c4f5668ded61093febe&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=index&dest_id=-390625&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=d4b24c866c100605&ac_meta=GhBkNGIyNGM4NjZjMTAwNjA1IAAoATICZW46BE1

Unnamed: 0,city,country,name,price,rating,vote,distance
0,Madrid,Spain,Hostel Thirty One,"SEK 3,543",Scored 5.3,"2,188 reviews",3.9 km from centre
1,Madrid,Spain,Hostel SOFÍA,"SEK 3,681",Scored 4.8,601 reviews,6.1 km from centre
2,Madrid,Spain,Hostel sofía,"SEK 3,773",Scored 5.5,"2,504 reviews",6.1 km from centre
3,Madrid,Spain,Hostel OASIS,"SEK 3,819",Scored 6.4,131 reviews,4.3 km from centre
4,Madrid,Spain,Hostel Vida Verde,"SEK 4,613",Scored 5.6,201 reviews,7.7 km from centre
5,Madrid,Spain,Hostel Estrella 20,"SEK 4,808",Scored 7.8,196 reviews,0.7 km from centre
6,Madrid,Spain,Fit Hostel Madrid,"SEK 4,919",Scored 5.7,202 reviews,1.4 km from centre
7,Madrid,Spain,Hostel MYD La Latina,"SEK 4,939",Scored 6.2,"2,674 reviews",1.1 km from centre
8,Madrid,Spain,SabiaNatura - boutiqueMadrid,"SEK 5,227",Scored 7.9,"1,803 reviews",400 m from centre
9,Madrid,Spain,Arc House Pop Art,"SEK 5,234",Scored 6.3,546 reviews,1.1 km from centre


In [None]:
df.dropna(inplace=True)
df

In [None]:
# test code:

name = container[0].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[0].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[0].select("div > div.ac4a7896c7")[0].text
vote = container[0].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[0].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)


name = container[1].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[1].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[1].select("div > div.ac4a7896c7")[0].text
vote = container[1].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[1].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)


name = container[2].select("a > div.f6431b446c.a15b38c233")[0].text
price = container[2].select("div > span.f6431b446c.fbfd7c1165.e84eb96b1f")[0].text
rating = container[2].select("div > div.ac4a7896c7")[0].text
vote = container[2].select("div > div.abf093bdfe.f45d8e4c32.d935416c47")[0].text
distance  = container[2].select('span > span[data-testid="distance"]')[0].text


print(name)
print(price)
print(rating)
print(vote)
print(distance)

In [2]:
import pandas as pd

city_country_df=pd.Series(["London","Madrid"])

city = input("Please input the city: ").replace(" ","")
if city.lower() not in city_country_df.str.lower().to_list():
    print("City not found! Please input a valid city!")
else:
    print("Correct city！")

Correct city！


In [4]:
city_country_df=pd.Series(["UK","Spain"])
while True:
    country = input("Please input the country (or abbreviation): ").replace(" ","")
    country_match = (country.lower() in city_country_df.str.lower().to_list()) | (country.upper() in city_country_df.str.upper().to_list())
    if not country_match:
        print("Country not found! Please input a valid country!")
    else:   
        print("Correct country!")
        break
    

Country not found! Please input a valid country!
Country not found! Please input a valid country!
Correct country!


In [18]:

city_country_df = pd.Series(["UK","Spain"])

while True:
        # Get valid country
    country = input("Please input the country (or abbreviation): ").replace(" ","")
    country_match = (country.lower() in city_country_df.str.lower().to_list()) | (country.upper() in city_country_df.str.upper().to_list())
    if not country_match:
        print("Country not found! Please input a valid country!")
    else:
        print("Correct country!")
        break

Country not found! Please input a valid country!
Country not found! Please input a valid country!
Country not found! Please input a valid country!
Correct country!
