In [1]:
import random
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# SCRAPING + REGEX CLEANING
#
# Walks the paginated Bali hotel listing, pulls six fields out of every hotel
# card and collects them in parallel lists (one entry per hotel, same index in
# every list). A field whose element is absent from the card is stored as NaN.

hotel_name = []
place = []
reviews_rating = []
exp_rating = []
price = []
dist_city_center = []

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Pre-compiled patterns shared by the cleaning helpers below.
_LINE_BREAKS = re.compile(r"[\n\t\r]+")
_SPACE_RUNS = re.compile(r"\s{2,}")
_DECIMAL = re.compile(r"\d+\.\d+")


def _extract(node, cleaner):
    """Return ``cleaner(stripped text of node)``, or NaN when ``node`` is None.

    Guarding here keeps NaN away from the regex helpers, which only accept
    strings and would otherwise raise ``TypeError`` on a missing element.
    """
    if node is None:
        return np.nan
    return cleaner(node.text.strip())


def _one_line(text):
    """Replace each run of newline/tab/CR with one space and trim the ends."""
    return _LINE_BREAKS.sub(" ", text).strip()


def _clean_place(text):
    """Drop everything from the first '|' to the end of its line, then flatten."""
    return _one_line(re.sub(r"\|.*", "", text))  # remove "|View Map"


def _clean_price(text):
    """Normalise a price label to digits, the rupee sign, dashes and spaces."""
    # Fix encoding issues (UTF-8 bytes that were decoded as Latin-1, â¹ → ₹).
    # Text that is already correct cannot make the round trip and is kept as is.
    try:
        text = text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    text = _LINE_BREAKS.sub(" ", text)
    text = _SPACE_RUNS.sub(" ", text)
    return re.sub(r"[^\d₹–\- ]", "", text).strip()


def _clean_review(text):
    """Return the first decimal number in ``text`` (e.g. "8.7"), else ``text`` unchanged."""
    found = _DECIMAL.search(text)
    return found.group(0) if found else text


def _clean_experience(text):
    """Title-case the experience label (e.g. "Very Good") and flatten it."""
    return _one_line(text.title())


def _clean_distance(text):
    """Flatten line breaks and collapse repeated whitespace in the distance label."""
    return _SPACE_RUNS.sub(" ", _LINE_BREAKS.sub(" ", text)).strip()


for page in range(1, 24):  # ~220+ hotels
    if page > 1:
        # Polite delay between requests; placed at the top of the loop so it
        # also applies after a skipped (failed / non-200) page.
        time.sleep(random.uniform(1.5, 3.0))

    url = f"https://traveltriangle.com/hotels/bali?page={page}"
    print(f"Scraping page {page}: {url}")
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Skipping page {page}, request failed: {exc}")
        continue
    if res.status_code != 200:
        print(f"Skipping page {page}, status {res.status_code}")
        continue

    soup = BeautifulSoup(res.text, "lxml")
    cards = soup.find_all("div", class_="clearfix p8 relative wfull min-width-0")

    if not cards:
        print("No hotel cards found, stopping.")
        break

    for card in cards:
        # Parse every field first and append afterwards, so an unexpected
        # error on one card cannot leave the six lists with different lengths.
        name = _extract(card.find("a", class_=re.compile("at_hotelName")), _one_line)
        plc = _extract(card.find("div", class_="m0 f12"), _clean_place)
        price_text = _extract(card.find("p", class_="f16 fw7 m0 sfc3"), _clean_price)
        # Reviews Rating (8.7)
        review = _extract(card.find("span", class_="f14 fw9 sfcw"), _clean_review)
        # Experience Rating ("Very Good")
        exp_text = _extract(card.find("span", class_="text-capitalize"), _clean_experience)
        # Distance from City Center; a missing value is now NaN rather than
        # the literal string "nan".
        dist_text = _extract(
            card.find("p", string=re.compile("km from City Center")),
            _clean_distance,
        )

        hotel_name.append(name)
        place.append(plc)
        price.append(price_text)
        reviews_rating.append(review)
        exp_rating.append(exp_text)
        dist_city_center.append(dist_text)

    print(f"Total hotels so far: {len(hotel_name)}")

# SAVE CLEANED DATA

# Column name -> list of values; every list has one entry per scraped hotel.
bali_hotels = {
    "Hotel_Name": hotel_name,
    "Place": place,
    "Reviews_Rating": reviews_rating,
    "Exp_Rating": exp_rating,
    "Price": price,
    "Distance_City_Center": dist_city_center
}



Scraping page 1: https://traveltriangle.com/hotels/bali?page=1
Total hotels so far: 10
Scraping page 2: https://traveltriangle.com/hotels/bali?page=2
Total hotels so far: 20
Scraping page 3: https://traveltriangle.com/hotels/bali?page=3
Total hotels so far: 30
Scraping page 4: https://traveltriangle.com/hotels/bali?page=4
Total hotels so far: 40
Scraping page 5: https://traveltriangle.com/hotels/bali?page=5
Total hotels so far: 50
Scraping page 6: https://traveltriangle.com/hotels/bali?page=6
Total hotels so far: 60
Scraping page 7: https://traveltriangle.com/hotels/bali?page=7
Total hotels so far: 70
Scraping page 8: https://traveltriangle.com/hotels/bali?page=8
Total hotels so far: 80
Scraping page 9: https://traveltriangle.com/hotels/bali?page=9
Total hotels so far: 90
Scraping page 10: https://traveltriangle.com/hotels/bali?page=10
Total hotels so far: 100
Scraping page 11: https://traveltriangle.com/hotels/bali?page=11
Total hotels so far: 110
Scraping page 12: https://traveltrian

In [2]:
# Assemble the per-column lists into one table; the dict keys become the column names.
bali_hotels_data = pd.DataFrame(data=bali_hotels)

In [3]:
# Bare last expression: the notebook renders the frame as this cell's output.
bali_hotels_data

Unnamed: 0,Hotel_Name,Place,Reviews_Rating,Exp_Rating,Price,Distance_City_Center
0,Ize Seminyak,Seminyak,8.7,Very Good,₹ 5000 - ₹ 7500,1 km from City Center
1,Kuta Paradiso Hotel,Kuta,8.6,Very Good,₹ 5000 - ₹ 7500,3 km from City Center
2,Courtyard By Marriott Bali Seminyak Resort,Seminyak,9.9,Excellent,₹ 10000,1 km from City Center
3,Montigo Resorts Seminyak,Seminyak,9.7,Excellent,₹ 10000,2 km from City Center
4,Warwick Ibah Luxury Villas & Spa Ubud,Ubud,9.6,Excellent,₹ 10000,2 km from City Center
...,...,...,...,...,...,...
221,Bambu Indah Resort,Ubud,7.6,Good,₹ 5000 - ₹ 7500,1 km from City Center
222,Alam Puri Art Museum Villa Denpasar,Denpasar,7.5,Good,₹ 2500 - ₹ 5000,5 km from City Center
223,Pink Coco Bali,Uluwatu,7.3,Good,₹ 5000 - ₹ 7500,
224,The Rani Hotel & Spa Kutta,Kuta,3.0,Review Score,₹ 1000 - ₹ 2500,2 km from City Center


In [10]:
# Export the table to the current user's Downloads folder. The path is built
# from the home directory instead of a hard-coded user profile, so the cell
# runs on any machine; the folder is created first if it does not exist yet.
# "utf-8-sig" writes a BOM so that Excel detects UTF-8 and shows "₹" correctly.
output_path = Path.home() / "Downloads" / "bali_hotels_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
bali_hotels_data.to_csv(output_path, index=False, encoding="utf-8-sig")
print(" Saved directly to Downloads as a CSV file!")

✅ Saved directly to Downloads as a CSV file!
