<a href="https://colab.research.google.com/github/Shazizan/portfolio/blob/master/generate_dataset_pyspark_hotel_booking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PySpark Code to Generate Hotel Booking Dataset (Medium Size)**

## **1. Install & Import**

In [1]:
!pip install faker
from faker import Faker
import random
from datetime import timedelta
from pyspark.sql import SparkSession, functions as F, types as T

fake = Faker()
spark = SparkSession.builder.appName("HotelBookingDataset").getOrCreate()

Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-38.2.0


## **2. Generate Customers Table**

In [2]:
# Column layout of the customers table: (column name, Spark type).
# Every column is declared non-nullable.
customer_columns = [
    ("customer_id", T.IntegerType()),
    ("full_name", T.StringType()),
    ("email", T.StringType()),
    ("country", T.StringType()),
    ("vip_status", T.StringType()),
    ("signup_date", T.DateType()),
]
customer_schema = T.StructType(
    [T.StructField(col_name, col_type, False) for col_name, col_type in customer_columns]
)

# 500 synthetic customers with ids 1..500. Each tuple follows the column
# order of `customer_schema`; the sign-up date falls within the last 3 years.
customer_rows = [
    (
        customer_id,
        fake.name(),
        fake.email(),
        fake.country(),
        random.choice(["Regular", "Silver", "Gold"]),
        fake.date_between(start_date="-3y", end_date="today"),
    )
    for customer_id in range(1, 501)
]

customers_df = spark.createDataFrame(customer_rows, customer_schema)

## **3. Generate Hotels Table**

In [3]:
# One hotel per city; hotel ids are 1..10 in the order the cities are listed.
cities = ["Tokyo", "Osaka", "London", "New York", "Paris", "Kuala Lumpur", "Dubai", "Singapore", "Bangkok", "Seoul"]

# Each tuple: (hotel_id, hotel_name, city, star_rating 3-5, total_rooms 100-400).
# NOTE(review): total_rooms is drawn at random and is not tied to the number
# of rows generated for the rooms table — confirm that this is intended.
hotel_rows = [
    (
        hotel_id,
        f"Hotel {fake.last_name()}",
        city,
        random.randint(3, 5),
        random.randint(100, 400),
    )
    for hotel_id, city in enumerate(cities, start=1)
]

# All hotel columns are declared non-nullable.
hotel_columns = [
    ("hotel_id", T.IntegerType()),
    ("hotel_name", T.StringType()),
    ("city", T.StringType()),
    ("star_rating", T.IntegerType()),
    ("total_rooms", T.IntegerType()),
]
hotel_schema = T.StructType(
    [T.StructField(col_name, col_type, False) for col_name, col_type in hotel_columns]
)

hotels_df = spark.createDataFrame(hotel_rows, hotel_schema)

## **4. Generate Rooms Table**

In [4]:
room_types = ["Single", "Double", "Suite"]

# Yields each hotel id (1..10) twenty times in a row, i.e. one entry per room:
# 20 rooms per hotel = 200 rooms total.
hotel_id_per_room = (hotel_id for hotel_id in range(1, 11) for _ in range(20))

# Room ids run consecutively from 1 across all hotels.
# Each tuple: (room_id, hotel_id, room_type, price_per_night 150-1000, is_sea_view).
room_rows = [
    (
        room_id,
        hotel_id,
        random.choice(room_types),
        random.randint(150, 1000),
        random.choice([True, False]),
    )
    for room_id, hotel_id in enumerate(hotel_id_per_room, start=1)
]

# All room columns are declared non-nullable.
room_columns = [
    ("room_id", T.IntegerType()),
    ("hotel_id", T.IntegerType()),
    ("room_type", T.StringType()),
    ("price_per_night", T.IntegerType()),
    ("is_sea_view", T.BooleanType()),
]
room_schema = T.StructType(
    [T.StructField(col_name, col_type, False) for col_name, col_type in room_columns]
)

rooms_df = spark.createDataFrame(room_rows, room_schema)

## **5. Generate Bookings Table**

Logic
- Random check-in date within last 2 years
- Stay between 1–7 nights
- Total price = nights × room rate
- About 15% of bookings are cancelled (each booking is independently flagged with probability 0.15)

In [5]:
# Fetch every room's nightly rate once and keep it in a plain dict on the
# driver. Looking the price up with rooms_df.filter(...).first() inside the
# loop would launch a separate Spark job for each of the 3000 bookings.
price_by_room = {
    row["room_id"]: row["price_per_night"]
    for row in rooms_df.select("room_id", "price_per_night").collect()
}

# Build 3000 synthetic bookings. Each tuple follows the column order of
# `booking_schema` below.
booking_rows = []
for i in range(3000):
    # Foreign keys: customer ids span 1..500 and room ids 1..200, matching
    # the sizes of the customers and rooms tables generated above.
    customer_id = random.randint(1, 500)
    room_id = random.randint(1, 200)

    # Check-in within the last 2 years, stay of 1-7 nights.
    checkin = fake.date_between(start_date="-2y", end_date="today")
    nights = random.randint(1, 7)
    checkout = checkin + timedelta(days=nights)

    # Total price = nights x room rate.
    total_price = price_by_room[room_id] * nights

    booking_rows.append((
        i+1,
        customer_id,
        room_id,
        # Booking is made between 30 days before check-in and check-in day.
        fake.date_between(start_date=checkin - timedelta(days=30), end_date=checkin),
        checkin,
        checkout,
        nights,
        float(total_price),  # schema column is DoubleType
        random.choice(["Paid", "Pending", "Failed"]),
        random.random() < 0.15  # 15% cancellation
    ))

# All booking columns are declared non-nullable.
booking_schema = T.StructType([
    T.StructField("booking_id", T.IntegerType(), False),
    T.StructField("customer_id", T.IntegerType(), False),
    T.StructField("room_id", T.IntegerType(), False),
    T.StructField("booking_date", T.DateType(), False),
    T.StructField("checkin_date", T.DateType(), False),
    T.StructField("checkout_date", T.DateType(), False),
    T.StructField("num_nights", T.IntegerType(), False),
    T.StructField("total_price", T.DoubleType(), False),
    T.StructField("payment_status", T.StringType(), False),
    T.StructField("is_cancelled", T.BooleanType(), False),
])

bookings_df = spark.createDataFrame(booking_rows, booking_schema)

## **6. Save Data in Multiple Formats**

Save as CSV

In [None]:
#customers_df.write.mode("overwrite").csv("/content/hotel/customers_csv/", header=True)
#hotels_df.write.mode("overwrite").csv("/content/hotel/hotels_csv/", header=True)
#rooms_df.write.mode("overwrite").csv("/content/hotel/rooms_csv/", header=True)
#bookings_df.write.mode("overwrite").csv("/content/hotel/bookings_csv/", header=True)

Save as JSON

In [6]:
# Each table paired with the directory its JSON part files are written to.
json_targets = [
    (customers_df, "/content/hotel/customers_json/"),
    (hotels_df, "/content/hotel/hotels_json/"),
    (rooms_df, "/content/hotel/rooms_json/"),
    (bookings_df, "/content/hotel/bookings_json/"),
]

# "overwrite" replaces any output left in the directory by a previous run.
for table_df, target_dir in json_targets:
    table_df.write.mode("overwrite").json(target_dir)

Save as Parquet

In [7]:
# Each table paired with the directory its Parquet part files are written to.
parquet_targets = [
    (customers_df, "/content/hotel/customers_parquet/"),
    (hotels_df, "/content/hotel/hotels_parquet/"),
    (rooms_df, "/content/hotel/rooms_parquet/"),
    (bookings_df, "/content/hotel/bookings_parquet/"),
]

# "overwrite" replaces any output left in the directory by a previous run.
for table_df, target_dir in parquet_targets:
    table_df.write.mode("overwrite").parquet(target_dir)

Save as Delta

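Initially, I got an error while running this code, so the cell below is left commented out. Writing Delta tables needs the `delta-spark` package and a Spark session configured with the Delta extensions, neither of which this notebook sets up.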

In [None]:
#customers_df.write.format("delta").mode("overwrite").save("/content/hotel/customers_delta/")
#hotels_df.write.format("delta").mode("overwrite").save("/content/hotel/hotels_delta/")
#rooms_df.write.format("delta").mode("overwrite").save("/content/hotel/rooms_delta/")
#bookings_df.write.format("delta").mode("overwrite").save("/content/hotel/bookings_delta/")

## **Download JSON folders**

In [9]:
# Colab-only module; `files.download` sends a file from the runtime to the browser.
from google.colab import files
# Zip the customers JSON output directory (part files, _SUCCESS marker and
# .crc files) into customers_json.zip in the current working directory.
!zip -r customers_json.zip /content/hotel/customers_json/
# Offer the archive as a browser download.
files.download("customers_json.zip")

  adding: content/hotel/customers_json/ (stored 0%)
  adding: content/hotel/customers_json/part-00000-8fcce1f1-38f2-442f-8a10-611b640bbe7c-c000.json (deflated 80%)
  adding: content/hotel/customers_json/.part-00001-8fcce1f1-38f2-442f-8a10-611b640bbe7c-c000.json.crc (stored 0%)
  adding: content/hotel/customers_json/part-00001-8fcce1f1-38f2-442f-8a10-611b640bbe7c-c000.json (deflated 80%)
  adding: content/hotel/customers_json/_SUCCESS (stored 0%)
  adding: content/hotel/customers_json/.part-00000-8fcce1f1-38f2-442f-8a10-611b640bbe7c-c000.json.crc (stored 0%)
  adding: content/hotel/customers_json/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Zip the hotels JSON output directory into hotels_json.zip in the current
# working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r hotels_json.zip /content/hotel/hotels_json/
files.download("hotels_json.zip")

  adding: content/hotel/hotels_json/ (stored 0%)
  adding: content/hotel/hotels_json/part-00001-2dfaa305-7efa-4ff1-bb4c-94ca6082b68f-c000.json (deflated 61%)
  adding: content/hotel/hotels_json/.part-00001-2dfaa305-7efa-4ff1-bb4c-94ca6082b68f-c000.json.crc (stored 0%)
  adding: content/hotel/hotels_json/part-00000-2dfaa305-7efa-4ff1-bb4c-94ca6082b68f-c000.json (deflated 62%)
  adding: content/hotel/hotels_json/_SUCCESS (stored 0%)
  adding: content/hotel/hotels_json/.part-00000-2dfaa305-7efa-4ff1-bb4c-94ca6082b68f-c000.json.crc (stored 0%)
  adding: content/hotel/hotels_json/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Zip the rooms JSON output directory into rooms_json.zip in the current
# working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r rooms_json.zip /content/hotel/rooms_json/
files.download("rooms_json.zip")

  adding: content/hotel/rooms_json/ (stored 0%)
  adding: content/hotel/rooms_json/part-00001-33d96b40-df14-4470-9d60-d4bcc67d85de-c000.json (deflated 91%)
  adding: content/hotel/rooms_json/part-00000-33d96b40-df14-4470-9d60-d4bcc67d85de-c000.json (deflated 90%)
  adding: content/hotel/rooms_json/.part-00001-33d96b40-df14-4470-9d60-d4bcc67d85de-c000.json.crc (stored 0%)
  adding: content/hotel/rooms_json/_SUCCESS (stored 0%)
  adding: content/hotel/rooms_json/.part-00000-33d96b40-df14-4470-9d60-d4bcc67d85de-c000.json.crc (stored 0%)
  adding: content/hotel/rooms_json/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Zip the bookings JSON output directory into bookings_json.zip in the current
# working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r bookings_json.zip /content/hotel/bookings_json/
files.download("bookings_json.zip")

  adding: content/hotel/bookings_json/ (stored 0%)
  adding: content/hotel/bookings_json/part-00001-4703f08d-44ab-4699-9b6f-ac1586129980-c000.json (deflated 89%)
  adding: content/hotel/bookings_json/part-00000-4703f08d-44ab-4699-9b6f-ac1586129980-c000.json (deflated 89%)
  adding: content/hotel/bookings_json/.part-00000-4703f08d-44ab-4699-9b6f-ac1586129980-c000.json.crc (stored 0%)
  adding: content/hotel/bookings_json/_SUCCESS (stored 0%)
  adding: content/hotel/bookings_json/.part-00001-4703f08d-44ab-4699-9b6f-ac1586129980-c000.json.crc (stored 0%)
  adding: content/hotel/bookings_json/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Download Parquet folders**

In [13]:
# Zip the customers Parquet output directory into customers_parquet.zip in the
# current working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r customers_parquet.zip /content/hotel/customers_parquet/
files.download("customers_parquet.zip")

  adding: content/hotel/customers_parquet/ (stored 0%)
  adding: content/hotel/customers_parquet/part-00000-c8e03e1a-c967-4c4e-84a7-d8fbb5366beb-c000.snappy.parquet (deflated 25%)
  adding: content/hotel/customers_parquet/part-00001-c8e03e1a-c967-4c4e-84a7-d8fbb5366beb-c000.snappy.parquet (deflated 25%)
  adding: content/hotel/customers_parquet/.part-00000-c8e03e1a-c967-4c4e-84a7-d8fbb5366beb-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/customers_parquet/_SUCCESS (stored 0%)
  adding: content/hotel/customers_parquet/.part-00001-c8e03e1a-c967-4c4e-84a7-d8fbb5366beb-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/customers_parquet/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Zip the hotels Parquet output directory into hotels_parquet.zip in the
# current working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r hotels_parquet.zip /content/hotel/hotels_parquet/
files.download("hotels_parquet.zip")

  adding: content/hotel/hotels_parquet/ (stored 0%)
  adding: content/hotel/hotels_parquet/.part-00001-1e76c735-f2f1-4332-befb-a1eb12046c8b-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/hotels_parquet/_SUCCESS (stored 0%)
  adding: content/hotel/hotels_parquet/.part-00000-1e76c735-f2f1-4332-befb-a1eb12046c8b-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/hotels_parquet/part-00001-1e76c735-f2f1-4332-befb-a1eb12046c8b-c000.snappy.parquet (deflated 47%)
  adding: content/hotel/hotels_parquet/part-00000-1e76c735-f2f1-4332-befb-a1eb12046c8b-c000.snappy.parquet (deflated 48%)
  adding: content/hotel/hotels_parquet/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Zip the rooms Parquet output directory into rooms_parquet.zip in the
# current working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r rooms_parquet.zip /content/hotel/rooms_parquet/
files.download("rooms_parquet.zip")

  adding: content/hotel/rooms_parquet/ (stored 0%)
  adding: content/hotel/rooms_parquet/part-00001-5bfda887-42b6-4966-aa30-ba52d9f4bd95-c000.snappy.parquet (deflated 46%)
  adding: content/hotel/rooms_parquet/_SUCCESS (stored 0%)
  adding: content/hotel/rooms_parquet/.part-00001-5bfda887-42b6-4966-aa30-ba52d9f4bd95-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/rooms_parquet/.part-00000-5bfda887-42b6-4966-aa30-ba52d9f4bd95-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/rooms_parquet/part-00000-5bfda887-42b6-4966-aa30-ba52d9f4bd95-c000.snappy.parquet (deflated 47%)
  adding: content/hotel/rooms_parquet/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Zip the bookings Parquet output directory into bookings_parquet.zip in the
# current working directory, then offer the archive as a browser download.
# Relies on `files` imported from google.colab in an earlier cell.
!zip -r bookings_parquet.zip /content/hotel/bookings_parquet/
files.download("bookings_parquet.zip")

  adding: content/hotel/bookings_parquet/ (stored 0%)
  adding: content/hotel/bookings_parquet/part-00000-d8fb55cf-7900-48f9-ae00-1998725855ae-c000.snappy.parquet (deflated 32%)
  adding: content/hotel/bookings_parquet/part-00001-d8fb55cf-7900-48f9-ae00-1998725855ae-c000.snappy.parquet (deflated 28%)
  adding: content/hotel/bookings_parquet/.part-00000-d8fb55cf-7900-48f9-ae00-1998725855ae-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/bookings_parquet/_SUCCESS (stored 0%)
  adding: content/hotel/bookings_parquet/.part-00001-d8fb55cf-7900-48f9-ae00-1998725855ae-c000.snappy.parquet.crc (stored 0%)
  adding: content/hotel/bookings_parquet/._SUCCESS.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>