# Create a Hotel Dataset

In [6]:
#Import data libraries
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run generates the same dataset
np.random.seed(0)

# Number of rows to simulate (one row per calendar day)
n = 1000

# One consecutive daily date per row, starting 2022-01-01
dates = pd.date_range(start="2022-01-01", periods=n, freq='D')

# Independently simulated columns.
# The order of the np.random calls below fixes which values each column
# receives under the seed, so new draws should only be appended at the end.
# NOTE: ADR is drawn from an unbounded normal distribution, so it (and the
# RevPAR / Total Charges derived from it) can occasionally be negative; the
# values are left unclipped so the seeded dataset stays unchanged.
data = {
    "Date": dates,
    "Hotel_ID": np.random.randint(100, 500, n),
    "ADR": np.round(np.random.normal(loc=150, scale=50, size=n), 2),
    "Occupancy Rate": np.round(np.random.uniform(0.5, 1.0, n), 2),
    "Available Rooms": np.random.randint(50, 200, n),
    "Cancelations": np.random.randint(0, 10, n),
    "No Shows": np.random.randint(0, 5, n),
    "Deposits": np.round(np.random.uniform(500, 5000, n), 2),
    "Customer Type": np.random.choice(['Business', 'Leisure', 'Group'], n),
    "Stay Length": np.random.randint(1, 15, n),
    "Market Segment": np.random.choice(['Online', 'Direct', 'Agent'], n),
    "Booking Channel": np.random.choice(['Website', 'Phone', 'Agent'], n),
    "Special Requests": np.random.randint(0, 5, n),
    "Meal Plan": np.random.choice(['None', 'Breakfast', 'Half-board'], n),
    "Check-in Day of Week": np.random.choice(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], n),
    "Advance Booking Days": np.random.randint(0, 60, n),
    "Revenue from Extras": np.round(np.random.uniform(0, 200, n), 2)
}

# Final column layout: each derived column sits right after the inputs it is
# computed from (RevPAR, Booked Rooms, Total Charges).
column_order = [
    "Date", "Hotel_ID", "ADR", "Occupancy Rate", "RevPAR",
    "Available Rooms", "Booked Rooms", "Cancelations", "No Shows",
    "Deposits", "Customer Type", "Stay Length", "Total Charges",
    "Market Segment", "Booking Channel", "Special Requests", "Meal Plan",
    "Check-in Day of Week", "Advance Booking Days", "Revenue from Extras",
]

# Build the frame from the simulated columns, then add the dependent columns
# with vectorised column arithmetic (no placeholder callables stored in the
# frame and no row-by-row apply).
df = pd.DataFrame(data)
# Revenue per available room = average daily rate x occupancy rate
df["RevPAR"] = np.round(df["ADR"] * df["Occupancy Rate"], 2)
# Booked rooms = available rooms x occupancy rate, rounded to whole rooms
df["Booked Rooms"] = np.round(df["Available Rooms"] * df["Occupancy Rate"]).astype(int)
# Total charges = average daily rate x length of stay
df["Total Charges"] = np.round(df["ADR"] * df["Stay Length"], 2)
# Restore the intended column order (new columns were appended at the end)
df = df[column_order]

In [7]:
# Preview the first five rows of the generated dataset
df.head(n=5)

Unnamed: 0,Date,Hotel_ID,ADR,Occupancy Rate,RevPAR,Available Rooms,Booked Rooms,Cancelations,No Shows,Deposits,Customer Type,Stay Length,Total Charges,Market Segment,Booking Channel,Special Requests,Meal Plan,Check-in Day of Week,Advance Booking Days,Revenue from Extras
0,2022-01-01,272,215.96,0.82,177.09,82,67,3,1,2789.01,Leisure,2,431.92,Direct,Website,2,,Tue,2,165.82
1,2022-01-02,147,105.88,0.53,56.12,80,42,4,4,3567.95,Group,8,847.04,Direct,Agent,0,Half-board,Tue,58,2.45
2,2022-01-03,217,206.43,0.81,167.21,134,109,6,4,2189.25,Group,4,825.72,Agent,Agent,0,,Sun,17,63.95
3,2022-01-04,292,174.8,0.97,169.56,198,192,0,4,1082.61,Group,4,699.2,Online,Phone,3,Breakfast,Thu,1,40.03
4,2022-01-05,423,188.57,0.58,109.37,119,69,7,3,569.33,Leisure,5,942.85,Agent,Agent,4,,Fri,54,3.76


In [2]:
# Summary statistics, transposed so that each column is shown as one row
df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1000.0,2023-05-15 12:00:00,2022-01-01 00:00:00,2022-09-07 18:00:00,2023-05-15 12:00:00,2024-01-20 06:00:00,2024-09-26 00:00:00,
Hotel_ID,1000.0,302.946,100.0,205.0,298.5,405.25,498.0,115.281043
ADR,1000.0,149.29555,-2.31,116.88,149.26,180.3725,308.55,48.780211
Occupancy Rate,1000.0,0.75334,0.5,0.64,0.75,0.88,1.0,0.143547
RevPAR,1000.0,112.86448,-1.2,81.64,108.68,141.3525,282.21,44.015655
Available Rooms,1000.0,122.86,50.0,85.0,119.5,160.0,199.0,43.098258
Booked Rooms,1000.0,92.466,27.0,63.0,90.0,117.0,194.0,37.205579
Cancelations,1000.0,4.522,0.0,2.0,4.0,7.0,9.0,2.940436
No Shows,1000.0,1.99,0.0,1.0,2.0,3.0,4.0,1.410635
Deposits,1000.0,2731.07985,502.21,1577.775,2755.665,3871.8425,4997.65,1299.082282


In [3]:
# Print a concise summary of the frame: index range, column names,
# non-null counts and the dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  1000 non-null   datetime64[ns]
 1   Hotel_ID              1000 non-null   int64         
 2   ADR                   1000 non-null   float64       
 3   Occupancy Rate        1000 non-null   float64       
 4   RevPAR                1000 non-null   float64       
 5   Available Rooms       1000 non-null   int64         
 6   Booked Rooms          1000 non-null   int64         
 7   Cancelations          1000 non-null   int64         
 8   No Shows              1000 non-null   int64         
 9   Deposits              1000 non-null   float64       
 10  Customer Type         1000 non-null   object        
 11  Stay Length           1000 non-null   int64         
 12  Total Charges         1000 non-null   float64       
 13  Market Segment     

In [5]:
# Export the generated dataset to hotel_data.csv (path is relative to the
# notebook's working directory). index=False leaves the row index out of the
# file, so only the data columns are written.
# NOTE(review): the "Meal Plan" label 'None' is written as the plain text
# None; pandas.read_csv appears to treat that text as a missing value by
# default, so readers may need keep_default_na=False — confirm downstream.
df.to_csv('hotel_data.csv', index=False)