In [3]:
import pandas as pd

# Location of the raw bookings export, relative to the notebook's working directory.
raw_csv_path = '../data/hotel_bookings.csv'

df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,city
0,Resort Hotel - Chandigarh,0,342,2024,July,30,27,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,16:40.9,Chandigarh
1,Resort Hotel - Mumbai,0,737,2024,April,17,28,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,56:21.5,Mumbai
2,Resort Hotel - Delhi,0,7,2024,September,37,10,0,1,1,...,,,0,Transient,75.0,0,0,Check-Out,46:25.7,Delhi
3,Resort Hotel - Kolkata,0,13,2024,August,33,14,0,1,1,...,304.0,,0,Transient,75.0,0,0,Check-Out,07:10.1,Kolkata
4,Resort Hotel - Lucknow,0,14,2024,September,37,14,0,2,2,...,240.0,,0,Transient,98.0,0,1,Check-Out,27:32.5,Lucknow


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning step.
df.shape

(119390, 33)

In [5]:
# Null count per column; display only the columns that actually contain nulls.
missing = df.isna().sum()
missing.loc[missing.gt(0)]

children         4
country        488
agent        16340
company     112593
dtype: int64

In [6]:
# Placeholder substituted for missing entries, one per affected column.
fill_values = {
    'children': 0,         # Assume no children if missing
    'country': 'Unknown',
    'agent': 0,            # 0 means no agent
    'company': 0,          # 0 means no company
}
for column, default in fill_values.items():
    df[column] = df[column].fillna(default)

# Verify: total number of nulls remaining across the whole frame.
df.isna().sum().sum()

np.int64(0)

In [7]:
# Inspect the dtype pandas holds for each column.
df.dtypes


hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [8]:
# Columns converted to integer dtype in this cell.
int_columns = ['children', 'agent', 'company']
for name in int_columns:
    df[name] = df[name].astype(int)

# Confirm the resulting dtypes.
df[int_columns].dtypes


children    int64
agent       int64
company     int64
dtype: object

In [9]:
# English month name -> calendar month number.
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Series.map produces NaN for every value that is not a key of month_map.
# Build the mapped column separately and validate it before overwriting, so an
# unexpected label (or re-running this cell on the already-numeric column)
# raises instead of silently replacing the data with NaN.
month_numbers = df['arrival_date_month'].map(month_map)
unmapped = df.loc[month_numbers.isna(), 'arrival_date_month'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"arrival_date_month has values missing from month_map: {unmapped.tolist()}"
    )
df['arrival_date_month'] = month_numbers

df['arrival_date_month'].head()


0    7
1    4
2    9
3    8
4    9
Name: arrival_date_month, dtype: int64

In [10]:
# List every column name, in frame order.
df.columns.tolist()

['hotel',
 'is_canceled',
 'lead_time',
 'arrival_date_year',
 'arrival_date_month',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'reserved_room_type',
 'assigned_room_type',
 'booking_changes',
 'deposit_type',
 'agent',
 'company',
 'days_in_waiting_list',
 'customer_type',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'reservation_status',
 'reservation_status_date',
 'city']

In [11]:
# Count rows that repeat an earlier row across all columns.
df.duplicated().sum()

np.int64(0)

In [12]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [13]:
# Shape after drop_duplicates, for comparison with the shape reported after loading.
df.shape


(119390, 33)

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for lead_time.
df['lead_time'].describe()

count    119390.000000
mean        104.011416
std         106.863097
min           0.000000
25%          18.000000
50%          69.000000
75%         160.000000
max         737.000000
Name: lead_time, dtype: float64

In [15]:
# Names of the columns stored with object dtype; reused by the cells below.
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

Index(['hotel', 'meal', 'country', 'market_segment', 'distribution_channel',
       'reserved_room_type', 'assigned_room_type', 'deposit_type',
       'customer_type', 'reservation_status', 'reservation_status_date',
       'city'],
      dtype='object')

In [16]:
# Report the number of distinct values in each object-dtype column.
for col, n_unique in df[cat_cols].nunique().items():
    print(f"{col}: {n_unique} unique values")

hotel: 30 unique values
meal: 5 unique values
country: 178 unique values
market_segment: 8 unique values
distribution_channel: 5 unique values
reserved_room_type: 10 unique values
assigned_room_type: 12 unique values
deposit_type: 3 unique values
customer_type: 4 unique values
reservation_status: 3 unique values
reservation_status_date: 34552 unique values
city: 15 unique values


In [17]:
from sklearn.preprocessing import LabelEncoder

# Replace the country strings with one integer code per distinct value.
# NOTE(review): the codes are plain integers, so downstream models may read an
# ordering into them that the countries do not have - confirm this is acceptable.
le = LabelEncoder()
country_codes = le.fit_transform(df['country'])
df = df.assign(country_encoded=country_codes).drop(columns='country')

In [18]:
# Preview the integer codes assigned to the first few rows.
df['country_encoded'].head()

0    135
1    135
2     59
3     59
4     59
Name: country_encoded, dtype: int64

In [19]:
# Categorical columns expanded into indicator columns.
# NOTE(review): 'reservation_status' and 'reservation_status_date' are also
# object-dtype columns but are not listed here, so they pass through
# unencoded - confirm that is intended.
cat_to_onehot = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'city',
]

# drop_first=True omits the indicator for the first level of each column.
df = pd.get_dummies(data=df, drop_first=True, columns=cat_to_onehot)

In [20]:
# Shape after one-hot encoding; each listed categorical column is now a set of indicator columns.
df.shape

(119390, 107)

In [21]:
# Remove negative ADR.
# Take an explicit copy so the column assignment below targets an independent
# frame rather than a boolean-filtered selection, which pandas (without
# Copy-on-Write) may flag with SettingWithCopyWarning.
df = df[df['adr'] >= 0].copy()

# Cap extreme high values at the 99th percentile, computed on the filtered rows.
cap = df['adr'].quantile(0.99)
df['adr'] = df['adr'].clip(upper=cap)

In [22]:
# Summary statistics for adr after the negative-value filter and upper cap.
df['adr'].describe()

count    119389.000000
mean        101.462101
std          46.904313
min           0.000000
25%          69.290000
50%          94.590000
75%         126.000000
max         252.000000
Name: adr, dtype: float64

In [23]:
# Total nights per booking: weekend nights plus week nights.
weekend_nights = df['stays_in_weekend_nights']
df['total_stay'] = weekend_nights.add(df['stays_in_week_nights'])

In [24]:
# Spot-check that total_stay equals the sum of the two night-count columns.
df[['stays_in_weekend_nights', 'stays_in_week_nights', 'total_stay']].head()

Unnamed: 0,stays_in_weekend_nights,stays_in_week_nights,total_stay
0,0,0,0
1,0,0,0
2,0,1,1
3,0,1,1
4,0,2,2


In [25]:
# Persist the cleaned frame; index=False leaves the row index out of the CSV.
df.to_csv('../data/hotel_bookings_cleaned.csv', index=False)

In [26]:
import os

# Size in bytes of the CSV written by the previous cell.
os.stat('../data/hotel_bookings_cleaned.csv').st_size

67910709