In [None]:
import pandas as pd
df = pd.read_csv('hotel_bookings.csv')
df.head()

In [3]:
df.shape

(119390, 33)

In [5]:
missing = df.isnull().sum()
missing[missing > 0]

children         4
country        488
agent        16340
company     112593
dtype: int64

In [6]:
# Fill missing values
df['children'] = df['children'].fillna(0)  # Assume no children if missing
df['country'] = df['country'].fillna('Unknown')
df['agent'] = df['agent'].fillna(0)  # 0 means no agent
df['company'] = df['company'].fillna(0)  # 0 means no company

# Verify
df.isnull().sum().sum()

np.int64(0)

In [None]:
df.dtypes


In [8]:
df['children'] = df['children'].astype(int)
df['agent'] = df['agent'].astype(int)
df['company'] = df['company'].astype(int)

df[['children', 'agent', 'company']].dtypes


children    int64
agent       int64
company     int64
dtype: object

In [9]:
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['arrival_date_month'] = df['arrival_date_month'].map(month_map)

df['arrival_date_month'].head()


0    7
1    4
2    9
3    8
4    9
Name: arrival_date_month, dtype: int64

In [None]:
df.columns.tolist()

In [19]:
df.duplicated().sum()

np.int64(63)

In [20]:
df = df.drop_duplicates()

In [21]:
df.shape


(119327, 31)

In [22]:
df['lead_time'].describe()

count    119327.000000
mean        103.967535
std         106.837568
min           0.000000
25%          18.000000
50%          69.000000
75%         160.000000
max         737.000000
Name: lead_time, dtype: float64

In [23]:
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

Index(['hotel', 'meal', 'country', 'market_segment', 'distribution_channel',
       'reserved_room_type', 'assigned_room_type', 'deposit_type',
       'customer_type', 'city'],
      dtype='object')

In [24]:
for col in cat_cols:
    print(f"{col}: {df[col].nunique()} unique values")

hotel: 30 unique values
meal: 5 unique values
country: 178 unique values
market_segment: 8 unique values
distribution_channel: 5 unique values
reserved_room_type: 10 unique values
assigned_room_type: 12 unique values
deposit_type: 3 unique values
customer_type: 4 unique values
city: 15 unique values


In [26]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['country_encoded'] = le.fit_transform(df['country'])
df = df.drop(columns=['country'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country_encoded'] = le.fit_transform(df['country'])


In [27]:
df['country_encoded'].head()

0    135
1    135
2     59
3     59
4     59
Name: country_encoded, dtype: int64

In [28]:
cat_to_onehot = ['hotel', 'meal', 'market_segment', 'distribution_channel', 
                 'reserved_room_type', 'assigned_room_type', 
                 'deposit_type', 'customer_type', 'city']

df = pd.get_dummies(df, columns=cat_to_onehot, drop_first=True)

In [29]:
df.shape

(119327, 105)

In [31]:
# Remove negative ADR
df = df[df['adr'] >= 0]

# Cap extreme high values at 99th percentile
cap = df['adr'].quantile(0.99)
df['adr'] = df['adr'].clip(upper=cap)

In [32]:
df['adr'].describe()

count    119326.000000
mean        101.463675
std          46.909678
min           0.000000
25%          69.290000
50%          94.600000
75%         126.000000
max         252.000000
Name: adr, dtype: float64

In [33]:
df['total_stay'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

In [34]:
df[['stays_in_weekend_nights', 'stays_in_week_nights', 'total_stay']].head()

Unnamed: 0,stays_in_weekend_nights,stays_in_week_nights,total_stay
0,0,0,0
1,0,0,0
2,0,1,1
3,0,1,1
4,0,2,2


In [35]:
df.to_csv('hotel_bookings_cleaned.csv', index=False)

In [36]:
import os
os.path.getsize('hotel_bookings_cleaned.csv')

65772299