In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the raw bookings CSV into a DataFrame.
# The path is kept in one named constant so it is easy to spot and change.
DATA_PATH = "/content/combined data.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Sanity check: show the column index, the (rows, columns) shape and the first rows.
for label, value in (
    ("Dataset columns:", data.columns),
    ("Dataset shape:", data.shape),
    ("Sample data:\n", data.head()),
):
    print(label, value)

Dataset columns: Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')
Dataset shape: (54720, 32)
Sample data:
           hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            1         85               2018               July   
1  Resort Hotel            1         75               2018    

In [None]:
# Create the 'total_nights' feature and drop columns that are not used as model inputs.
#
# This cell is written so it can be re-run safely: after the first run the two
# 'stays_in_*' source columns no longer exist, so the feature is only (re)computed
# while they are present, and only columns that are still in the frame are dropped.
night_cols = ['stays_in_weekend_nights', 'stays_in_week_nights']
if all(col in data.columns for col in night_cols):
    data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
elif 'total_nights' not in data.columns:
    # Neither the source columns nor a previously derived feature exist:
    # fail loudly instead of continuing without the feature.
    raise KeyError(f"Cannot derive 'total_nights': expected columns {night_cols}")

unused_cols = ['reservation_status', 'reservation_status_date', 'arrival_date_month',
               'arrival_date_year', 'arrival_date_week_number',
               'arrival_date_day_of_month', 'stays_in_weekend_nights',
               'stays_in_week_nights']
# Rebind instead of using inplace=True so the drop step has no hidden in-place side effect.
data = data.drop(columns=[col for col in unused_cols if col in data.columns])

In [None]:
# Sanity check on the target column: is 'adr' present, and how many of its values are missing?
adr_present = 'adr' in data.columns
adr_missing = data['adr'].isnull().sum()
print(f"'adr' in dataset columns: {adr_present}")
print(f"Number of missing values in 'adr': {adr_missing}")

'adr' in dataset columns: True
Number of missing values in 'adr': 0


In [None]:
# Identify numerical and categorical feature columns.
# 'adr' is the prediction target and is removed from the feature matrix later on,
# so it must not appear in the numerical feature list: a ColumnTransformer that is
# asked for a column the feature matrix does not contain fails at fit time.
# errors='ignore' keeps this safe even if 'adr' is not a numeric column.
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.drop('adr', errors='ignore')
categorical_cols = data.select_dtypes(include=['object']).columns


In [None]:
# Numeric feature columns with the target 'adr' filtered out
# (the mask simply matches nothing if 'adr' is not among them).
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numeric_columns[numeric_columns != 'adr']
# Object-dtype columns are treated as categorical features.
categorical_cols = data.select_dtypes(include='object').columns

In [None]:
# Numerical features: fill missing values with the column mean.
num_preprocessor = SimpleImputer(strategy='mean')

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode (handle_unknown='ignore' so unseen categories do not raise).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_preprocessor = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group to its own preprocessor:
# numerical columns -> num_preprocessor, categorical columns -> cat_preprocessor.
column_routes = [
    ('num', num_preprocessor, numerical_cols),
    ('cat', cat_preprocessor, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Full modelling pipeline: preprocessing followed by a random forest regressor.
# The fixed random_state seeds the forest so repeated runs build the same model.
forest_regressor = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest_regressor),
])

In [None]:
# Separate the target from the features.
target_column = 'adr'
y = data[target_column]               # Target (Average Daily Rate)
X = data.drop(columns=target_column)  # Features: every remaining column


In [None]:
# Sanity check: X and y should have the same number of rows.
print(f"Shape of feature matrix (X): {X.shape}")
print(f"Shape of target vector (y): {y.shape}")

Shape of feature matrix (X): (54720, 24)
Shape of target vector (y): (54720,)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit the preprocessing steps and the model on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = pipeline.predict(X=X_test)


In [None]:
# Score the test-set predictions: R-squared and mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Report both metrics rounded to two decimals, one per line under a header.
print(
    "\nModel Performance:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R2): {r2:.2f}",
    sep="\n",
)


Model Performance:
Mean Squared Error (MSE): 1084.89
R-squared (R2): 0.55
