In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
import os

# Load the dataset
data = pd.read_csv('hotel_dataset.csv')

# Columns used for training: 14 feature columns plus the `is_canceled` target
columns_to_use = ['hotel', 'lead_time', 'arrival_date_week_number', 'adults', 'children', 'babies',
                   'meal', 'distribution_channel', 'reserved_room_type', 'deposit_type',
                   'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests',
                   'is_canceled']

# Feature columns that are label-encoded to integers below
categorical_columns = ['hotel', 'meal', 'distribution_channel', 'reserved_room_type', 'deposit_type']

# Preprocessing: keep only the relevant columns and replace missing values with 0.
# `fillna` returns a new DataFrame, so the column assignments below write to an
# independent frame instead of a slice of the raw data (no SettingWithCopyWarning),
# and re-running this cell never mutates a previously loaded frame in place.
data = data[columns_to_use].fillna(0)

# Label-encode every categorical column with its OWN encoder.
# Re-fitting a single shared LabelEncoder discards the mapping of each earlier
# column, which makes it impossible to encode new bookings consistently (or to
# inverse-transform predictions' inputs) later on. Keeping one fitted encoder
# per column preserves every mapping; the encoded values are unchanged.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` used to end up fitted on
# 'deposit_type' (the last column encoded). Keep that binding for any cell that
# still references it; new code should use `label_encoders[<column>]`.
label_encoder = label_encoders['deposit_type']

# Separate the target label from the predictor columns
y = data['is_canceled']
X = data.drop(columns=['is_canceled'])

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and train the random forest in one step (`fit` returns the estimator itself)
model = RandomForestClassifier(n_estimators=95, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print results
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

# Save individual tree models
trees_directory = 'models/individual_trees'
os.makedirs(trees_directory, exist_ok=True)

# Remove stale tree files left behind by an earlier run that trained a larger
# forest. Without this, files such as decision_tree_99.pkl from a previous
# 100-tree run would survive next to the current trees, and anything that loads
# every pickle in the directory would mix trees from two different forests.
# Only files named decision_tree_<index>.pkl with index >= the current tree
# count are deleted; every other file in the directory is left untouched.
tree_prefix, tree_suffix = 'decision_tree_', '.pkl'
n_trees = len(model.estimators_)
for existing_name in os.listdir(trees_directory):
    if not (existing_name.startswith(tree_prefix) and existing_name.endswith(tree_suffix)):
        continue
    index_text = existing_name[len(tree_prefix):-len(tree_suffix)]
    if index_text.isdecimal() and int(index_text) >= n_trees:
        os.remove(os.path.join(trees_directory, existing_name))

# Write one pickle per fitted tree, named by its position in the forest
for i, tree in enumerate(model.estimators_):
    tree_filename = os.path.join(trees_directory, f'decision_tree_{i}.pkl')
    joblib.dump(tree, tree_filename)

# Note: only the individual trees are persisted; the RandomForest object itself
# is not saved.
# NOTE(review): the trees are sub-estimators taken from inside the fitted forest;
# confirm how their raw predictions map back to the original class labels before
# using them on their own.


Accuracy: 0.842742273222213
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88     14907
           1       0.83      0.73      0.78      8971

    accuracy                           0.84     23878
   macro avg       0.84      0.82      0.83     23878
weighted avg       0.84      0.84      0.84     23878



In [7]:
# Preview the first five rows of the prepared frame
data.head(n=5)

Unnamed: 0,hotel,lead_time,arrival_date_week_number,adults,children,babies,meal,distribution_channel,reserved_room_type,deposit_type,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,is_canceled
0,1,342,27,2,0.0,0,0,1,2,0,0,0.0,0,0,0
1,1,737,27,2,0.0,0,0,1,2,0,0,0.0,0,0,0
2,1,7,27,1,0.0,0,0,1,0,0,0,75.0,0,0,0
3,1,13,27,1,0.0,0,0,0,0,0,0,75.0,0,0,0
4,1,14,27,2,0.0,0,0,3,0,0,0,98.0,0,1,0


In [8]:
# Collect the names of the feature columns the model was trained on.
# The categorical columns are label-encoded in place (not one-hot encoded),
# so these are the same column names that were selected for training.
feature_names = X_train.columns.tolist()

# Print the feature names
print(feature_names)


['hotel', 'lead_time', 'arrival_date_week_number', 'adults', 'children', 'babies', 'meal', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']
