<a href="https://colab.research.google.com/github/Stevo999/Addressbook/blob/main/randomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install required R packages
install.packages("randomForest")
install.packages("caret")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [33]:

# Load necessary libraries
library(randomForest)
library(caret)


In [34]:
# Load the dataset
data <- read.csv("hotels.csv")

In [35]:
# Ensure that data is a dataframe
data <- as.data.frame(data)

# Data Preprocessing
# Handle missing values
data <- na.omit(data)

In [36]:
# Check for duplicates
duplicates <- duplicated(data)

# If you want to remove duplicates
data <- data[!duplicates, ]


In [37]:
# Feature Engineering (Example: Adding a new feature 'total_people' by summing 'adults', 'children', and 'babies')
data$total_people <- data$adults + data$children + data$babies

In [38]:
# Selecting features (replace feature_names with actual column names)
selected_features <- c("lead_time", "stays_in_weekend_nights", "stays_in_week_nights", "adults", "children", "babies", "adr", "total_of_special_requests")

# Extract selected features and target variable
selected_data <- data[, c(selected_features, "is_canceled")]


In [39]:
# Convert is_canceled to a factor and ensure valid variable names
selected_data$is_canceled <- factor(selected_data$is_canceled)
levels(selected_data$is_canceled) <- make.names(levels(selected_data$is_canceled))

# Split the data into features and target variable
X <- selected_data[, -which(names(selected_data) == "is_canceled")]
y <- selected_data$is_canceled

In [40]:
# Split the data into training and testing sets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]

In [41]:
# Define a training control
ctrl <- trainControl(method = "cv", classProbs = TRUE)

In [44]:
# Parameter Grid for tuning
param_grid <- expand.grid(mtry = c(2, 3, 4), ntree = c(100, 300, 500))

In [45]:
# Model Selection and Training
model <- train(x = X_train, y = y_train, method = "rf", trControl = ctrl, ntree = 100, allowParallel = TRUE)


In [46]:
# Predictions
y_pred <- predict(model, X_test)

In [47]:
# Evaluate the model
accuracy <- confusionMatrix(data = y_pred, reference = y_test)$overall['Accuracy']
print(paste("Accuracy:", accuracy))

[1] "Accuracy: 0.753575924018766"


In [49]:

# Save the trained model
saveRDS(model, "random_forest_model.rds")