In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Read the bookings data into a DataFrame (point csv_path at your own CSV file)
csv_path = "hotel_bookings_clean.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first 100 rows
print(df.shape)
df.head(100)

(1500, 16)


Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,is_repeated_guest,previous_cancellations,total_of_special_requests,avg_daily_rate,booked_by_company,booked_by_agent,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,68,14,2,3,2,0,0,1,130.90,0,1,0,0,0,1
1,0,152,14,1,4,1,0,0,0,42.00,1,0,0,0,0,1
2,0,11,49,0,3,1,0,0,0,36.00,1,0,0,0,0,1
3,1,6,27,0,1,2,0,0,0,139.00,0,1,0,0,1,0
4,1,335,38,0,1,2,0,1,0,85.00,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,46,11,0,1,1,0,0,0,25.00,0,1,0,0,0,1
96,0,100,40,0,4,2,0,0,1,140.00,0,1,0,0,0,1
97,0,39,50,2,8,3,0,0,2,130.90,0,1,0,0,1,0
98,0,243,39,2,2,1,0,0,0,80.70,0,1,0,0,0,1


In [None]:
# Count the missing values in every column
missing_per_column = df.isnull().sum()
missing_per_column

Unnamed: 0,0
is_canceled,0
lead_time,0
arrival_date_week_number,0
stays_in_weekend_nights,0
stays_in_week_nights,0
adults,0
is_repeated_guest,0
previous_cancellations,0
total_of_special_requests,0
avg_daily_rate,0


In [None]:
# Inspect the column names, non-null counts and dtypes to confirm that the
# DataFrame contains the feature column(s) and a target variable
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   is_canceled                    1500 non-null   int64  
 1   lead_time                      1500 non-null   int64  
 2   arrival_date_week_number       1500 non-null   int64  
 3   stays_in_weekend_nights        1500 non-null   int64  
 4   stays_in_week_nights           1500 non-null   int64  
 5   adults                         1500 non-null   int64  
 6   is_repeated_guest              1500 non-null   int64  
 7   previous_cancellations         1500 non-null   int64  
 8   total_of_special_requests      1500 non-null   int64  
 9   avg_daily_rate                 1500 non-null   float64
 10  booked_by_company              1500 non-null   int64  
 11  booked_by_agent                1500 non-null   int64  
 12  customer_type_Contract         1500 non-null   i

In [None]:
# Split the data into two DataFrames: X (features) and y (target variable).
# Columns are selected by name rather than by position, so the target cannot
# slip into the features if the CSV's column order changes or an extra
# leading column (e.g. a saved index) appears.
target_column = "is_canceled"
y = df[target_column]  # Specify one column as the target variable
X = df.drop(columns=target_column)  # Every remaining column is a feature

# Split the data into train and test subsets
# You can adjust the test size and random state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=123
)

# Show the resulting shapes as a quick sanity check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1050, 15), (450, 15), (1050,), (450,))

In [None]:
# Define parameters: these will need to be tuned to prevent overfitting and underfitting
params = {
    "kernel": "linear",  # Kernel type: 'linear', 'poly', 'rbf', 'sigmoid', or 'precomputed'
    "C": 1,  # Regularization parameter, squared l2 penalty
    "gamma": 0.01,  # Kernel coefficient (a float, 'scale', or 'auto') for 'rbf', 'poly' and 'sigmoid'
    "degree": 3,  # Degree of the 'poly' kernel function
    "random_state": 123,
}

# SVMs are not scale invariant, and the features here live on very different
# scales (e.g. lead_time in the hundreds next to 0/1 indicator columns).
# Standardize inside a pipeline: the scaler is fitted on the training data only
# and the same transformation is re-applied to whatever is passed to predict().
clf = make_pipeline(StandardScaler(), svm.SVC(**params))

# Train the scaler + SVM classifier on the train set (fit() works in place)
clf.fit(X_train, y_train)

# Predict the outcomes on the test set
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test-set labels that the classifier predicted correctly
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.7488888888888889


In [None]:
# Define a parameter grid with the candidate values to sample from.
# The "svc__" prefix routes each parameter to the SVC step of the pipeline below.
rs_param_grid = {
    "svc__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "svc__C": [0.1, 1, 10],
    "svc__gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1],  # Not used by the 'linear' kernel
}

# Create a scaler + svm.SVC pipeline. SVMs are not scale invariant, so the
# features are standardized first; because the scaler is part of the estimator,
# it is re-fitted on the training portion of every CV fold and never sees the
# held-out portion.
clf = make_pipeline(StandardScaler(), svm.SVC(random_state=123))

# Instantiate RandomizedSearchCV() with clf and the parameter grid
clf_rs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=rs_param_grid,
    cv=3,  # Number of folds
    n_iter=5,  # Number of parameter candidate settings to sample
    verbose=2,  # The higher this is, the more messages are output
    random_state=123,
)

# Train the model on the training set
clf_rs.fit(X_train, y_train)

# Print the best parameters and the highest mean cross-validated accuracy
print("Best parameters found: ", clf_rs.best_params_)
print("Best accuracy found: ", clf_rs.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ...................C=1, gamma=0.0001, kernel=linear; total time=   2.6s
[CV] END ...................C=1, gamma=0.0001, kernel=linear; total time=   2.6s
[CV] END ...................C=1, gamma=0.0001, kernel=linear; total time=   5.8s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time=   0.0s
[CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time=   0.0s
[CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time=   0.0s
[CV] END ..................C=0.1, gamma=0.01, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time=   0.0s
[CV] END ...................C=1, gamma=0.001, ker