In [1]:
import pandas as pd
import numpy as np
from src.eda import add_engineered_features
from src.model_training import (preprocess_booking_data, split_data_train_val_test,
evaluate_model_with_threshold, evaluate_model, train_model)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
def evaluate_model_metrics(y, y_pred):
    metrics = {
            "precision": precision_score(y, y_pred, zero_division=0),
            "recall": recall_score(y, y_pred, zero_division=0),
            "f1_score": f1_score(y, y_pred, zero_division=0),
            "accuracy": accuracy_score(y, y_pred)
        }
    return metrics

In [3]:
# Fix random seed for reproducibility
RANDOM_SEED = 101
np.random.seed(RANDOM_SEED)

In [4]:
original_df = pd.read_csv("data/dataset.csv")
engineered_df = add_engineered_features(original_df)
processed_df, features_dict = preprocess_booking_data(engineered_df)
features_names = processed_df.columns.values[:-1].tolist()

In [5]:
# Split the data with stratisfy (all ys will have 7:3 ratio to represent the original data.)
X_train, X_val, X_test, y_train, y_val, y_test = split_data_train_val_test(processed_df, random_state=RANDOM_SEED)

In [6]:
models_training_results = []

In [7]:
default_models = {
    "decision_tree_default":  DecisionTreeClassifier(),
    "random_forest_default": RandomForestClassifier(),
    "gdbt_default": GradientBoostingClassifier(),
}

for model_name, clf in default_models.items():
    _, train_metrics, val_metrics = train_model(clf, X_train, y_train, X_val, y_val)
    train_metrics["model_name"] = model_name
    train_metrics["split"] = "training"
    
    val_metrics["model_name"] = model_name
    val_metrics["split"] = "validation"
    
    models_training_results.extend([val_metrics, train_metrics])

In [8]:
always_ones_train = evaluate_model_metrics(y_train, np.ones_like(y_train))
always_ones_train["model_name"] = "always_ones"
always_ones_train["split"] = "training"

always_zeros_train = evaluate_model_metrics(y_train, np.zeros_like(y_train))
always_zeros_train["model_name"] = "always_zeros"
always_zeros_train["split"] = "training"

always_ones_val = evaluate_model_metrics(y_val, np.ones_like(y_val))
always_ones_val["model_name"] = "always_ones"
always_ones_val["split"] = "validation"

always_zeros_val = evaluate_model_metrics(y_val, np.zeros_like(y_val))
always_zeros_val["model_name"] = "always_zeros"
always_zeros_val["split"] = "validation"

In [9]:
models_training_results.extend([always_ones_train, always_zeros_train,
                                always_ones_val, always_zeros_val])

In [10]:
fine_tuned_models = {
    "decision_tree_pruned":  DecisionTreeClassifier(**{'ccp_alpha': 0,
 'max_depth': 15,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5}),
    "random_forest_pruned": RandomForestClassifier(**{'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}),
    "decision_tree_pruned_ccp":  DecisionTreeClassifier(ccp_alpha=0.015)
}

for model_name, clf in fine_tuned_models.items():
    _, train_metrics, val_metrics = train_model(clf, X_train, y_train, X_val, y_val)
    train_metrics["model_name"] = model_name
    train_metrics["split"] = "training"
    
    val_metrics["model_name"] = model_name
    val_metrics["split"] = "validation"
    
    models_training_results.extend([val_metrics, train_metrics])

In [11]:
dd = pd.DataFrame(models_training_results).sort_values(by=["model_name", "split"])

In [19]:
dd[dd.split=="training"].sort_values(by="f1_score", ascending=False)

Unnamed: 0,precision,recall,f1_score,accuracy,model_name,split
3,0.979035,0.966314,0.972633,0.982182,random_forest_default,training
1,0.985665,0.959577,0.972446,0.982182,decision_tree_default,training
13,0.935714,0.882579,0.90837,0.941659,random_forest_pruned,training
11,0.836099,0.800289,0.817802,0.88316,decision_tree_pruned,training
5,0.778307,0.707892,0.741431,0.838221,gdbt_default,training
15,0.621685,0.767084,0.686773,0.770735,decision_tree_pruned_ccp,training
6,0.327657,1.0,0.493587,0.327657,always_ones,training
7,0.0,0.0,0.0,0.672343,always_zeros,training


In [20]:
dd[dd.split=="validation"].sort_values(by="f1_score", ascending=False)

Unnamed: 0,precision,recall,f1_score,accuracy,model_name,split
12,0.796491,0.76431,0.780069,0.85872,random_forest_pruned,validation
4,0.75,0.737374,0.743633,0.833333,gdbt_default,validation
10,0.749141,0.734007,0.741497,0.83223,decision_tree_pruned,validation
2,0.747368,0.717172,0.731959,0.827815,random_forest_default,validation
0,0.665672,0.750842,0.705696,0.794702,decision_tree_default,validation
14,0.618421,0.791246,0.694239,0.771523,decision_tree_pruned_ccp,validation
8,0.327815,1.0,0.493766,0.327815,always_ones,validation
9,0.0,0.0,0.0,0.672185,always_zeros,validation


In [30]:
def find_best_threshold(model, X, y, metric="recall", thresholds=np.arange(0.0, 1.05, 0.05)):
    best_threshold = 0.5
    best_metric_value = 0
    metric_values = []

    for threshold in thresholds:
        metrics = evaluate_model_with_threshold(model, X, y, threshold=threshold)
        metric_value = metrics[metric]
        metric_values.append((threshold, metric_value))

        if metric_value > best_metric_value:
            best_metric_value = metric_value
            best_threshold = threshold

    # Print the results and return the best threshold
    print(f"Best threshold for {metric}: {best_threshold} with score {best_metric_value}")
    return best_threshold, metric_values



In [31]:
best_threshold, metric_values = find_best_threshold(fine_tuned_models["decision_tree_pruned"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.4 with score 0.750788643533123


In [32]:
best_threshold, metric_values = find_best_threshold(fine_tuned_models["decision_tree_pruned_ccp"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.15000000000000002 with score 0.6942392909896603


In [34]:
best_threshold, metric_values = find_best_threshold(fine_tuned_models["random_forest_pruned"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.4 with score 0.7853736089030207


In [35]:
best_threshold, metric_values = find_best_threshold(default_models["gdbt_default"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.35000000000000003 with score 0.7554179566563467


In [36]:
best_threshold, metric_values = find_best_threshold(default_models["random_forest_default"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.45 with score 0.7446457990115322


In [37]:
best_threshold, metric_values = find_best_threshold(default_models["decision_tree_default"], X_val, y_val, metric="recall")


Best threshold for f1_score: 0.55 with score 0.7056962025316456


In [46]:
def evaluate_model_with_threshold_params(model_name, model, X, y, training_split="default", metric="recall",
                                        threshold=None):
    if not threshold:
        threshold, _ = find_best_threshold(model, X, y, metric=metric)
    metrics = evaluate_model_with_threshold(model, X, y, threshold=threshold)
    metrics["model_name"] = model_name
    metrics["split"] = training_split
    return metrics
    

In [41]:
customized_threshold_results = [
    evaluate_model_with_threshold_params("decision_tree_pruned",
                                         fine_tuned_models["decision_tree_pruned"],
                                        X_val, y_val),
    evaluate_model_with_threshold_params("decision_tree_pruned_ccp",
                                         fine_tuned_models["decision_tree_pruned_ccp"],
                                        X_val, y_val),
    evaluate_model_with_threshold_params("random_forest_pruned",
                                         fine_tuned_models["random_forest_pruned"],
                                        X_val, y_val),
    evaluate_model_with_threshold_params("gdbt_default",
                                         default_models["gdbt_default"],
                                        X_val, y_val)]

Best threshold for f1_score: 0.4 with score 0.750788643533123
Best threshold for f1_score: 0.15000000000000002 with score 0.6942392909896603
Best threshold for f1_score: 0.4 with score 0.7853736089030207
Best threshold for f1_score: 0.35000000000000003 with score 0.7554179566563467


In [43]:
pd.DataFrame(customized_threshold_results).sort_values(by="recall", ascending=False)

Unnamed: 0,precision,recall,f1_score,accuracy,model_name,split
2,0.743976,0.83165,0.785374,0.850993,random_forest_pruned,default
3,0.69914,0.821549,0.755418,0.825607,gdbt_default,default
0,0.706231,0.801347,0.750789,0.825607,decision_tree_pruned,default
1,0.618421,0.791246,0.694239,0.771523,decision_tree_pruned_ccp,default


In [44]:
customized_threshold_results_test = [
    evaluate_model_with_threshold_params("decision_tree_pruned",
                                         fine_tuned_models["decision_tree_pruned"],
                                        X_test, y_test),
    evaluate_model_with_threshold_params("decision_tree_pruned_ccp",
                                         fine_tuned_models["decision_tree_pruned_ccp"],
                                        X_test, y_test),
    evaluate_model_with_threshold_params("random_forest_pruned",
                                         fine_tuned_models["random_forest_pruned"],
                                        X_test, y_test),
    evaluate_model_with_threshold_params("gdbt_default",
                                         default_models["gdbt_default"],
                                        X_test, y_test)]

Best threshold for f1_score: 0.4 with score 0.7225806451612903
Best threshold for f1_score: 0.15000000000000002 with score 0.658321060382916
Best threshold for f1_score: 0.4 with score 0.7516233766233766
Best threshold for f1_score: 0.45 with score 0.7197346600331676


In [45]:
pd.DataFrame(customized_threshold_results_test).sort_values(by="recall", ascending=False)

Unnamed: 0,precision,recall,f1_score,accuracy,model_name,split
2,0.725705,0.779461,0.751623,0.831219,random_forest_pruned,default
0,0.693498,0.754209,0.722581,0.810259,decision_tree_pruned,default
3,0.70915,0.73064,0.719735,0.813569,gdbt_default,default
1,0.585079,0.752525,0.658321,0.744071,decision_tree_pruned_ccp,default


In [47]:
default_threshold_results_test = [
    evaluate_model_with_threshold_params("decision_tree_pruned",
                                         fine_tuned_models["decision_tree_pruned"],
                                        X_test, y_test,
                                        threshold=0.5),
    evaluate_model_with_threshold_params("decision_tree_pruned_ccp",
                                         fine_tuned_models["decision_tree_pruned_ccp"],
                                        X_test, y_test,
                                        threshold=0.5),
    evaluate_model_with_threshold_params("random_forest_pruned",
                                         fine_tuned_models["random_forest_pruned"],
                                        X_test, y_test,
                                        threshold=0.5),
    evaluate_model_with_threshold_params("gdbt_default",
                                         default_models["gdbt_default"],
                                        X_test, y_test,
                                        threshold=0.5)]

In [48]:
pd.DataFrame(default_threshold_results_test).sort_values(by="recall", ascending=False)

Unnamed: 0,precision,recall,f1_score,accuracy,model_name,split
2,0.752182,0.725589,0.738646,0.831771,random_forest_pruned,default
0,0.699839,0.734007,0.716516,0.809708,decision_tree_pruned,default
3,0.728242,0.690236,0.708729,0.81412,gdbt_default,default
1,0.585079,0.752525,0.658321,0.744071,decision_tree_pruned_ccp,default


In [50]:
for w, f in zip(fine_tuned_models["random_forest_pruned"].feature_importances_, features_names):
    print(f, w)

is_repeated_guest 0.004399626288572245
has_car 0.007026432072495347
has_special_requests 0.0895267960290642
lead_time 0.5075511683898182
has_children 0.010362869934147622
normalized_market_segment_value 0.09078046859830143
year_quarter 0.032415483745161865
arrival_month 0.10364181442195698
total_nights 0.1035063131483214
total_guests 0.05078902737216073


In [55]:
sorted(zip(fine_tuned_models["random_forest_pruned"].feature_importances_, features_names), reverse=True)

[(np.float64(0.5075511683898182), 'lead_time'),
 (np.float64(0.10364181442195698), 'arrival_month'),
 (np.float64(0.1035063131483214), 'total_nights'),
 (np.float64(0.09078046859830143), 'normalized_market_segment_value'),
 (np.float64(0.0895267960290642), 'has_special_requests'),
 (np.float64(0.05078902737216073), 'total_guests'),
 (np.float64(0.032415483745161865), 'year_quarter'),
 (np.float64(0.010362869934147622), 'has_children'),
 (np.float64(0.007026432072495347), 'has_car'),
 (np.float64(0.004399626288572245), 'is_repeated_guest')]

In [56]:
sorted(zip(fine_tuned_models["decision_tree_pruned"].feature_importances_, features_names), reverse=True)

[(np.float64(0.5044095631853523), 'lead_time'),
 (np.float64(0.16824102928149381), 'has_special_requests'),
 (np.float64(0.08204322702976986), 'normalized_market_segment_value'),
 (np.float64(0.08123451420459604), 'arrival_month'),
 (np.float64(0.06760344965773499), 'total_nights'),
 (np.float64(0.046719066914971394), 'total_guests'),
 (np.float64(0.02619086786355091), 'year_quarter'),
 (np.float64(0.013305162298911525), 'has_children'),
 (np.float64(0.008199353051346779), 'has_car'),
 (np.float64(0.0020537665122723524), 'is_repeated_guest')]

In [13]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "max_depth": [5, 10, 15, 20, 25,None],
#     "min_samples_split": [2, 5, 10, 30],
#     "min_samples_leaf": [1, 5, 10, 15],
#     "max_features": ["sqrt", "log2"],
#     "ccp_alpha": [0.01, 0.001, 0.015, 0]
# }

# dt_clf = DecisionTreeClassifier(random_state=RANDOM_SEED)

# grid_search = GridSearchCV(dt_clf, param_grid, cv=5, scoring='recall', n_jobs=-1, verbose=1)
# grid_search.fit(X_train, y_train)

# best_dt_model = grid_search.best_estimator_
# best_params = grid_search.best_params_
# print(best_params, grid_search.best_score_)

In [14]:
decision_tree_params = {'ccp_alpha': 0,
 'max_depth': 15,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5}

In [15]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "n_estimators": [10, 50, 100, 200, 500],
#     "max_depth": [5, 10, 15, 20, 25, None],
#     "min_samples_split": [2, 5, 10, 30],
#     "min_samples_leaf": [1, 5, 10, 15],
#     "max_features": ["sqrt", "log2"],
#     "bootstrap": [True, False],
# }

# rf_clf = RandomForestClassifier(random_state=RANDOM_SEED)

# grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring='recall', n_jobs=-1, verbose=1)
# grid_search.fit(X_train, y_train)

# best_rf_model = grid_search.best_estimator_
# best_params = grid_search.best_params_
# print("Best Parameters:", best_params)
# print("Best Score:", grid_search.best_score_)

In [16]:
random_forest_params = {'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}