In [54]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer  
from sklearn.model_selection import KFold   
from statistics import mean
import joblib
from ydata_profiling import ProfileReport


# Gain data insight

In [None]:
# Load the raw Airbnb listings. A forward-slash path is used because it resolves on
# Windows, Linux and macOS alike; the former backslash separator only worked on Windows.
raw_data = pd.read_csv('datasets/airbnb.csv')



# In[3]: STEP 3. DISCOVER THE DATA TO GAIN INSIGHTS
#region
# 3.1 Quick view of the data
print('\n____________ Dataset info ____________')
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that printed a stray "None" after the report).
raw_data.info()
print('\n____________ Some first data examples ____________')
print(raw_data.head(3))
print('\n____________ Counts on a feature ____________')
# Count listings per city (the previous, commented-out column 'LEGAL DOCUMENTS'
# left this header without any output).
print(raw_data['city'].value_counts())
print('\n____________ Statistics of numeric features ____________')
print(raw_data.describe())
print('\n____________ Get specific rows and cols ____________')
print(raw_data.iloc[[0,5,48], [2, 5]] ) # Refer using column ID


import seaborn as sns

# Price brackets used only to visualise how realSum is distributed.
labels = ['0-100', '100-200', '200-300', '300-400', '400-500', '500-1000', '1000-1500', '1500-2000', '2000-2500', '2500+']
bins = [0, 100, 200, 300, 400, 500, 1000, 1500, 2000, 2500, np.inf]

# Assign every listing to its bracket; include_lowest keeps a value equal to the
# first edge (0) inside the first bracket.
binned_data = pd.cut(
    raw_data['realSum'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

## Number of listings per price bracket
plt.figure(figsize=(12, 6))
sns.countplot(x=binned_data, order=labels)
plt.xlabel('realSum Range')
plt.ylabel('Frequency')
plt.title('Distribution of realSum ')
# Write the figure to disk before displaying it
plt.savefig('figures/realSum_distribution.png')
plt.show()






## Mean realSum of each city
# Average realSum per city, ordered from the lowest to the highest mean
mean_realSum_by_city = (
    raw_data
    .groupby('city')['realSum']
    .mean()
    .sort_values()
)

# Bar chart of the per-city averages
plt.figure(figsize=(12, 8))
mean_realSum_by_city.plot.bar(color='skyblue')

# Title, axis labels and cosmetics
plt.title('Mean realSum by City')
plt.xlabel('City')
plt.ylabel('Mean realSum')
plt.xticks(rotation=45)  # slanted city names are easier to read
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()



In [None]:
"""


# Import the ProfileReport class from the ydata_profiling package
from ydata_profiling import ProfileReport

# Create a ProfileReport object to generate an exploratory data analysis report
report = ProfileReport(
    raw_data,  # The DataFrame containing the raw data to be analyzed
    title="Airbnb prices in European cities:",  # Title for the report

    # Custom descriptions for variables in the dataset to provide context in the report
    variables={
        "description": {
            "city": "The city where the accommodation offer is located.",  # Description for the 'city' column
            "realSum": "The total price in EUR for accommodating two people for two nights.",  # Description for the 'realSum' column
            "room_type": "The type of accommodation being offered (e.g., entire place, private room, shared room).",  # Description for the 'room_type' column
            "room_shared": "Binary variable indicating if the room is shared (1 for shared, 0 for not).",  # Description for the 'room_shared' column
            "room_private": "Binary variable indicating if the room is private (1 for private, 0 for not).",  # Description for the 'room_private' column
            "person_capacity": "The maximum number of guests that the accommodation can host.",  # Description for the 'person_capacity' column
            "host_is_superhost": "Binary variable indicating if the host is a superhost (1 for superhost, 0 for not).",  # Description for the 'host_is_superhost' column
            "multi": "Binary variable indicating if the listing is managed by a host with 2-4 offers (1 for yes, 0 for no).",  # Description for the 'multi' column
            "biz": "Binary variable indicating if the listing is managed by a host with more than 4 offers (1 for yes, 0 for no).",  # Description for the 'biz' column
            "cleanliness_rating": "Rating of the cleanliness of the accommodation, usually on a scale (e.g., 1 to 10).",  # Description for the 'cleanliness_rating' column
            "guest_satisfaction_overall": "Overall rating given by guests for the listing, typically on a scale (e.g., 1 to 10).",  # Description for the 'guest_satisfaction_overall' column
            "bedrooms": "Number of bedrooms in the accommodation (0 for studios).",  # Description for the 'bedrooms' column
            "dist": "Distance from the city center in kilometers.",  # Description for the 'dist' column
            "metro_dist": "Distance from the nearest metro station in kilometers.",  # Description for the 'metro_dist' column
            "attr_index": "Index indicating the attractiveness of the listing location based on nearby attractions.",  # Description for the 'attr_index' column
            "attr_index_norm": "Normalized attractiveness index (scaled between 0 and 100).",  # Description for the 'attr_index_norm' column
            "rest_index": "Index indicating the restaurant options available near the listing location.",  # Description for the 'rest_index' column
            "rest_index_norm": "Normalized restaurant index (scaled between 0 and 100).",  # Description for the 'rest_index_norm' column
            "lng": "Longitude coordinate of the listing location.",  # Description for the 'lng' column
            "lat": "Latitude coordinate of the listing location."  # Description for the 'lat' column
        }
    }
)

# Display the report in a notebook if possible
report.to_notebook_iframe()
"""



# Correlation

In [None]:
# Pairwise correlations between the numeric columns only
corr_matrix = raw_data.corr(numeric_only=True)

# Keep only the realSum column, as a one-column frame (heatmap needs 2-D input)
corr_realSum = corr_matrix[['realSum']]

# Row labels ordered from the highest to the lowest correlation with realSum
sorted_columns = corr_realSum.sort_values(
    by='realSum',
    ascending=False,
).index
sorted_corr_realSum = corr_realSum.loc[sorted_columns]

# Heatmap on a fixed [-1, 1] colour scale centred at 0
plt.figure(figsize=(6, 4))
sns.heatmap(
    sorted_corr_realSum,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title('Correlation with realSum')
plt.show()

# Prepare data

## Remove outliers

In [None]:
# Drop unnecessary columns. drop() returns a new frame here (no inplace=True), so
# the frame produced by the loading cell is not mutated behind the reader's back.
raw_data = raw_data.drop(columns=["attr_index", "rest_index", "_id"])

# Calculate Q1 (25th percentile) and Q3 (75th percentile) of the target
Q1 = raw_data["realSum"].quantile(0.25)
Q3 = raw_data["realSum"].quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Lower and upper fences (1.5 * IQR rule) used to identify outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose realSum lies inside the fences (both ends inclusive).
# NOTE: this runs on the whole dataset, i.e. before the train/test split.
# .copy() makes the result an independent frame, so columns can be assigned to it
# later without pandas raising SettingWithCopyWarning.
raw_data = raw_data[raw_data["realSum"].between(lower_bound, upper_bound)].copy()


# Re-bin the filtered prices; `bins` and `labels` are the ones defined for the
# first distribution plot.
binned_data = pd.cut(raw_data['realSum'], bins=bins, labels=labels, include_lowest=True)

# Plot the distribution of the binned realSum
plt.figure(figsize=(12, 6))
sns.countplot(x=binned_data, order=labels)
plt.title('Distribution of realSum (After remove outlier)')
plt.xlabel('realSum Range')
plt.ylabel('Frequency')

# Save the plot to a file, then display it like the first distribution plot
plt.savefig('figures/realSum_distribution_after.png')
plt.show()


## Split data

In [59]:

from sklearn.model_selection import StratifiedShuffleSplit



method = 2
if method == 1: # Method 1: Randomly select 20% of data for test set. Used when data set is large
    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=42)
else:  # Method 2: stratified split, keeping the price-range proportions in both parts
    # Brackets used only for stratification. They get their own names so they do
    # not overwrite the `bins` / `labels` used by the distribution plots.
    strat_bins = [0, 100, 200, 300, 400, 500, np.inf]
    strat_labels = ['0-100', '100-200', '200-300', '300-400', '400-500', '500+']

    # Stratification key kept as a separate Series: raw_data itself is not given
    # a helper column, so nothing has to be dropped again afterwards.
    price_range = pd.cut(raw_data["realSum"], bins=strat_bins, labels=strat_labels, include_lowest=True)

    # Single stratified 80/20 split, seeded so it can be repeated
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # n_splits=1 -> the generator yields exactly one (train, test) pair of positions
    train_index, test_index = next(splitter.split(raw_data, price_range))
    train_set = raw_data.iloc[train_index].copy()
    test_set = raw_data.iloc[test_index].copy()
    












## Data after split

In [None]:
# Separate the target column (realSum) from the features of the training part
train_set_labels = train_set["realSum"].copy()
train_set = train_set.drop("realSum", axis=1)

# Same separation for the test part
test_set_labels = test_set["realSum"].copy()
test_set = test_set.drop("realSum", axis=1)

print('\n____________ Split training and test set ____________')
n_train, n_test = len(train_set), len(test_set)
print(n_train, "training +", n_test, "test examples")
print(train_set.head(4))


# Define pipeline

## Column selector

In [48]:
#%% 4.4 Define pipelines for processing data. 
# INFO: Pipeline is a sequence of transformers (see Geron 2019, page 73). For step-by-step manipulation, see Details_toPipeline.py 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# 4.4.1 Define ColumnSelector: a transformer for choosing columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns out of a dataframe.

    feature_names: column label(s) used to index the dataframe in transform().
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, dataframe, labels=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, dataframe):
        """Return the selected columns of ``dataframe`` as its ``.values`` array."""
        selected = dataframe[self.feature_names]
        return selected.values

## Categories

In [None]:
       


# Columns that are neither numeric nor boolean are treated as categorical
cat_feat_names = train_set.select_dtypes(exclude=[np.number, bool]).columns.tolist()

print("Categorical features:", cat_feat_names)


# Categorical pipeline: select columns -> fill gaps with "NO INFO" -> one-hot encode
cat_selector = ColumnSelector(cat_feat_names)
cat_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value="NO INFO",
    copy=True,
)
# handle_unknown='ignore': categories not seen during fit are encoded as all zeros
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

cat_pipeline = Pipeline(steps=[
    ('selector', cat_selector),
    ('imputer', cat_imputer),
    ('cat_encoder', cat_encoder),
])

## Numerical


  

In [None]:

# Numeric columns of the training features
num_feat_names = train_set.select_dtypes(include=[np.number]).columns.tolist()

print("Numerical features:", num_feat_names)

# Numerical pipeline: select columns -> fill gaps with the median -> standardise
num_selector = ColumnSelector(num_feat_names)
num_imputer = SimpleImputer(missing_values=np.nan, strategy="median", copy=True)
num_scaler = StandardScaler(with_mean=True, with_std=True, copy=True)

num_pipeline = Pipeline(steps=[
    ('selector', num_selector),
    ('imputer', num_imputer),
    ('std_scaler', num_scaler),
])








## Boolean

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Boolean-typed columns of the training features
bool_feat_names = train_set.select_dtypes(include=[bool]).columns.tolist()
print("Boolean features:", bool_feat_names)
def boolean_to_binary(X):
    """Cast ``X`` to integers with ``astype(int)`` (True becomes 1, False becomes 0)."""
    as_int = X.astype(int)
    return as_int
# Boolean pipeline: select columns -> cast True/False to 1/0 -> impute with the mode
bool_selector = ColumnSelector(bool_feat_names)
# validate=False: the array is handed to boolean_to_binary without input checks
bool_caster = FunctionTransformer(boolean_to_binary, validate=False)
bool_imputer = SimpleImputer(strategy='most_frequent')

bool_pipeline = Pipeline(steps=[
    ('selector', bool_selector),
    ('to_binary', bool_caster),
    ('imputer', bool_imputer),
])

## Complete Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
# Combine the pipelines
# Combine the three per-type pipelines; each one receives only its own columns
full_pipeline = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_feat_names),
    ("cat_pipeline", cat_pipeline, cat_feat_names),
    ("bool_pipeline", bool_pipeline, bool_feat_names)
])


# Fit the preprocessing on the training features only and keep the fitted object
# on disk so the test set can later be transformed in exactly the same way.
processed_train_set_val = full_pipeline.fit_transform(train_set)
joblib.dump(full_pipeline, r'models/full_pipeline.pkl')

# Collect feature names in the same order as the transformers listed above:
# numeric columns, then the one-hot columns, then the boolean columns.
num_feature_names = num_feat_names
cat_feature_names = full_pipeline.named_transformers_['cat_pipeline'].named_steps['cat_encoder'].get_feature_names_out(cat_feat_names)
bool_feature_names = bool_feat_names

# Combine all feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names) + list(bool_feature_names)

# Convert to DataFrame
transformed_df = pd.DataFrame(processed_train_set_val, columns=all_feature_names)

# View the info and the first few rows
print('\n____________ Processed feature values ____________')
# DataFrame.info() prints its report itself and returns None, so it is not wrapped
# in print() (that emitted a stray "None" line).
transformed_df.info()
print(transformed_df.head())

print("Shape of the processed data:", transformed_df.shape)


# Train model

## Store model

In [42]:

def store_model(model, model_name = ""):
    """Dump ``model`` with joblib to ``models/<model_name>_model.pkl``.

    model: object to persist (one object per file).
    model_name: file-name prefix; when left as the empty string, the class name
        of ``model`` is used instead.
    """
    if model_name == "":
        model_name = type(model).__name__
    target_path = 'models/' + model_name + '_model.pkl'
    joblib.dump(model, target_path)
def load_model(model_name):
    """Load and return the object stored in ``models/<model_name>_model.pkl``.

    model_name: the file-name prefix the object was stored under.
    """
    source_path = 'models/' + model_name + '_model.pkl'
    return joblib.load(source_path)

## R2 Score and RMSE

In [50]:
# 5.1.2 Compute R2 score and root mean squared error
def r2score_and_rmse(model, train_data, labels):
    """Return ``(r2score, rmse)`` of ``model`` on the given data.

    model: fitted estimator exposing ``predict``.
    train_data: feature matrix handed to ``model.predict``.
    labels: true target values matching ``train_data``.

    The model predicts only once and both metrics are derived from that single
    prediction. Previously ``model.score`` was called as well, which for
    scikit-learn regressors computes R^2 from its own, second prediction pass.
    """
    from sklearn.metrics import mean_squared_error, r2_score
    prediction = model.predict(train_data)
    r2score = r2_score(labels, prediction)
    mse = mean_squared_error(labels, prediction)
    rmse = np.sqrt(mse)
    return r2score, rmse

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the processed training features
model = LinearRegression()
model.fit(processed_train_set_val, train_set_labels)
print('\n____________ LinearRegression ____________')
print('Learned parameters: ', model.coef_, model.intercept_)

# R2 and RMSE measured on the same data the model was trained on
r2score, rmse = r2score_and_rmse(model, processed_train_set_val, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))

# Compare predictions with the true labels for the first nine training rows
print("\nInput data: \n", train_set.iloc[0:9])
sample_predictions = model.predict(processed_train_set_val[0:9])
print("\nPredictions: ", sample_predictions.round(decimals=1))
print("Labels:      ", list(train_set_labels[0:9]))

# Persist under the class name (LinearRegression)
store_model(model)




## Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_feat_adder = PolynomialFeatures(degree = 2) # add high-degree features to the data
train_set_poly_added = poly_feat_adder.fit_transform(processed_train_set_val)
new_training = 10  # not referenced anywhere else in the visible notebook

# Linear model fitted on the polynomially expanded features
model = LinearRegression()
model.fit(train_set_poly_added, train_set_labels)


#  Compute R2 score and root mean squared error
print('\n____________ Polinomial regression ____________')
r2score, rmse = r2score_and_rmse(model, train_set_poly_added, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))
# Predict labels for some training instances
print("\nPredictions: ", model.predict(train_set_poly_added[0:9]).round(decimals=1))
print("Labels:      ", list(train_set_labels[0:9]))

# An explicit name is required: the estimator's class is LinearRegression, so the
# default file name would overwrite the plain linear-regression model stored by
# the Linear Regression cell. NOTE: poly_feat_adder is not stored with it; the
# loaded model needs inputs expanded the same way.
store_model(model, "PolynomialRegression")

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# n_estimators: no. of trees. random_state pins the forest's randomness so the
# scores below are the same on every run.
model = RandomForestRegressor(n_estimators = 100, random_state = 42)
model.fit(processed_train_set_val, train_set_labels)


# Compute R2 score and root mean squared error
print('\n____________ RandomForestRegressor ____________')
r2score, rmse = r2score_and_rmse(model, processed_train_set_val, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))

# Predict labels for some training instances
#print("Input data: \n", train_set.iloc[0:9])
print("\nPredictions: ", model.predict(processed_train_set_val[0:9]).round(decimals=1))
print("Labels:      ", list(train_set_labels[0:9]))

# Persist under the class name (RandomForestRegressor)
store_model(model)

## Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 boosting stages, fitted on the processed training features
model = GradientBoostingRegressor(n_estimators=100)
model.fit(processed_train_set_val, train_set_labels)

# R2 and RMSE measured on the training data itself
print('\n____________ GradientBoostingRegressor ____________')
r2score, rmse = r2score_and_rmse(model, processed_train_set_val, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))

# Predictions versus true labels for the first nine training rows
sample_predictions = model.predict(processed_train_set_val[0:9])
print("\nPredictions: ", sample_predictions.round(decimals=1))
print("Labels:      ", list(train_set_labels[0:9]))

# Persist under the class name (GradientBoostingRegressor)
store_model(model)

## ExtraTrees Regressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Initialize and fit the model.
# n_estimators: no. of trees. random_state pins the randomised tree building so
# the scores below are the same on every run.
model = ExtraTreesRegressor(n_estimators=100, random_state=42)
model.fit(processed_train_set_val, train_set_labels)

# Compute R2 score and root mean squared error
print('\n____________ ExtraTreesRegressor ____________')
r2score, rmse = r2score_and_rmse(model, processed_train_set_val, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))

# Predict labels for some training instances
print("\nPredictions: ", model.predict(processed_train_set_val[0:9]).round(decimals=1))
print("Labels:      ", list(train_set_labels[0:9]))

# Store the model under the class name (ExtraTreesRegressor)
store_model(model)


## K-Nearest Neighbors Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression using the 5 closest training points
model = KNeighborsRegressor(n_neighbors=5)
model.fit(processed_train_set_val, train_set_labels)

# R2 and RMSE measured on the training data itself
print('\n____________ KNeighborsRegressor ____________')
r2score, rmse = r2score_and_rmse(model, processed_train_set_val, train_set_labels)
print('\nR2 score (on training data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))

# Predictions (left unrounded here) versus true labels for the first nine training rows
sample_predictions = model.predict(processed_train_set_val[0:9])
print("\nPredictions: ", sample_predictions)
print("Labels:      ", list(train_set_labels[0:9]))

# Persist under an explicit name
store_model(model, 'KNeighborsRegressor')



## Evaluate with K-Fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold
import joblib
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor

def evaluate_model(model, model_name, feature, label, cv):
    """Cross-validate ``model``, save the per-fold RMSE values and print them.

    model: estimator handed to ``cross_val_score``.
    model_name: used in the printed lines and in the saved file name
        (``saved_objects/<model_name>_rmse.pkl``).
    feature: feature matrix.
    label: target values.
    cv: cross-validation splitter passed through to ``cross_val_score``.
    """
    # The scorer returns the negated MSE per fold (higher is better)
    neg_mse_per_fold = cross_val_score(
        model, feature, label, cv=cv, scoring='neg_mean_squared_error'
    )

    # Flip the sign back and take the root to get one RMSE per fold
    rmse_scores = np.sqrt(-neg_mse_per_fold)

    # Keep the scores on disk so they can be reloaded without re-running the folds
    joblib.dump(rmse_scores, f'saved_objects/{model_name}_rmse.pkl')

    print(f"{model_name} RMSE: ", rmse_scores.round(decimals=1))

    average_rmse = np.mean(rmse_scores)
    print("Avg. RMSE: ", average_rmse.round(decimals=1), '\n')

# Models to evaluate: key = name used for printing and for the saved score file,
# value = unfitted estimator. The two randomised forests get a fixed random_state
# so that repeated runs produce the same cross-validation scores.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "PolynomialRegression": Pipeline([
        ('poly_feat_adder', PolynomialFeatures(degree=2)),
        ('lin_reg', LinearRegression())
    ]),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=5),
    "ExtraTreesRegressor": ExtraTreesRegressor(n_estimators=100, random_state=42)
}

# Set up K-Fold cross-validation with 5 splits, shuffling the data before splitting
cv = KFold(n_splits=5, shuffle=True, random_state=37)

print('\n____________ K-fold cross-validation ____________')

# Flag to indicate whether to run new evaluations or load existing results
run_new_evaluation = 1

if run_new_evaluation:
    # Evaluate every model on the same processed training features and folds
    for model_name, model in models.items():
        feature = processed_train_set_val  # The features to use in the model
        evaluate_model(model, model_name, feature, train_set_labels, cv)
else:
    # Load the RMSE scores saved by an earlier run and print them
    for model_name in models.keys():
        try:
            rmse_scores = joblib.load(f'saved_objects/{model_name}_rmse.pkl')
            print(f"{model_name} RMSE: ", rmse_scores.round(decimals=1))
            print("Avg. RMSE: ", np.mean(rmse_scores).round(decimals=1), '\n')
        except FileNotFoundError:
            # A model that was never evaluated has no score file; report and continue
            print(f"No saved RMSE scores found for {model_name}.")

# Test Result

In [None]:
""" Just for testing , not complete code """
#%% 7.3 Run on test data
# Refit the chosen model on the full training set. random_state is fixed so the
# reported test metrics are the same on every run.
best_model = ExtraTreesRegressor(n_estimators=100, random_state=42)
best_model.fit(processed_train_set_val, train_set_labels)
# Reload the preprocessing object saved after it was fitted on the training features
full_pipeline = joblib.load(r'models/full_pipeline.pkl')


# transform() only: the test set is processed with what was learned from the training set
processed_test_set = full_pipeline.transform(test_set)
# 7.3.1 Compute R2 score and root mean squared error
r2score, rmse = r2score_and_rmse(best_model, processed_test_set, test_set_labels)
print('\nPerformance on test data:')
print('R2 score (on test data, best=1):', r2score)
print("Root Mean Square Error: ", rmse.round(decimals=1))
# 7.3.2 Predict labels for some test instances
print("\nTest data: \n", test_set.iloc[0:9])
print("\nPredictions: ", best_model.predict(processed_test_set[0:9]).round(decimals=1))
print("Labels:      ", list(test_set_labels[0:9]),'\n')


# FINE-TUNE MODEL

In [None]:
import joblib  # Used for saving and loading models so we don't have to re-run searches every time
import numpy as np  # For mathematical operations, like calculating square root of errors
from sklearn.ensemble import ExtraTreesRegressor  # This is the machine learning model we're using
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold  # Functions to fine-tune our model
from scipy.stats import randint, uniform  # For generating random distributions during hyperparameter tuning

# Function to print out the results of the hyperparameter search
def print_search_result(grid_search, model_name=""):
    """Print the outcome of a finished hyperparameter search.

    grid_search: search object exposing ``best_params_``, ``best_score_`` and
        ``cv_results_`` (with "mean_test_score" and "params" entries). Scores
        are negated before the square root is taken, i.e. they are expected to
        be negated squared errors.
    model_name: label shown in the heading.
    """
    heading = "\n====== Fine-tune " + model_name + " ======"
    print(heading)

    # Winning combination and its RMSE
    print('Best hyperparameter combination: ', grid_search.best_params_)
    best_rmse = np.sqrt(-grid_search.best_score_)
    print('Best rmse: ', best_rmse)

    # One line per tested combination, RMSE rounded to 1 decimal place
    print('Performance of hyperparameter combinations:')
    results = grid_search.cv_results_
    score_param_pairs = zip(results["mean_test_score"], results["params"])
    for neg_mse, candidate in score_param_pairs:
        candidate_rmse = np.sqrt(-neg_mse)
        print('rmse =', candidate_rmse.round(decimals=1), candidate)

method = 2  # Switch between 1 (for GridSearchCV) and 2 (for RandomizedSearchCV)
# Method 2 is recommended as method 1 can be computationally expensive for many hyperparameters

# 5-fold cross-validation, shuffled with a fixed seed so the folds are repeatable
cv = KFold(n_splits=5, shuffle=True, random_state=37)

# Method 1: GridSearchCV (tries every combination in the grid)
if method == 1:
    run_new_search = True  # True: run the search; False: load the saved result

    if run_new_search:
        # Grid of hyperparameter values to test
        param_grid = {
            'n_estimators': [100, 200, 300],  # Number of trees in the forest
            'max_features': ['sqrt', 'log2', None, 0.5, 0.8],  # How many features to consider when looking for the best split
            'max_depth': [10, 20, 50, 100],  # Maximum depth of each tree
            'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
            'bootstrap': [True, False],  # Whether bootstrap sampling is used (random sampling with replacement)
        }

        model = ExtraTreesRegressor(random_state=42)

        # Each combination is scored by cross-validated negative mean squared error;
        # refit=True retrains the best combination on all the training data.
        grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error',
                                   return_train_score=True, n_jobs=-1, refit=True)
        grid_search.fit(processed_train_set_val, train_set_labels)

        # Save the search so it can be loaded later instead of being run again
        joblib.dump(grid_search, 'saved_objects/ExtraTreesRegressor_gridsearch.pkl')
    else:
        # Load the result of a previous search (a file written by this notebook)
        grid_search = joblib.load('saved_objects/ExtraTreesRegressor_gridsearch.pkl')

    print_search_result(grid_search, model_name="ExtraTreesRegressor")

# Method 2: RandomizedSearchCV (samples a fixed number of combinations)
elif method == 2:
    run_new_search = True  # True: run the search; False: load the saved result

    if run_new_search:
        # Distributions / candidate lists the hyperparameters are sampled from
        param_distributions = {
            'n_estimators': randint(50, 300),  # Randomly choose the number of trees
            # Three named options plus five fractions drawn from uniform(loc=0.1, scale=0.9),
            # i.e. from [0.1, 1.0]. The draw is seeded: without random_state the
            # candidate list differed on every run, so the search was not repeatable.
            'max_features': ['sqrt', 'log2', None] + list(uniform(0.1, 0.9).rvs(size=5, random_state=42)),
            'max_depth': randint(5, 100),  # Randomly choose the tree depth
            'min_samples_split': randint(2, 10),  # Randomly choose the minimum samples needed to split
            'min_samples_leaf': randint(1, 10),  # Randomly choose the minimum samples needed at a leaf
            'bootstrap': [True, False],  # Randomly decide whether to use bootstrap sampling
        }

        model = ExtraTreesRegressor(random_state=42)

        # 100 sampled combinations, each scored by cross-validated negative mean
        # squared error; random_state makes the sampling repeatable.
        random_search = RandomizedSearchCV(model, param_distributions, n_iter=100, cv=cv,
                                           scoring='neg_mean_squared_error', return_train_score=True,
                                           random_state=42, n_jobs=-1, refit=True)
        random_search.fit(processed_train_set_val, train_set_labels)

        # Save the search so it can be loaded later instead of being run again
        joblib.dump(random_search, 'saved_objects/ExtraTreesRegressor_randomsearch.pkl')
    else:
        # Load the result of a previous search (a file written by this notebook)
        random_search = joblib.load('saved_objects/ExtraTreesRegressor_randomsearch.pkl')

    print_search_result(random_search, model_name="ExtraTreesRegressor")
