In [1]:
#Basic Libraries 📚

import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

In [2]:
# Basic Modules & Useful Installations

import warnings
warnings.simplefilter(action='ignore')
from geopy.distance import geodesic

In [3]:
# Machine Learning Libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor


from sklearn.multioutput import MultiOutputRegressor  #This give us the option to fix 2 columns as targets (long, lat)
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


#from sklearn.preprocessing import LabelEncoder
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier

#from sklearn.naive_bayes import GaussianNB
#from sklearn.mixture import GaussianMixture
#from sklearn.pipeline import make_pipeline


In [4]:
# Completed Caracterization 🔧 

def data_caracterization(df):
    
    columns = df.columns.tolist()
    (dfrows,dfcolumns) = df.shape
    df.reset_index()
    print(f'Caracterization:\n')
    print(f'Number of Rows: {dfrows}\n')
    print(f'Number of Columns: {dfcolumns}\n')
    print(f'Columns Names/Variables/Features: {columns}\n')
    df.info()
    print(f'\n Head:\n')
    
    return df.head(5)

In [5]:
# Construct Wild distance

def calculate_distance_row(row):
    
    coordinates_1 = (row['lat_buf'], row['long_buf'])
    coordinates_2 = (row['latitude'], row['longitude'])
    
    return geodesic(coordinates_1, coordinates_2).kilometers

In [6]:
# Construct reading relative path
relative_path = "my-jobcation-path/02_data_cleaning_phase/01_Preprocessing & Clean/campsites_chars.csv"

# Change directory
os.chdir('C:\\Users\\Oscar\\Documents\\00_Ironhack\\P3_Final\\') #Change for your own directory

# Check directory
current_directory = os.getcwd()
print("Current working directory is:", current_directory)

Current working directory is: C:\Users\Oscar\Documents\00_Ironhack\P3_Final


In [7]:
# Construct absolute paths

campsites_path = os.path.join(current_directory, relative_path)

In [8]:
# Campsites reading

if os.path.exists(campsites_path):
    try:
        # Try to read the CSV file
        data = pd.read_csv(campsites_path)
        print("The file has been loaded successfully.")
        # Now you can work with the 'data' DataFrame
    except Exception as e:
        print("An error occurred while reading the file:", e)
else:
    print("The file does not exist at the specified path:", campsites_path)

The file has been loaded successfully.


In [9]:
# Dataframe Creation

dfca = data.copy()


In [10]:
data_caracterization(dfca)

Caracterization:

Number of Rows: 3569

Number of Columns: 15

Columns Names/Variables/Features: ['name', 'address', 'city', 'website', 'schedule', 'longitude', 'latitude', 'distance_km', 'type', 'luxury', 'beach', 'wild', 'rating', 'long_buf', 'lat_buf']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3569 entries, 0 to 3568
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         3569 non-null   object 
 1   address      3569 non-null   object 
 2   city         3569 non-null   object 
 3   website      3569 non-null   object 
 4   schedule     3569 non-null   object 
 5   longitude    3569 non-null   float64
 6   latitude     3569 non-null   float64
 7   distance_km  3569 non-null   float64
 8   type         3569 non-null   object 
 9   luxury       3569 non-null   object 
 10  beach        3569 non-null   int64  
 11  wild         3569 non-null   int64  
 12  rating       3569 non-null   float64
 13  l

Unnamed: 0,name,address,city,website,schedule,longitude,latitude,distance_km,type,luxury,beach,wild,rating,long_buf,lat_buf
0,Camping Osuna,C. Jardines de Aranjuez s/n Madrid,Madrid,http://www.campingosuna.com/,Monday: 8:00 AM – 11:30 PM Tuesday: 8:00 AM – ...,-3.603361,40.453734,9.458117,City,Campsite,0,0,3.9,-3.70379,40.416775
1,Camping Osuna,C. Jardines de Aranjuez s/n Madrid,Alcalá de Henares,http://www.campingosuna.com/,Monday: 8:00 AM – 11:30 PM Tuesday: 8:00 AM – ...,-3.603361,40.453734,20.181317,Town,Campsite,0,0,3.9,-3.368802,40.48439
2,Camping Osuna,C. Jardines de Aranjuez s/n Madrid,Fuenlabrada,http://www.campingosuna.com/,Monday: 8:00 AM – 11:30 PM Tuesday: 8:00 AM – ...,-3.603361,40.453734,24.875383,Town,Campsite,0,0,3.9,-3.803548,40.290206
3,Camping Osuna,C. Jardines de Aranjuez s/n Madrid,Torrejón de Ardoz,http://www.campingosuna.com/,Monday: 8:00 AM – 11:30 PM Tuesday: 8:00 AM – ...,-3.603361,40.453734,10.851165,Town,Campsite,0,0,3.9,-3.475497,40.456755
4,Camping Osuna,C. Jardines de Aranjuez s/n Madrid,Parla,http://www.campingosuna.com/,Monday: 8:00 AM – 11:30 PM Tuesday: 8:00 AM – ...,-3.603361,40.453734,28.066198,Town,Campsite,0,0,3.9,-3.773987,40.237306


## ENCODING

In [12]:
# Encoding categoricals. We are going to use manual encoding because we need it for the model after.

type = {
    'City': 0,
    'Town': 1,
    'Village': 2,
}


luxury = {
    'Campsite': 0,
    'Glamping': 1,
    'Camper': 2
}


# Now let's do the mapping

dfca['type'] = dfca['type'].map(type)
dfca['luxury'] = dfca['luxury'].map(luxury)


In [13]:
encoding_column = ["type", "luxury", "beach", "wild", "rating", "distance_km", "long_buf", "lat_buf"] 

dfcoding = dfca[encoding_column]

In [14]:
dfcoding

Unnamed: 0,type,luxury,beach,wild,rating,distance_km,long_buf,lat_buf
0,0,0,0,0,3.9,9.458117,-3.703790,40.416775
1,1,0,0,0,3.9,20.181317,-3.368802,40.484390
2,1,0,0,0,3.9,24.875383,-3.803548,40.290206
3,1,0,0,0,3.9,10.851165,-3.475497,40.456755
4,1,0,0,0,3.9,28.066198,-3.773987,40.237306
...,...,...,...,...,...,...,...,...
3564,2,0,0,1,4.4,22.713277,-0.370797,43.295100
3565,2,0,0,1,4.5,47.411886,-0.370797,43.295100
3566,2,0,0,1,4.5,32.475489,-0.370797,43.295100
3567,2,0,0,1,3.9,55.532844,-0.370797,43.295100


## SETTING TARGET (Y)

In [16]:
target_columns = ["long_buf", "lat_buf"]

X = dfcoding.drop(target_columns, axis = 1)
y = dfcoding[target_columns]

In [17]:
# Splitting the data into train and test sets (80% train, 20% test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regressor

In [19]:
# Without MultiOutPut

# Define and fit Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = linear_model.predict(X_test)

In [20]:
mae = mean_absolute_error(y_test, y_pred_lr)
mse = mean_squared_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
r2 = r2_score(y_test, y_pred_lr)

In [21]:
print("Lineal Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Lineal Regressor:

Mean Absolute Error (MAE): 6.270605458503447
Mean Squared Error (MSE): 166.2073927904342
Root Mean Squared Error (RMSE): 11.864370423584518
R^2 Score: 0.044247285085440735


In [22]:
# With 

# Define and fit Linear Regression model

multi_target_linear_model = MultiOutputRegressor(LinearRegression())
multi_target_linear_model.fit(X_train, y_train)

# Make predictions

y_pred_lr = linear_model.predict(X_test)

In [23]:
mae = mean_absolute_error(y_test, y_pred_lr)
mse = mean_squared_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
r2 = r2_score(y_test, y_pred_lr)

In [24]:
print("Lineal Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Lineal Regressor:

Mean Absolute Error (MAE): 6.270605458503447
Mean Squared Error (MSE): 166.2073927904342
Root Mean Squared Error (RMSE): 11.864370423584518
R^2 Score: 0.044247285085440735


## RIDGE REGRESSION without and with MultiOutPut

In [26]:
ridge_model = RidgeCV(alphas=[0.1, 1.0, 10.0])

ridge_model.fit(X_train, y_train)

y_pred_rg = ridge_model.predict(X_test)

In [27]:
mae = mean_absolute_error(y_test, y_pred_rg)
mse = mean_squared_error(y_test, y_pred_rg)
rmse = mean_squared_error(y_test, y_pred_rg, squared=False)
r2 = r2_score(y_test, y_pred_rg)

In [28]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.26924831609172
Mean Squared Error (MSE): 166.2546618200802
Root Mean Squared Error (RMSE): 11.865252566511195
R^2 Score: 0.0442945497876201


In [29]:
# Whitout

multi_target_ridge_model = MultiOutputRegressor(RidgeCV(alphas=[0.1, 1.0, 10.0]))

multi_target_ridge_model.fit(X_train, y_train)

y_pred_rg = multi_target_ridge_model.predict(X_test)

In [30]:
mae = mean_absolute_error(y_test, y_pred_rg)
mse = mean_squared_error(y_test, y_pred_rg)
rmse = mean_squared_error(y_test, y_pred_rg, squared=False)
r2 = r2_score(y_test, y_pred_rg)

In [31]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.2692483268439485
Mean Squared Error (MSE): 166.25466197702434
Root Mean Squared Error (RMSE): 11.865252571401122
R^2 Score: 0.04429454916700021


## KNeighborsRegressor without and with MultiOutPut

In [33]:
# Without

knn_model = KNeighborsRegressor(n_neighbors=5)

knn_model.fit(X_train, y_train)

y_pred_kn = knn_model.predict(X_test)

In [34]:
mae = mean_absolute_error(y_test, y_pred_kn)
mse = mean_squared_error(y_test, y_pred_kn)
rmse = mean_squared_error(y_test, y_pred_kn, squared=False)
r2 = r2_score(y_test, y_pred_kn)

In [35]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.538696148361353
Mean Squared Error (MSE): 162.3802949968801
Root Mean Squared Error (RMSE): 11.728869719001231
R^2 Score: 0.06551328096876885


In [36]:
multi_target_knn_model =  MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5))

multi_target_knn_model.fit(X_train, y_train)

y_pred_kn = multi_target_knn_model.predict(X_test)

In [37]:
mae = mean_absolute_error(y_test, y_pred_kn)
mse = mean_squared_error(y_test, y_pred_kn)
rmse = mean_squared_error(y_test, y_pred_kn, squared=False)
r2 = r2_score(y_test, y_pred_kn)

In [38]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.538696148361344
Mean Squared Error (MSE): 162.38029499688002
Root Mean Squared Error (RMSE): 11.728869719001228
R^2 Score: 0.06551328096876924


## Random Forest Regressor without and with MultiOutPut

In [40]:
# Without MultiOutPut

# Define and fit Random Forest Regressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

In [41]:
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
r2 = r2_score(y_test, y_pred_rf)

In [42]:
print("Histogram Gradient Boosting Regressor/n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor/n
Mean Absolute Error (MAE): 6.731382829514004
Mean Squared Error (MSE): 163.94046383502257
Root Mean Squared Error (RMSE): 11.770919212542047
R^2 Score: 0.06208649878381567


In [43]:
# With MultiOutPut

multi_target_rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))

multi_target_rf_model.fit(X_train, y_train)

y_pred_rf = multi_target_rf_model.predict(X_test)


In [44]:
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
r2 = r2_score(y_test, y_pred_rf)

In [45]:
print("Histogram Gradient Boosting Regressor/n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor/n
Mean Absolute Error (MAE): 6.719734970334034
Mean Squared Error (MSE): 165.64837618108265
Root Mean Squared Error (RMSE): 11.806327156247992
R^2 Score: 0.06228587797687901


## Decision Tree Regressor without and with MultiOutPut

In [47]:
# Without MultiOutPut

from sklearn.tree import DecisionTreeRegressor

# Define and fit Decision Tree Regressor model


dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)


In [48]:
mae = mean_absolute_error(y_test, y_pred_dt)
mse = mean_squared_error(y_test, y_pred_dt)
rmse = mean_squared_error(y_test, y_pred_dt, squared=False)
r2 = r2_score(y_test, y_pred_dt)

In [49]:
print("Histogram Gradient Boosting Regressor/n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor/n
Mean Absolute Error (MAE): 8.114917962885155
Mean Squared Error (MSE): 244.9732813683239
Root Mean Squared Error (RMSE): 14.749189341155544
R^2 Score: -0.598685670688742


In [50]:
#With MultiOutPut

from sklearn.tree import DecisionTreeRegressor

# Define and fit Decision Tree Regressor model

multi_target_dt_model = MultiOutputRegressor(DecisionTreeRegressor())

multi_target_dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = multi_target_dt_model.predict(X_test)


In [51]:
mae = mean_absolute_error(y_test, y_pred_dt)
mse = mean_squared_error(y_test, y_pred_dt)
rmse = mean_squared_error(y_test, y_pred_dt, squared=False)
r2 = r2_score(y_test, y_pred_dt)

In [52]:
print("Histogram Gradient Boosting Regressor/n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor/n
Mean Absolute Error (MAE): 7.992156549859943
Mean Squared Error (MSE): 250.66579452614758
Root Mean Squared Error (RMSE): 14.65799786705933
R^2 Score: -0.485530925983806


## Hist Gradient Boosting Regressor (Only with MultiOutPut)

In [54]:
HistGradientBoostingRegressor = HistGradientBoostingRegressor(max_bins=200, max_iter=100, max_leaf_nodes = 20, max_depth = 7, min_samples_leaf = 5, warm_start = True)

multi_target_hgb_model = MultiOutputRegressor(HistGradientBoostingRegressor)

multi_target_hgb_model.fit(X_train, y_train)

y_pred_hgd = multi_target_hgb_model.predict(X_test)


In [55]:
mae = mean_absolute_error(y_test, y_pred_hgd)
mse = mean_squared_error(y_test, y_pred_hgd)
rmse = mean_squared_error(y_test, y_pred_hgd, squared=False)
r2 = r2_score(y_test, y_pred_hgd)

In [56]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.285766846476234
Mean Squared Error (MSE): 152.77327824322808
Root Mean Squared Error (RMSE): 11.31830777687893
R^2 Score: 0.14242782384065777


## AdaBoost Regressor (Only with MultiOutPut)

In [58]:
multi_target_ada_model = MultiOutputRegressor(AdaBoostRegressor(n_estimators=100, learning_rate=1.0, random_state=42))


multi_target_ada_model.fit(X_train, y_train)

y_pred_ada = multi_target_ada_model.predict(X_test)


In [59]:
mae = mean_absolute_error(y_test, y_pred_ada)
mse = mean_squared_error(y_test, y_pred_ada)
rmse = mean_squared_error(y_test, y_pred_ada, squared=False)
r2 = r2_score(y_test, y_pred_ada)

In [60]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 8.259817147882389
Mean Squared Error (MSE): 183.30652972414833
Root Mean Squared Error (RMSE): 12.74061529109079
R^2 Score: -0.18662544677262521


## Stacking Regressor  (Only with MultiOutPut)

In [62]:

# Define base regressors
base_regressors = [
    ('ridge', RidgeCV()),
    ('gb', GradientBoostingRegressor())
]

# Define meta regressor
meta_regressor = RandomForestRegressor()

# Define StackingRegressor model
multi_target_stacking_model = MultiOutputRegressor(StackingRegressor(
    estimators=base_regressors,
    final_estimator=meta_regressor
))

# Fit StackingRegressor model
multi_target_stacking_model.fit(X_train, y_train)

# Make predictions
y_pred_stacking = multi_target_stacking_model.predict(X_test)

# Print predictions
print(y_pred_stacking)


[[ 0.81883227 42.88275755]
 [-4.22218075 41.3745211 ]
 [-1.50792308 41.66221641]
 ...
 [ 6.44521259 39.04791959]
 [ 3.80257325 43.48488203]
 [ 5.30453357 40.00238057]]


In [63]:
mae = mean_absolute_error(y_test, y_pred_stacking)
mse = mean_squared_error(y_test, y_pred_stacking)
rmse = mean_squared_error(y_test, y_pred_stacking, squared=False)
r2 = r2_score(y_test, y_pred_stacking)

In [64]:
print("Histogram Gradient Boosting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Histogram Gradient Boosting Regressor:

Mean Absolute Error (MAE): 6.682280211557913
Mean Squared Error (MSE): 156.69918695363128
Root Mean Squared Error (RMSE): 11.664453279900068
R^2 Score: 0.039428255089136


## Voting Regressor (Only with MultiOutPut)

In [66]:
# Define base regressors
base_regressors = [
    ('ridge', RidgeCV()),
    ('rf', RandomForestRegressor()),
    ('gb', GradientBoostingRegressor())
]

# Define VotingRegressor model
multi_target_voting_model = MultiOutputRegressor(VotingRegressor(estimators=base_regressors, weights = [1,3,6]))

# Fit VotingRegressor model
multi_target_voting_model.fit(X_train, y_train)

# Make predictions
y_pred_voting = multi_target_voting_model.predict(X_test)


In [67]:
mae = mean_absolute_error(y_test, y_pred_voting)
mse = mean_squared_error(y_test, y_pred_voting)
rmse = mean_squared_error(y_test, y_pred_voting, squared=False)
r2 = r2_score(y_test, y_pred_voting)

In [68]:
print("Voting Regressor:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Voting Regressor:

Mean Absolute Error (MAE): 6.128951106197746
Mean Squared Error (MSE): 148.28898348915618
Root Mean Squared Error (RMSE): 11.126342607048182
R^2 Score: 0.17626319987038203


## Voting Regressor Modified (Only with MultiOutPut)

In [70]:

# Define base regressors
base_regressors = [
    ('ridge', RidgeCV()),
    ('rf', RandomForestRegressor()),
    ('ada', AdaBoostRegressor()),
    ('kn', KNeighborsRegressor()),
    ('gb', GradientBoostingRegressor())
]

# Define VotingRegressor model
multi_target_voting_5_base = MultiOutputRegressor(VotingRegressor(estimators=base_regressors))

# Fit VotingRegressor model
multi_target_voting_5_base.fit(X_train, y_train)

# Make predictions
y_pred_voting_model_5_base = multi_target_voting_5_base.predict(X_test)


In [71]:
mae = mean_absolute_error(y_test, y_pred_voting_model_5_base)
mse = mean_squared_error(y_test, y_pred_voting_model_5_base)
rmse = mean_squared_error(y_test, y_pred_voting_model_5_base, squared=False)
r2 = r2_score(y_test, y_pred_voting_model_5_base)

In [72]:
print("Voting Regressor 5 base regressors:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Voting Regressor 5 base regressors:

Mean Absolute Error (MAE): 6.339132824583852
Mean Squared Error (MSE): 146.06194067517322
Root Mean Squared Error (RMSE): 11.084057138049348
R^2 Score: 0.17399646434135568


In [73]:

# Define base regressors
base_regressors = [
    ('ridge', RidgeCV()),
    ('rf', RandomForestRegressor()),
    ('ada', AdaBoostRegressor()),
    ('kn', KNeighborsRegressor()),
    ('gb', GradientBoostingRegressor())
]

# Define VotingRegressor model
multi_target_voting_model_mod1 = MultiOutputRegressor(VotingRegressor(estimators=base_regressors, weights = [3,3,2,2,3]))

# Fit VotingRegressor model
multi_target_voting_model_mod1.fit(X_train, y_train)

# Make predictions
y_pred_voting_model_mod1 = multi_target_voting_model_mod1.predict(X_test)


In [74]:
mae = mean_absolute_error(y_test, y_pred_voting_model_mod1)
mse = mean_squared_error(y_test, y_pred_voting_model_mod1)
rmse = mean_squared_error(y_test, y_pred_voting_model_mod1, squared=False)
r2 = r2_score(y_test, y_pred_voting_model_mod1)

In [75]:
print("Voting Regressor 5 base regressors:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Voting Regressor 5 base regressors:

Mean Absolute Error (MAE): 6.216885964854067
Mean Squared Error (MSE): 146.65411391338307
Root Mean Squared Error (RMSE): 11.095627911355068
R^2 Score: 0.1745381564925465


In [76]:
# Second Option


# Define base regressors
base_regressors = [
    ('ridge', RidgeCV()),
    ('rf', RandomForestRegressor()),
    ('ada', AdaBoostRegressor()),
    ('kn', KNeighborsRegressor()),
    ('gb', GradientBoostingRegressor())
]

# Define VotingRegressor model
multi_target_voting_model_mod2 = MultiOutputRegressor(VotingRegressor(estimators=base_regressors, weights = [2,3,2,3,3]))

# Fit VotingRegressor model
multi_target_voting_model_mod2.fit(X_train, y_train)

# Make predictions
y_pred_voting_model_mod2 = multi_target_voting_model_mod2.predict(X_test)


In [77]:
mae = mean_absolute_error(y_test, y_pred_voting_model_mod2)
mse = mean_squared_error(y_test, y_pred_voting_model_mod2)
rmse = mean_squared_error(y_test, y_pred_voting_model_mod2, squared=False)
r2 = r2_score(y_test, y_pred_voting_model_mod2)

In [78]:
print("Voting Regressor 5 base regressors different weights:\n")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Voting Regressor 5 base regressors different weights:

Mean Absolute Error (MAE): 6.238867451063272
Mean Squared Error (MSE): 147.35929052105345
Root Mean Squared Error (RMSE): 11.120612609579702
R^2 Score: 0.1711606933333057


## CROSSVAL SCORE

In [85]:
cvs_multi_target_hgb_model = cross_val_score(multi_target_hgb_model, X, y, cv=5, scoring='r2')

cvs_multi_target_voting_model = cross_val_score(multi_target_voting_model, X, y, cv=5, scoring='r2')

cvs_multi_target_voting_5_base = cross_val_score(multi_target_voting_5_base, X, y, cv=5, scoring='r2')

cvs_multi_target_voting_model_mod1 = cross_val_score(multi_target_voting_model_mod1, X, y, cv=5, scoring='r2')

cvs_multi_target_voting_model_mod2 = cross_val_score(multi_target_voting_model_mod2, X, y, cv=5, scoring='r2')



# Imprimir los puntajes de validación cruzada
print("Cross-Validation Scores hgb_model (R^2):", cvs_multi_target_hgb_model)
print("Cross-Validation Scores voting_model (R^2):", cvs_multi_target_voting_model)
print("Cross-Validation Scores voting_model (R^2):", cvs_multi_target_voting_5_base)
print("Cross-Validation Scores voting_model_mod1(R^2):", cvs_multi_target_voting_model_mod1)
print("Cross-Validation Scores voting_model_mod2 (R^2):", cvs_multi_target_voting_model_mod2)

Cross-Validation Scores hgb_model (R^2): [-8.0234557  -0.66184499 -0.13005165 -5.68839892 -5.8564843 ]
Cross-Validation Scores voting_model (R^2): [-6.72546116 -0.59176742 -0.14446817 -5.292831   -4.71207999]
Cross-Validation Scores voting_model (R^2): [-5.7557011  -0.56728783 -0.09953726 -5.80952773 -5.43566772]
Cross-Validation Scores voting_model_mod1(R^2): [-5.58765449 -0.55762808 -0.09655452 -5.66918872 -5.24304232]
Cross-Validation Scores voting_model_mod2 (R^2): [-5.694543   -0.5748722  -0.12519895 -5.55815098 -5.25759625]


## GRID SEARCH CV

In [82]:
r2 = r2_score(y_test, y_pred_voting)

## SAVE MODEL

In [84]:
# Construct reading relative path
#relative_path8 = "my-jobcation-path/02_data_cleaning_phase/01_Preprocessing & Clean/coworkings_chars.csv"
#relative_path9 = "my-jobcation-path/02_data_cleaning_phase/01_Preprocessing & Clean/campsites_chars.csv"

# Construct absolute paths
coworkings_save_path = os.path.join(current_directory, relative_path8)
campsites_save_path = os.path.join(current_directory, relative_path9)

NameError: name 'relative_path8' is not defined

In [None]:
# Save and Encoding 

dfcoworks.to_csv(coworkings_save_path, index=False, encoding='utf-8')
dfcamps.to_csv(campsites_save_path, index=False, encoding='utf-8')

In [None]:
dfcoding.info()