In [None]:
!pip install missingno


from google.colab import files
import pandas as pd
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import cross_val_score, KFold

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
from collections import Counter



In [None]:
uploaded = files.upload()

Saving OSA_DB_UPM.xlsx to OSA_DB_UPM.xlsx


In [None]:
# Specify the file name
file_name = 'OSA_DB_UPM.xlsx'

# Read the Excel file into a DataFrame
df_tmp = pd.read_excel(file_name)
df_tmp.head(10)

Unnamed: 0,IAH,Weight,Height,Age,PerCervical,Gender
0,29.6,119,174,56,48.0,hombre
1,19.7,78,168,39,42.0,hombre
2,9.0,80,173,32,40.0,hombre
3,2.0,109,190,32,42.0,hombre
4,34.0,86,169,39,42.0,hombre
5,60.0,145,172,47,44.0,hombre
6,22.0,72,165,40,42.0,hombre
7,11.0,80,180,28,38.0,hombre
8,3.7,90,180,36,40.0,hombre
9,7.0,50,158,50,35.0,mujer


# Training

In [None]:
df_tmp['BMI'] = df_tmp['Weight']/((df_tmp['Height']/100)**2)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_tmp[['PerCervical', 'BMI', 'Age', 'Height', 'Weight']])

df_tmp['BMI'] = scaled_features[:, 1]
df_tmp['PerCervical'] = scaled_features[:, 0]
df_tmp['Age'] = scaled_features[:, 2]
df_tmp['Height'] = scaled_features[:, 3]
df_tmp['Weight'] = scaled_features[:, 4]

##### Linear Regression

In [None]:
features = ['PerCervical', 'BMI', 'Age']

# convert it into numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape: ', y.shape)

## Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Importing the linear models.
from sklearn import linear_model

# Initializing the linear models.
model_lm = linear_model.LinearRegression()

# Fit the model to the training data.
Trained_model_lm = model_lm.fit(X_train, y_train)

X shape: (637, 3) y shape:  (637,)


In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

num_splits = 5
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

model_lm_cross_val = LinearRegression()

# Cross-validation
cross_val_scores = cross_val_score(model_lm_cross_val, X_train, y_train, cv=kf, scoring='r2')

# Train the model on the entire training set
model_lm_cross_val.fit(X_train, y_train)

# Coefficients after training
coefficients = model_lm_cross_val.coef_

print("Coefficients:")
for i, coef in enumerate(coefficients):
    print(f'Coefficient of {i}: {coef}')

Coefficients:
Coefficient of 0: 5.80763094745961
Coefficient of 1: 3.5727830378783483
Coefficient of 2: 1.9855146524211917


##### KNN-Regression

In [None]:
features = ['PerCervical', 'BMI', 'Age']

# convert it into numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
outer_results = list()
best_model_knn_regres = None

# Lists to store performance metrics
mse_values = []
mae_values = []
r2_values = []
n_neighbors = []
best_params_list = {}
best_params_list['n_neighbors'] = []

for train_ix, test_ix in cv_outer.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    cv_inner = KFold(n_splits = 3, shuffle = True, random_state = 1)
    model_knn_cross_val = KNeighborsRegressor()

    space = dict()
    space['n_neighbors'] = [13, 5, 10, 15, 20, 25, 30]

    search = GridSearchCV(model_knn_cross_val, space, scoring = 'neg_mean_absolute_error', cv = cv_inner, refit = True)
    result = search.fit(X_train, y_train)

    best_model_knn_regres = result.best_estimator_

    # Print only the parameters that were searched with GridSearchCV
    print("Best Model Parameters:")
    for param_name, param_value in result.best_params_.items():
        print(f"{param_name}: {param_value}")

        # Add the parameter to the list
        best_params_list[param_name].append(param_value)

    yhat = best_model_knn_regres.predict(X_test)

    mse = mean_squared_error(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    r2 = explained_variance_score(y_test, yhat)

    # Store the metrics
    mse_values.append(mse)
    mae_values.append(mae)
    r2_values.append(r2)

# summarize the estimated performance of the model
print()
print('MSE: %.3f (%.3f)' % (np.mean(mse_values), np.std(mse_values)))
print('MAE: %.3f (%.3f)' % (np.mean(mae_values), np.std(mae_values)))
print('Explained Variance (R^2): %.3f (%.3f)' % (np.mean(r2_values), np.std(r2_values)))
print()

for k in best_params_list.keys():
    # Count the occurrences of each parameter value
    param_counter = Counter(best_params_list[k])

    # Check if the parameters are floats
    if not all(isinstance(value, str) for value in best_params_list[k]):
        # If they are numerical, print mean and standard deviation
        param_values = [float(value) for value in best_params_list[k]]
        print(f"Parameter: {k}, Mean: {np.mean(param_values):.4f}, Std: {np.std(param_values):.4f}")
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")
    else:
        # If they are strings, print the most frequent values
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")

Best Model Parameters:
n_neighbors: 10
Best Model Parameters:
n_neighbors: 25
Best Model Parameters:
n_neighbors: 25
Best Model Parameters:
n_neighbors: 20
Best Model Parameters:
n_neighbors: 30
Best Model Parameters:
n_neighbors: 20
Best Model Parameters:
n_neighbors: 25
Best Model Parameters:
n_neighbors: 20
Best Model Parameters:
n_neighbors: 15
Best Model Parameters:
n_neighbors: 25

MSE: 280.130 (62.174)
MAE: 12.490 (1.154)
Explained Variance (R^2): 0.182 (0.088)

Parameter: n_neighbors, Mean: 21.5000, Std: 5.5000
Parameter: n_neighbors, Most frequent values: [(25, 4)]


##### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge  # Import Ridge model
from sklearn.preprocessing import StandardScaler

features = ['PerCervical', 'BMI', 'Age', 'Height']

# convert it into numerical matrix
scaler = StandardScaler()
X = np.array(scaler.fit_transform(df_tmp[features]))
y = np.array(df_tmp['IAH'])

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, explained_variance_score
import numpy as np

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
outer_results = list()
best_model_ridge_regr = None

# Lists to store coefficient values and performance metrics
coefficients = []
mse_values = []
mae_values = []
r2_values = []
best_params_list = {}
best_params_list['alpha'] = []

for train_ix, test_ix in cv_outer.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
    model_ridge_cross_val = Ridge(alpha=1.0)  # You can adjust the alpha parameter as needed

    space = dict()
    space['alpha'] = [0.001, 0.01, 0.1, 1, 10, 100, 200]

    search = GridSearchCV(model_ridge_cross_val, space, scoring = 'neg_mean_squared_error', cv = cv_inner, refit = True)
    result = search.fit(X_train, y_train)

    best_model_ridge_regr = result.best_estimator_

    # Print only the parameters that were searched with GridSearchCV
    print("Best Model Parameters:")
    for param_name, param_value in result.best_params_.items():
        print(f"{param_name}: {param_value}")

        # Add the parameter to the list
        best_params_list[param_name].append(param_value)

    yhat = best_model_ridge_regr.predict(X_test)

    mse = mean_squared_error(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    r2 = explained_variance_score(y_test, yhat)

    # Store the coefficients
    coefficients.append(best_model_ridge_regr.coef_)
    mse_values.append(mse)
    mae_values.append(mae)
    r2_values.append(r2)

# summarize the estimated performance of the model
print()
print('MSE: %.3f (%.3f)' % (np.mean(mse_values), np.std(mse_values)))
print('MAE: %.3f (%.3f)' % (np.mean(mae_values), np.std(mae_values)))
print('Explained Variance (R^2): %.3f (%.3f)' % (np.mean(r2_values), np.std(r2_values)))
print()

# Print the coefficients of the best model
print("Coefficients:")
for i, coef in enumerate(best_model_ridge_regr.coef_):
    print(f'Coefficient of  {features[i]}: {coef}')
print()

for k in best_params_list.keys():
    # Count the occurrences of each parameter value
    param_counter = Counter(best_params_list[k])

    # Check if the parameters are floats
    if not all(isinstance(value, str) for value in best_params_list[k]):
        # If they are numerical, print mean and standard deviation
        param_values = [float(value) for value in best_params_list[k]]
        print(f"Parameter: {k}, Mean: {np.mean(param_values):.4f}, Std: {np.std(param_values):.4f}")
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")
    else:
        # If they are strings, print the most frequent values
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")

Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 10
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 10
Best Model Parameters:
alpha: 100
Best Model Parameters:
alpha: 10

MSE: 274.193 (60.349)
MAE: 12.527 (1.001)
Explained Variance (R^2): 0.200 (0.107)

Coefficients:
Coefficient of  PerCervical: 5.500630619085499
Coefficient of  BMI: 4.19617153953155
Coefficient of  Age: 2.375498748269919
Coefficient of  Height: 0.510893444457592

Parameter: alpha, Mean: 73.0000, Std: 41.2432
Parameter: alpha, Most frequent values: [(100, 7)]


##### Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age', 'Height']

# convert it into numerical matrix
scaler = StandardScaler()
X = np.array(scaler.fit_transform(df_tmp[features]))

# Convert data to numerical matrix
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)

# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Configure the cross-validation procedure
cv_outer = KFold(n_splits = 10, shuffle = True, random_state = 1)
outer_results = list()
best_model_lasso = None

# Lists to store coefficient values and performance metrics
coefficients = []
mse_values = []
mae_values = []
r2_values = []
best_params_list = {}
best_params_list['alpha'] = []

for train_ix, test_ix in cv_outer.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    cv_inner = KFold(n_splits=3, shuffle = True, random_state=1)
    model_lasso_cross_val = Lasso(alpha=1.0)  # You can adjust the alpha parameter as needed

    space = dict()
    space['alpha'] = [0.01, 0.1, 1, 10]

    search = GridSearchCV(model_lasso_cross_val, space, scoring = 'neg_mean_squared_error', cv = cv_inner, refit = True)
    result = search.fit(X_train, y_train)

    best_model_lasso = result.best_estimator_

    # Print only the parameters that were searched with GridSearchCV
    print("Best Model Parameters:")
    for param_name, param_value in result.best_params_.items():
        print(f"{param_name}: {param_value}")

        # Add the parameter to the list
        best_params_list[param_name].append(param_value)

    yhat = best_model_lasso.predict(X_test)
    mse = mean_squared_error(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    r2 = explained_variance_score(y_test, yhat)

    # Store the coefficients
    coefficients.append(best_model_lasso.coef_)
    mse_values.append(mse)
    mae_values.append(mae)
    r2_values.append(r2)

# Summarize the estimated performance of the model
print()
print('MSE: %.3f (%.3f)' % (np.mean(mse_values), np.std(mse_values)))
print('MAE: %.3f (%.3f)' % (np.mean(mae_values), np.std(mae_values)))
print('Explained Variance (R^2): %.3f (%.3f)' % (np.mean(r2_values), np.std(r2_values)))
print()

# Print the coefficients of the best model
print("Coefficients:")
for i, coef in enumerate(best_model_lasso.coef_):
    print(f'Coefficient of  {features[i]}: {coef}')

for k in best_params_list.keys():
    # Count the occurrences of each parameter value
    param_counter = Counter(best_params_list[k])

    # Check if the parameters are floats
    if not all(isinstance(value, str) for value in best_params_list[k]):
        # If they are numerical, print mean and standard deviation
        param_values = [float(value) for value in best_params_list[k]]
        print(f"Parameter: {k}, Mean: {np.mean(param_values):.4f}, Std: {np.std(param_values):.4f}")
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")
    else:
        # If they are strings, print the most frequent values
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")

X shape: (637, 4) y shape: (637,)
Best Model Parameters:
alpha: 1
Best Model Parameters:
alpha: 0.1
Best Model Parameters:
alpha: 0.1
Best Model Parameters:
alpha: 0.1
Best Model Parameters:
alpha: 0.1
Best Model Parameters:
alpha: 0.1
Best Model Parameters:
alpha: 1
Best Model Parameters:
alpha: 0.01
Best Model Parameters:
alpha: 1
Best Model Parameters:
alpha: 0.01

MSE: 276.459 (60.768)
MAE: 12.606 (1.039)
Explained Variance (R^2): 0.193 (0.112)

Coefficients:
Coefficient of  PerCervical: 5.603618151260238
Coefficient of  BMI: 4.199433877456685
Coefficient of  Age: 2.38404108367875
Coefficient of  Height: 0.46589081098028984
Parameter: alpha, Mean: 0.3520, Std: 0.4256
Parameter: alpha, Most frequent values: [(0.1, 5)]


##### Random Forest Regressor

In [None]:
# Import necessary libraries
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age']

X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# configure the cross-validation procedure
cv_outer = KFold(n_splits = 10, shuffle = True, random_state = 1)
outer_results = list()
best_model_rnd_forest = None

# Lists to store performance metrics
mse_values = []
mae_values = []
r2_values = []
best_params_list = {}
best_params_list['max_features'] = []
best_params_list['n_estimators'] = []

for train_ix, test_ix in cv_outer.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    cv_inner = KFold(n_splits = 3, shuffle = True, random_state = 1)
    model_random_forest = RandomForestRegressor(random_state = 1)

    space = dict()
    space['n_estimators'] = [100, 250, 500, 600]
    space['max_features'] = [1, 2, 3, 4]

    search = GridSearchCV(model_random_forest, space, scoring = 'neg_mean_squared_error', cv = cv_inner, refit = True)
    result = search.fit(X_train, y_train)

    best_model_rnd_forest = result.best_estimator_

    # Print only the parameters that were searched with GridSearchCV
    print("Best Model Parameters:")
    for param_name, param_value in result.best_params_.items():
        print(f"{param_name}: {param_value}")

        # Add the parameter to the list
        best_params_list[param_name].append(param_value)

    yhat = best_model_rnd_forest.predict(X_test)
    mse = mean_squared_error(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    r2 = r2_score(y_test, yhat)

    mse_values.append(mse)
    mae_values.append(mae)
    r2_values.append(r2)

# Summarize the estimated performance of the model for regression
print()
print('Mean Squared Error (MSE): %.3f (%.3f)' % (np.mean(mse_values), np.std(mse_values)))
print('Mean Absolute Error (MAE): %.3f (%.3f)' % (np.mean(mae_values), np.std(mae_values)))
print('R-squared (R^2): %.3f (%.3f)' % (np.mean(r2_values), np.std(r2_values)))
print()

for k in best_params_list.keys():
    # Count the occurrences of each parameter value
    param_counter = Counter(best_params_list[k])

    # Check if the parameters are floats
    if not all(isinstance(value, str) for value in best_params_list[k]):
        # If they are numerical, print mean and standard deviation
        param_values = [float(value) for value in best_params_list[k]]
        print(f"Parameter: {k}, Mean: {np.mean(param_values):.4f}, Std: {np.std(param_values):.4f}")
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")
    else:
        # If they are strings, print the most frequent values
        most_frequent_values = param_counter.most_common(1)  # You can adjust the number
        print(f"Parameter: {k}, Most frequent values: {most_frequent_values}")

Best Model Parameters:
max_features: 1
n_estimators: 500
Best Model Parameters:
max_features: 1
n_estimators: 600
Best Model Parameters:
max_features: 1
n_estimators: 250
Best Model Parameters:
max_features: 1
n_estimators: 600
Best Model Parameters:
max_features: 1
n_estimators: 100
Best Model Parameters:
max_features: 1
n_estimators: 100
Best Model Parameters:
max_features: 1
n_estimators: 250
Best Model Parameters:
max_features: 1
n_estimators: 600
Best Model Parameters:
max_features: 1
n_estimators: 250
Best Model Parameters:
max_features: 1
n_estimators: 500

Mean Squared Error (MSE): 297.915 (63.714)
Mean Absolute Error (MAE): 12.700 (1.269)
R-squared (R^2): 0.112 (0.121)

Parameter: max_features, Mean: 1.0000, Std: 0.0000
Parameter: max_features, Most frequent values: [(1, 10)]
Parameter: n_estimators, Mean: 375.0000, Std: 195.2562
Parameter: n_estimators, Most frequent values: [(600, 3)]


# Evaluation

#### Linear Regression:

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age']

# Convert data to numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)



# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model_lm = linear_model.LinearRegression()

model_lm.fit(X_train, y_train)

y_pred = model_lm.predict(X_test)

mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = model_lm.score(X_test, y_test)

print("Linear Regression Metrics:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

X shape: (637, 3) y shape: (637,)
Linear Regression Metrics:
MAE: 12.22
MSE: 243.01
R-squared: 0.26


#### KNN Regression:

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age']

# Convert data to numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)

# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_model_knn_regres =  KNeighborsRegressor(n_neighbors = 25)
best_model_knn_regres.fit(X_train, y_train)

y_pred = best_model_knn_regres.predict(X_test)

mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = best_model_knn_regres.score(X_test, y_test)

print("Metrics of the KNN model:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

X shape: (637, 3) y shape: (637,)
Metrics of the KNN model:
MAE: 12.11
MSE: 243.38
R-squared: 0.26


#### Ridge Regression:

In [None]:
from sklearn.linear_model import Ridge

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age', 'Height']

# Convert data to numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)

# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
best_model_ridge_regr = Ridge(alpha = 100)
best_model_ridge_regr.fit(X_train, y_train)

y_pred = best_model_ridge_regr.predict(X_test)

mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = best_model_ridge_regr.score(X_test, y_test)

print("Metrics of the Ridge Regression model:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

X shape: (637, 4) y shape: (637,)
Metrics of the Ridge Regression model:
MAE: 12.35
MSE: 243.98
R-squared: 0.25


#### Lasso Regression:

In [None]:
from sklearn.linear_model import Lasso

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age', 'Height']

# Convert data to numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)

# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_model_lasso = Lasso(alpha = 0.1)
best_model_lasso.fit(X_train, y_train)

y_pred = best_model_lasso.predict(X_test)

mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = best_model_lasso.score(X_test, y_test)

print("Metrics of the Lasso Regression model:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

X shape: (637, 4) y shape: (637,)
Metrics of the Lasso Regression model:
MAE: 12.24
MSE: 243.35
R-squared: 0.26


#### Random Forest Regressor:

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Example features and target variable
features = ['PerCervical', 'BMI', 'Age']

# Convert data to numerical matrix
X = np.array(df_tmp[features])
y = np.array(df_tmp['IAH'])

# Printing
print('X shape:', X.shape, 'y shape:', y.shape)

# Example: Train 80% and Test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 32)

best_model_rnd_forest = RandomForestRegressor(max_features = 1, n_estimators = 500, random_state = 1)

best_model_rnd_forest.fit(X_train, y_train)
y_pred = best_model_rnd_forest.predict(X_test)

mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = best_model_rnd_forest.score(X_test, y_test)

print("Metrics of the Random Forest Regressor:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

X shape: (637, 3) y shape: (637,)
Metrics of the Random Forest Regressor:
MAE: 13.15
MSE: 291.75
R-squared: 0.17
