In [13]:
library(caret)
library(ggplot2)
library(lattice)
set.seed(123)
data <- read.table("a24_reg_app.txt", header = TRUE, sep = " ")

cross_validate_linear_regression <- function(data, target) {
    cv_errors <- numeric()
    for (k in 3:20) {
        train_control <- trainControl(method = "cv", number = k)
        model <- train(as.formula(paste(target, "~ .")),
                                     data = data,
                                     method = "lm",
                                     trControl = train_control)
        cv_errors[k - 2] <- mean(model$resample$RMSE)
    }
    best_k <- which.min(cv_errors) + 2
    final_train_control <- trainControl(method = "cv", number = best_k)
    final_model <- train(as.formula(paste(target, "~ .")),
                                             data = data,
                                             method = "lm",
                                             trControl = final_train_control)
    
    return(list(model = final_model, best_k = best_k))
}
result <- cross_validate_linear_regression(data, "y")
print(result$best_k)
print(result$model)

[1] 14
Linear Regression 

500 samples
100 predictors

No pre-processing
Resampling: Cross-Validated (14 fold) 
Summary of sample sizes: 464, 464, 465, 465, 464, 464, ... 
Resampling results:

  RMSE     Rsquared   MAE     
  11.9659  0.9568383  9.521029

Tuning parameter 'intercept' was held constant at a value of TRUE


In [15]:
library(caret)
library(ggplot2)
library(lattice)
set.seed(123)

data <- read.table("a24_reg_app.txt", header = TRUE, sep = " ")

cross_validate_knn_regression <- function(data, target) {
    # Step 1: Find the best number of folds k for cross-validation
    cv_errors <- numeric()
    for (k in 3:20) {
        train_control <- trainControl(method = "cv", number = k)
        
        # Tune k' (number of neighbors) within a range using cross-validation for each fold number
        model <- train(as.formula(paste(target, "~ .")),
                       data = data,
                       method = "knn",
                       tuneGrid = data.frame(k = 3:25),  # Try neighbors from 1 to 20
                       trControl = train_control)
        
        # Store the lowest RMSE for each k
        cv_errors[k - 2] <- min(model$results$RMSE)
    }
    
    # Step 2: Select the best number of folds k
    best_k <- which.min(cv_errors) + 2
    
    # Step 3: Using best_k, find the best number of neighbors k' for the final model
    final_train_control <- trainControl(method = "cv", number = best_k)
    final_model <- train(as.formula(paste(target, "~ .")),
                         data = data,
                         method = "knn",
                         tuneGrid = data.frame(k = 3:25),  # Tune k' from 1 to 20
                         trControl = final_train_control)
    
    # Get the best number of neighbors
    best_k_prime <- final_model$bestTune$k
    
    # Step 4: Calculate R-squared for the final model
    final_r_squared <- max(final_model$results$Rsquared)
    
    return(list(model = final_model, best_k = best_k, best_k_prime = best_k_prime, R_squared = final_r_squared))
}

# Run the function and print results
result <- cross_validate_knn_regression(data, "y")
print(result$best_k)         # Best number of folds for CV
print(result$best_k_prime)    # Best number of neighbors
print(result)       # Final model R-squared

[1] 16
[1] 25
$model
k-Nearest Neighbors 

500 samples
100 predictors

No pre-processing
Resampling: Cross-Validated (16 fold) 
Summary of sample sizes: 471, 468, 470, 468, 468, 470, ... 
Resampling results across tuning parameters:

  k   RMSE      Rsquared   MAE     
   3  54.57824  0.1611553  43.23926
   4  52.35680  0.1964717  41.27100
   5  51.68935  0.2066666  40.79558
   6  50.68158  0.2415582  40.11442
   7  50.04156  0.2644493  39.26169
   8  49.46606  0.2913562  39.15450
   9  49.57175  0.2952360  39.12756
  10  49.54900  0.3031670  39.19183
  11  49.27341  0.3197940  39.21594
  12  49.20791  0.3303331  39.29317
  13  48.81528  0.3612977  38.99966
  14  48.45992  0.3915724  38.49296
  15  48.37063  0.3985982  38.36468
  16  48.39756  0.4088595  38.28730
  17  48.38141  0.4153013  38.31843
  18  48.19710  0.4372807  38.19778
  19  48.52003  0.4282907  38.50323
  20  48.34222  0.4422701  38.43515
  21  48.43653  0.4452800  38.57103
  22  48.43711  0.4549084  38.49626
  23  48.4

In [18]:
library(caret)
library(rpart)      # For decision tree models
library(ggplot2)
library(lattice)
set.seed(123)

data <- read.table("a24_reg_app.txt", header = TRUE, sep = " ")

cross_validate_decision_tree <- function(data, target) {
    cv_errors <- numeric()
    
    # Step 1: Find the best number of folds (k) by cross-validation
    for (k in 3:10) {  # Reduced upper bound to avoid excessive folding on small datasets
        train_control <- trainControl(method = "cv", 
                                      number = k)
        
        # Train the decision tree model with the current number of folds
        model <- train(as.formula(paste(target, "~ .")),
                       data = data,
                       method = "rpart",
                       trControl = train_control)
        
        # Check for any NA values in the RMSE to avoid missing values in results
        if (!any(is.na(model$resample$RMSE))) {
            cv_errors[k - 2] <- mean(model$resample$RMSE)
        } else {
            cv_errors[k - 2] <- NA  # Skip storing if NA is found
        }
    }
    
    # Step 2: Filter out any NA values from cv_errors to avoid issues
    cv_errors <- na.omit(cv_errors)
    best_k <- which.min(cv_errors) + 2
    
    # Step 3: Train the final decision tree model using the best number of folds
    final_train_control <- trainControl(method = "cv", 
                                        number = best_k)
    
    final_model <- train(as.formula(paste(target, "~ .")),
                         data = data,
                         method = "rpart",
                         trControl = final_train_control)
    
    # Step 4: Retrieve the R-squared value for the final model
    final_r_squared <- max(final_model$results$Rsquared, na.rm = TRUE)  # Handle any NA values in R-squared
    
    return(list(model = final_model, best_k = best_k, R_squared = final_r_squared))
}

# Run the function and print results
result <- cross_validate_decision_tree(data, "y")
print(result$best_k)         # Best number of folds for CV
print(result$R_squared)      # Final model R-squared
print(result$model)          # Final model details


“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”


[1] 8
[1] 0.1643124
CART 

500 samples
100 predictors

No pre-processing
Resampling: Cross-Validated (8 fold) 
Summary of sample sizes: 436, 437, 439, 440, 438, 436, ... 
Resampling results across tuning parameters:

  cp          RMSE      Rsquared    MAE     
  0.05532061  53.50864  0.16431237  42.50041
  0.05978122  53.90575  0.14821406  43.27662
  0.14324339  57.58948  0.05434224  46.31702

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.05532061.


In [21]:
library(caret)
library(randomForest) # For random forest models
library(ggplot2)
library(lattice)
set.seed(123)

data <- read.table("a24_reg_app.txt", header = TRUE, sep = " ")

cross_validate_random_forest <- function(data, target) {
    cv_errors <- numeric()
    
    # Step 1: Find the best number of folds (k) by cross-validation
    for (k in 3:10) {  # Limiting k to avoid small sample sizes per fold
        train_control <- trainControl(method = "cv",
                                      number = k) 
        grid_rf <- expand.grid(mtry = c(1:3))  # Hyperparameter grid for mtry
        # Train the random forest model with the current number of folds
        model <- train(as.formula(paste(target, "~ .")),
                       data = data,
                       method = "rf",
                       metric = "Rsquared",
                       trControl = train_control,
                       tuneGrid = grid_rf)  # Adjust tuneLength as needed for hyperparameter tuning
        
        # Check for any NA values in the RMSE to avoid missing values in results
        if (!any(is.na(model$resample$RMSE))) {
            cv_errors[k - 2] <- mean(model$resample$RMSE)
        } else {
            cv_errors[k - 2] <- NA  # Skip storing if NA is found
        }
    }
    
    # Step 2: Filter out any NA values from cv_errors to avoid issues
    cv_errors <- na.omit(cv_errors)
    best_k <- which.min(cv_errors) + 2
    
    # Step 3: Train the final random forest model using the best number of folds
    final_train_control <- trainControl(method = "cv", 
                                        number = best_k)
    grid_rf <- expand.grid(mtry = c(1:3))
    final_model <- train(as.formula(paste(target, "~ .")),
                         data = data,
                         method = "rf",
                         trControl = final_train_control,
                         tuneGrid = grid_rf)  # Adjust tuneLength as needed for hyperparameter tuning
    
    # Step 4: Retrieve the R-squared value for the final model
    final_r_squared <- max(final_model$results$Rsquared, na.rm = TRUE)  # Handle any NA values in R-squared
    
    return(list(model = final_model, best_k = best_k, R_squared = final_r_squared))
}

# Run the function and print results
result <- cross_validate_random_forest(data, "y")
print(result$best_k)         # Best number of folds for CV
print(result$R_squared)      # Final model R-squared
print(result$model)          # Final model details


[1] 7
[1] 0.6643114
Random Forest 

500 samples
100 predictors

No pre-processing
Resampling: Cross-Validated (7 fold) 
Summary of sample sizes: 428, 429, 429, 429, 428, 428, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE     
  1     53.50857  0.5538439  42.56899
  2     52.27357  0.6020620  41.54268
  3     51.13092  0.6643114  40.57656

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 3.


## Stop ICI, cette partie n'est pas encore re travailler et n'est pas encore fonctionnelle

In [22]:
cross_validate_cubic_spline <- function(data, target) {
    predictor <- names(data)[names(data) != target][1]  # Assuming the first predictor variable
    
    # Step 1: Find the best number of folds k for cross-validation
    cv_errors <- numeric()
    for (k in 3:20) {
        train_control <- trainControl(method = "cv", number = k)
        
        # Tune the number of knots by trying different numbers of knots (1 to 10 in this example)
        model <- train(as.formula(paste(target, "~ ns(", predictor, ", knots)")),
                       data = data,
                       method = "lm",
                       trControl = train_control,
                       tuneGrid = data.frame(knots = 1:10))  # Adjust number of knots
        
        # Store the lowest RMSE for each k
        cv_errors[k - 2] <- min(model$results$RMSE)
    }
    
    # Step 2: Select the best number of folds k
    best_k <- which.min(cv_errors) + 2
    
    # Step 3: Using best_k, find the optimal number of knots
    final_train_control <- trainControl(method = "cv", number = best_k)
    final_model <- train(as.formula(paste(target, "~ ns(", predictor, ", knots)")),
                         data = data,
                         method = "lm",
                         trControl = final_train_control,
                         tuneGrid = data.frame(knots = 1:10))  # Adjust number of knots
    
    # Get the best number of knots
    best_knots <- final_model$bestTune$knots
    
    # Step 4: Calculate R-squared for the final model
    final_r_squared <- max(final_model$results$Rsquared)
    
    return(list(model = final_model, best_k = best_k, best_knots = best_knots, R_squared = final_r_squared))
}

# Run the function and print results
result <- cross_validate_cubic_spline(data, "y")
print(result$best_k)         # Best number of folds for CV
print(result$best_knots)     # Optimal number of knots
print(result$R_squared)      # Final model R-squared

ERROR: Error in df - 1L: non-numeric argument to binary operator


In [23]:

library(caret)
library(MASS)       # For QDA and LDA
library(splines)    # For Splines
library(class)      # For KNN
library(glmnet)     # For Ridge, Lasso, and Elastic Net Regression
library(rpart)      # For Decision Trees
library(Matrix)
set.seed(123)
data <- read.table("a24_reg_app.txt", header = TRUE, sep = " ")

cross_validate_models <- function(data, target, model_type) {
    cv_errors <- numeric()
    best_k <- 0
    final_model <- NULL

    for (k in 3:20) {
        train_control <- trainControl(method = "cv", number = k)

        # Select method based on model_type
        if (model_type == "linear_regression") {
            model <- train(as.formula(paste(target, "~ .")),
                           data = data,
                           method = "lm",
                           trControl = train_control)

        } else if (model_type == "ridge_regression") {
            formula <- as.formula(paste(target, "~ ."))
            model <- train(formula,
                           data = data,
                           method = "glmnet",
                           trControl = train_control,
                           tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, length = 100)))

        } else if (model_type == "lasso_regression") {
            formula <- as.formula(paste(target, "~ ."))
            model <- train(formula,
                           data = data,
                           method = "glmnet",
                           trControl = train_control,
                           tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, length = 100)))

        } else if (model_type == "elastic_net_regression") {
            formula <- as.formula(paste(target, "~ ."))
            model <- train(formula,
                           data = data,
                           method = "glmnet",
                           trControl = train_control,
                           tuneGrid = expand.grid(alpha = seq(0, 1, length = 10), lambda = seq(0, 1, length = 100)))

        } else if (model_type == "splines") {
            model <- train(as.formula(paste(target, "~ bs(. , degree=3)")),
                           data = data,
                           method = "lm",
                           trControl = train_control)

        } else if (model_type == "knn") {
            model <- train(as.formula(paste(target, "~ .")),
                           data = data,
                           method = "knn",
                           trControl = train_control,
                           tuneGrid = expand.grid(k = k))  # k here is used in KNN

        } else {
            stop("Invalid model type. Choose from 'linear_regression', 'polynomial_regression', 'ridge_regression', 'lasso_regression', 'elastic_net_regression', 'splines', or 'knn'.")
        }

        # Calculate cross-validation error based on model type
        if (model_type %in% c("linear_regression", "polynomial_regression", "ridge_regression", "lasso_regression", "elastic_net_regression", "splines")) {
            cv_errors[k - 2] <- mean(model$resample$RMSE)
        } else if (model_type == "knn") {
            cv_errors[k - 2] <- 1 - mean(model$resample$Accuracy)
        }
    }

    # Find the value of k that minimizes the cross-validation error
    best_k <- which.min(cv_errors) + 2

    # Retrain the model with the optimal k
    final_train_control <- trainControl(method = "cv", number = best_k)

    if (model_type == "linear_regression") {
        final_model <- train(as.formula(paste(target, "~ .")),
                             data = data,
                             method = "lm",
                             trControl = final_train_control)

    } else if (model_type == "ridge_regression") {
        formula <- as.formula(paste(target, "~ ."))
        final_model <- train(formula,
                             data = data,
                             method = "glmnet",
                             trControl = final_train_control,
                             tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, length = 100)))

    } else if (model_type == "lasso_regression") {
        formula <- as.formula(paste(target, "~ ."))
        final_model <- train(formula,
                             data = data,
                             method = "glmnet",
                             trControl = final_train_control,
                             tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, length = 100)))

    } else if (model_type == "elastic_net_regression") {
        formula <- as.formula(paste(target, "~ ."))
        final_model <- train(formula,
                             data = data,
                             method = "glmnet",
                             trControl = final_train_control,
                             tuneGrid = expand.grid(alpha = seq(0, 1, length = 10), lambda = seq(0, 1, length = 100)))

    } else if (model_type == "splines") {
        final_model <- train(as.formula(paste(target, "~ bs(. , degree=3)")),
                             data = data,
                             method = "lm",
                             trControl = final_train_control)

    } else if (model_type == "knn") {
        final_model <- train(as.formula(paste(target, "~ .")),
                             data = data,
                             method = "knn",
                             trControl = final_train_control,
                             tuneGrid = expand.grid(k = best_k))
    }

    # Return the final model and the best k
    return(list(model = final_model, best_k = best_k))
}

# Example calls:
# For Linear Regression
result_lr <- cross_validate_models(data, "y", "linear_regression")
print(result_lr$best_k)
print(result_lr$model)

# For Ridge Regression
result_rr <- cross_validate_models(data, "y", "ridge_regression")
print(result_rr$best_k)
print(result_rr$model)

# For Lasso Regression
result_lar <- cross_validate_models(data, "y", "lasso_regression")
print(result_lar$best_k)
print(result_lar$model)

# For Elastic Net Regression
result_enr <- cross_validate_models(data, "y", "elastic_net_regression")
print(result_enr$best_k)
print(result_enr$model)

# For Splines
result_splines <- cross_validate_models(data, "y", "splines")
print(result_splines$best_k)
print(result_splines$model)

# For KNN
result_knn <- cross_validate_models(data, "y", "knn")
print(result_knn$best_k)
print(result_knn$model)

ERROR: Error in library(glmnet): there is no package called ‘glmnet’
