In [1]:
# Package management
required_packages <- c("glmnet", "caret", "dplyr", "MASS", "mgcv", "earth")
for(package in required_packages) {
    if (!require(package, character.only = TRUE)) {
        install.packages(package)
        library(package, character.only = TRUE)
    }
}

# Helper function to extract important features
# Names of the predictors whose fitted coefficient is larger than `threshold`
# in absolute value.
#   model_type: "lasso" -> coefficients are read with coef(model, s="lambda.min");
#               "glm"   -> coefficients are read with coef(model).
#   model:      fitted model object matching `model_type`.
#   threshold:  magnitude cut-off; coefficients at or below it are ignored.
# The first coefficient is always discarded (it is treated as the intercept).
# Any other `model_type` yields NULL (invisibly).
get_important_features <- function(model_type, model, threshold = 0.01) {
    if (model_type == "lasso") {
        slopes <- as.matrix(coef(model, s="lambda.min"))[-1, 1]
    } else if (model_type == "glm") {
        slopes <- coef(model)[-1]
    } else {
        return(invisible(NULL))
    }
    # which() silently drops NA coefficients before the names are taken
    names(which(abs(slopes) > threshold))
}

# Lasso with nested CV
# Estimate the out-of-sample MSE of the Lasso with an outer K-fold split;
# lambda is tuned on each outer training set by cv.glmnet().
#   X:           predictors, coerced with as.matrix().
#   y:           response vector.
#   outer_folds: number of outer folds (createFolds).
#   inner_folds: number of folds used by cv.glmnet.
# Returns a list with mean_mse, sd_mse, all_scores (per-fold test MSE),
# mean_features / sd_features (number of retained predictors per fold) and
# feature_frequency (table: number of folds in which each predictor was kept).
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_lasso_nested <- function(X, y, outer_folds=5, inner_folds=5) {
    cat("\nStarting Lasso evaluation\n")
    flush.console()
    
    set.seed(42)
    X <- as.matrix(X)
    outer_cv <- createFolds(y, k=outer_folds, list=TRUE)
    
    outer_scores <- numeric(outer_folds)
    feature_counts <- numeric(outer_folds)
    important_features_list <- vector("list", outer_folds)
    
    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps a matrix even if a fold holds a single row
        X_train <- X[-outer_test_idx, , drop = FALSE]
        y_train <- y[-outer_test_idx]
        X_test <- X[outer_test_idx, , drop = FALSE]
        y_test <- y[outer_test_idx]
        
        cv_fit <- cv.glmnet(X_train, y_train, alpha=1, nfolds=inner_folds)
        final_model <- glmnet(X_train, y_train, alpha=1, lambda=cv_fit$lambda.min)
        
        # final_model is a plain glmnet fit at one lambda, so `s` has to be that
        # numeric lambda; the label "lambda.min" only exists on cv.glmnet objects.
        predictions <- predict(final_model, newx = X_test, s = cv_fit$lambda.min)
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)
        
        important_features_list[[outer_fold]] <- get_important_features("lasso", final_model)
        feature_counts[outer_fold] <- length(important_features_list[[outer_fold]])
        
        cat(sprintf("Lasso Outer fold %d/%d: MSE = %.2f\n", 
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        flush.console()
    }
    
    # How many folds retained each predictor
    feature_freq <- table(unlist(important_features_list))
    
    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores,
        mean_features = mean(feature_counts),
        sd_features = sd(feature_counts),
        feature_frequency = feature_freq
    )
}

# Backward selection with nested CV
# Evaluate backward stepwise selection (step() with penalty k = log(n)) of a
# glm fitted with its default family, using an outer K-fold split.
#   X:           data frame of predictors (names(X) builds the full formula).
#   y:           response vector.
#   outer_folds: number of outer folds (createFolds).
# Returns a list with mean_mse, sd_mse, all_scores, mean_features,
# sd_features and feature_frequency (table of per-fold retained predictors).
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_backward_nested <- function(X, y, outer_folds=5) {
    cat("\nStarting Backward Selection evaluation\n")
    flush.console()
    
    set.seed(42)
    fold_index <- createFolds(y, k=outer_folds, list=TRUE)
    
    # The starting model is the same in every fold: y on all columns of X.
    full_formula <- as.formula(paste("y ~", paste(names(X), collapse = " + ")))
    
    fold_mse <- numeric(outer_folds)
    kept_per_fold <- vector("list", outer_folds)
    
    for (fold in seq_len(outer_folds)) {
        held_out <- fold_index[[fold]]
        train_data <- data.frame(X[-held_out,], y=y[-held_out])
        test_data <- data.frame(X[held_out,])
        
        # step() re-evaluates the glm call in this frame, so `full_formula`
        # and `train_data` must stay visible here under these names.
        full_fit <- glm(full_formula, data=train_data)
        reduced_fit <- step(full_fit,
                            direction="backward",
                            k=log(nrow(train_data)),
                            trace=0)
        
        test_error <- y[held_out] - predict(reduced_fit, newdata=test_data)
        fold_mse[fold] <- mean(test_error^2)
        
        kept_per_fold[[fold]] <- get_important_features("glm", reduced_fit)
        
        cat(sprintf("Backward Outer fold %d/%d: MSE = %.2f\n", 
                   fold, outer_folds, fold_mse[fold]))
        flush.console()
    }
    
    n_kept <- lengths(kept_per_fold)
    
    return(list(
        mean_mse = mean(fold_mse),
        sd_mse = sd(fold_mse),
        all_scores = fold_mse,
        mean_features = mean(n_kept),
        sd_features = sd(n_kept),
        feature_frequency = table(unlist(kept_per_fold))
    ))
}

# Feature selection function
# Combine the per-fold selection frequencies of the Lasso and backward runs
# into one feature set.
#   lasso_results, backward_results: lists carrying a `feature_frequency`
#       table (feature name -> number of folds in which it was retained).
#   selection_type: "tier1", "tier2", "intersection" or "union" of the two
#       tiers; any other value selects no tier feature (only the top
#       correlated variables are returned).
#   fold_threshold_tier1: minimum frequency for Tier 1.
#   fold_threshold_tier2: minimum frequency for Tier 2 (Tier 2 is
#       [fold_threshold_tier2, fold_threshold_tier1)).
#   top_corr_count: how many of the hard-coded top-correlated variables are
#       always appended (the default keeps all ten).
# A feature belongs to a tier if either method puts it there.
# Returns a character vector of unique feature names; prints a summary.
get_selected_features <- function(lasso_results, backward_results,
                                selection_type = "tier1",
                                fold_threshold_tier1 = 4,     
                                fold_threshold_tier2 = 3,     
                                top_corr_count = 10) {    
    
    # Top correlated variables (hardcoded for this data set), strongest first.
    top_corr_vars <- c("X92", "X43", "X57", "X8", "X76", "X44", "X81", "X29", "X46", "X21")
    # Honour top_corr_count instead of always appending the whole list.
    top_corr_vars <- head(top_corr_vars, top_corr_count)
    
    # Split one frequency table into its Tier 1 / Tier 2 feature names,
    # ignoring an intercept entry if present.
    split_tiers <- function(freq_table) {
        feats <- names(freq_table)
        feats <- feats[feats != "(Intercept)"]
        counts <- as.vector(freq_table[feats])
        list(
            tier1 = feats[counts >= fold_threshold_tier1],
            tier2 = feats[counts >= fold_threshold_tier2 &
                          counts < fold_threshold_tier1]
        )
    }
    
    lasso_tiers <- split_tiers(lasso_results$feature_frequency)
    backward_tiers <- split_tiers(backward_results$feature_frequency)
    
    cat("\nTier 1 Summary:")
    cat("\nNumber of Lasso Tier 1 features:", length(lasso_tiers$tier1))
    cat("\nNumber of Backward Tier 1 features:", length(backward_tiers$tier1))
    flush.console()
    
    tier1_features <- unique(c(lasso_tiers$tier1, backward_tiers$tier1))
    cat("\nTotal unique Tier 1 features:", length(tier1_features))
    cat("\nTier 1 features:", paste(sort(tier1_features), collapse=", "), "\n")
    
    cat("\nTier 2 Summary:")
    cat("\nNumber of Lasso Tier 2 features:", length(lasso_tiers$tier2))
    cat("\nNumber of Backward Tier 2 features:", length(backward_tiers$tier2))
    flush.console()
    
    tier2_features <- unique(c(lasso_tiers$tier2, backward_tiers$tier2))
    cat("\nTotal unique Tier 2 features:", length(tier2_features))
    cat("\nTier 2 features:", paste(sort(tier2_features), collapse=", "), "\n")
    
    # An unmatched selection_type falls through to NULL, i.e. no tier feature.
    selected_features <- switch(selection_type,
        "tier1" = tier1_features,
        "tier2" = tier2_features,
        "intersection" = unique(intersect(tier1_features, tier2_features)),
        "union" = unique(union(tier1_features, tier2_features))
    )
    
    selected_features <- unique(c(selected_features, top_corr_vars))
    
    cat("\nFinal Selection Summary:")
    cat(sprintf("\nSelection type: %s", selection_type))
    cat(sprintf("\nNumber of selected features: %d", length(selected_features)))
    cat("\nSelected features:", paste(sort(selected_features), collapse=", "), "\n")
    flush.console()
    
    selected_features
}

# GAM with scaling and descaling
# Cross-validated MSE of an additive model (mgcv::gam, one s(., bs='cr')
# smooth per selected feature, REML) fitted on standardised data.
#   X:                 data frame of predictors.
#   y:                 response vector.
#   selected_features: names of the columns of X to use (at least one).
#   outer_folds:       number of folds (createFolds).
# Predictors and response are centred/scaled with the training-fold
# statistics; predictions are mapped back to the original scale of y before
# the MSE is computed.
# Returns a list with mean_mse, sd_mse and all_scores.
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_gam_nested <- function(X, y, selected_features, outer_folds=5) {
    cat("\nStarting GAM evaluation\n")
    flush.console()
    
    if (length(selected_features) == 0) {
        stop("selected_features must contain at least one column name",
             call. = FALSE)
    }
    
    set.seed(42)
    outer_cv <- createFolds(y, k=outer_folds, list=TRUE)
    outer_scores <- numeric(outer_folds)
    
    # The formula does not depend on the fold, so build it once.
    gam_terms <- paste(sprintf("s(%s, bs='cr')", selected_features), collapse=" + ")
    gam_formula <- as.formula(paste("y ~", gam_terms))
    
    for (outer_fold in seq_len(outer_folds)) {
        # Split data; drop = FALSE keeps a data frame (and therefore the
        # column name the formula refers to) when one feature is selected.
        outer_test_idx <- outer_cv[[outer_fold]]
        X_train <- X[-outer_test_idx, selected_features, drop = FALSE]
        y_train <- y[-outer_test_idx]
        X_test <- X[outer_test_idx, selected_features, drop = FALSE]
        y_test <- y[outer_test_idx]
        
        # Scale features and response with training statistics only
        X_scale <- scale(X_train)
        X_test_scale <- scale(X_test, center=attr(X_scale, "scaled:center"), 
                            scale=attr(X_scale, "scaled:scale"))
        y_scale <- scale(y_train)
        
        # Scaled training data; the response is stored as a plain numeric
        # vector rather than the 1-column matrix returned by scale().
        train_data <- data.frame(X_scale)
        train_data$y <- as.numeric(y_scale)
        
        # Fit model on scaled data
        model <- gam(gam_formula, data=train_data, method="REML")
        
        # Make predictions and map them back to the original scale of y
        predictions_scaled <- predict(model, newdata=data.frame(X_test_scale))
        predictions <- predictions_scaled * attr(y_scale, "scaled:scale") + 
                      attr(y_scale, "scaled:center")
        
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)
        
        cat(sprintf("GAM Outer fold %d/%d: MSE = %.2f\n", 
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        flush.console()
    }
    
    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores
    )
}

# MARS with scaling and descaling
# Nested cross-validation of MARS (earth) on standardised data.
#   X:                 data frame of predictors.
#   y:                 response vector.
#   selected_features: names of the columns of X to use.
#   outer_folds:       number of outer folds used to estimate the test MSE.
#   inner_folds:       number of inner folds used, on each outer training
#                      set, to choose (degree, nprune, thresh) from the grid.
# The hyper-parameters are chosen on the inner folds only and the winning
# configuration is refitted on the whole outer training set; the outer test
# fold is used exactly once, for the reported MSE. (Choosing the
# configuration by its MSE on the outer test fold would report the minimum of
# the grid on test data, which is optimistically biased.)
# Returns a list with mean_mse, sd_mse and all_scores.
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_mars_nested <- function(X, y, selected_features, outer_folds=5,
                                 inner_folds=5) {
    cat("\nStarting MARS evaluation\n")
    flush.console()
    
    set.seed(42)
    outer_cv <- createFolds(y, k=outer_folds, list=TRUE)
    outer_scores <- numeric(outer_folds)
    
    param_grid <- expand.grid(
        degree = c(1, 2),
        nprune = seq(10, 30, by=5),
        thresh = c(0.001, 0.01)
    )
    
    # Standardise with the statistics of (X_tr, y_tr), fit earth with one
    # grid row, and return the MSE on (X_te, y_te) on the original y scale.
    holdout_mse <- function(X_tr, y_tr, X_te, y_te, params) {
        X_scale <- scale(X_tr)
        X_te_scale <- scale(X_te, center=attr(X_scale, "scaled:center"), 
                            scale=attr(X_scale, "scaled:scale"))
        y_scale <- scale(y_tr)
        
        model <- earth(x=X_scale, y=y_scale,
                     degree=params$degree,
                     nprune=params$nprune,
                     thresh=params$thresh,
                     minspan=5,
                     pmethod="backward")
        
        predictions_scaled <- predict(model, X_te_scale)
        predictions <- predictions_scaled * attr(y_scale, "scaled:scale") + 
                      attr(y_scale, "scaled:center")
        mean((y_te - predictions)^2)
    }
    
    for (outer_fold in seq_len(outer_folds)) {
        # Split data; drop = FALSE keeps column names with a single feature
        outer_test_idx <- outer_cv[[outer_fold]]
        X_train <- X[-outer_test_idx, selected_features, drop = FALSE]
        y_train <- y[-outer_test_idx]
        X_test <- X[outer_test_idx, selected_features, drop = FALSE]
        y_test <- y[outer_test_idx]
        
        # Inner CV: mean validation MSE of every grid row on the training set
        inner_cv <- createFolds(y_train, k=inner_folds, list=TRUE)
        grid_mse <- vapply(seq_len(nrow(param_grid)), function(i) {
            mean(vapply(inner_cv, function(val_idx) {
                holdout_mse(X_train[-val_idx, , drop = FALSE], y_train[-val_idx],
                            X_train[val_idx, , drop = FALSE], y_train[val_idx],
                            param_grid[i, ])
            }, numeric(1)))
        }, numeric(1))
        
        # Refit the winning configuration on the full outer training set and
        # score it once on the untouched outer test fold.
        best_params <- param_grid[which.min(grid_mse), ]
        outer_scores[outer_fold] <- holdout_mse(X_train, y_train,
                                                X_test, y_test, best_params)
        
        cat(sprintf("MARS Outer fold %d/%d: MSE = %.2f\n", 
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        flush.console()
    }
    
    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores
    )
}


# Read data
# Load the data: the column named `y` is the response, every column except
# the last one is used as a predictor.
X.reg <- read.table("a24_reg_app.txt")
y <- X.reg$y
X <- X.reg[, -ncol(X.reg)]

# Phase 1: Feature Selection (per-fold selection frequencies of both methods)
lasso_results <- evaluate_lasso_nested(X, y)
backward_results <- evaluate_backward_nested(X, y)

# One feature set per combination rule, stored under the rule's name
selection_methods <- c("intersection","tier2","tier1", "union")
selected_features_list <- lapply(selection_methods, function(method) {
    get_selected_features(lasso_results, backward_results, selection_type=method)
})
names(selected_features_list) <- selection_methods

# Phase 2: Final Models (GAM and MARS are scored on every feature set)
results <- list()
for (method in selection_methods) {
    cat(sprintf("\nEvaluating models with %s features:\n", method))
    flush.console()
    features <- selected_features_list[[method]]
    
    results[[method]] <- list(
        gam = evaluate_gam_nested(X, y, features),
        mars = evaluate_mars_nested(X, y, features)
    )
}

# Print final summary: mean MSE and its standard deviation across folds
cat("\nFinal Results Summary:\n")
for (method in selection_methods) {
    cat(sprintf("\n%s Selection Method:", method))
    cat(sprintf("\nGAM - MSE: %.2f (±%.2f)", 
               results[[method]][["gam"]][["mean_mse"]],
               results[[method]][["gam"]][["sd_mse"]]))
    cat(sprintf("\nMARS - MSE: %.2f (±%.2f)", 
               results[[method]][["mars"]][["mean_mse"]],
               results[[method]][["mars"]][["sd_mse"]]))
}

Le chargement a nécessité le package : glmnet

Le chargement a nécessité le package : Matrix

Loaded glmnet 4.1-8

Le chargement a nécessité le package : caret

Le chargement a nécessité le package : ggplot2

Le chargement a nécessité le package : lattice

Le chargement a nécessité le package : dplyr


Attachement du package : 'dplyr'


Les objets suivants sont masqués depuis 'package:stats':

    filter, lag


Les objets suivants sont masqués depuis 'package:base':

    intersect, setdiff, setequal, union


Le chargement a nécessité le package : MASS


Attachement du package : 'MASS'


L'objet suivant est masqué depuis 'package:dplyr':

    select


Le chargement a nécessité le package : mgcv

Le chargement a nécessité le package : nlme


Attachement du package : 'nlme'


L'objet suivant est masqué depuis 'package:dplyr':

    collapse


This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

Le chargement a nécessité le package : earth

"le package 'earth' a été compilé avec l


Starting Lasso evaluation
Lasso Outer fold 1/5: MSE = 109.07
Lasso Outer fold 2/5: MSE = 145.72
Lasso Outer fold 3/5: MSE = 148.38
Lasso Outer fold 4/5: MSE = 128.17
Lasso Outer fold 5/5: MSE = 141.67

Starting Backward Selection evaluation
Backward Outer fold 1/5: MSE = 106.16
Backward Outer fold 2/5: MSE = 146.82
Backward Outer fold 3/5: MSE = 157.84
Backward Outer fold 4/5: MSE = 125.04
Backward Outer fold 5/5: MSE = 135.06

Tier 1 Summary:
Number of Lasso Tier 1 features: 63
Number of Backward Tier 1 features: 42
Total unique Tier 1 features: 63
Tier 1 features: X1, X10, X11, X13, X14, X15, X19, X2, X20, X21, X25, X27, X29, X3, X30, X31, X33, X36, X39, X4, X40, X42, X43, X44, X45, X46, X47, X48, X51, X52, X54, X55, X56, X57, X58, X59, X6, X61, X62, X65, X67, X68, X69, X71, X72, X74, X75, X76, X77, X8, X80, X81, X84, X87, X89, X90, X92, X93, X95, X96, X97, X98, X99 

Tier 2 Summary:
Number of Lasso Tier 2 features: 13
Number of Backward Tier 2 features: 1
Total unique Tier 2 featur

In [2]:
# Package management
required_packages <- c("glmnet", "caret", "dplyr", "MASS", "mgcv", "earth", "corrplot")
for(package in required_packages) {
    if (!require(package, character.only = TRUE)) {
        install.packages(package)
        library(package, character.only = TRUE)
    }
}

# Helper function to extract important features with coefficients
# Predictors whose fitted coefficient is larger than `threshold` in absolute
# value, together with those coefficients.
#   model_type: "lasso" -> coefficients are read with coef(model, s="lambda.min");
#               "glm"   -> coefficients are read with coef(model).
#   model:      fitted model object matching `model_type`.
#   threshold:  magnitude cut-off; coefficients at or below it are ignored.
# The first coefficient is always discarded (it is treated as the intercept).
# Returns list(features = <names>, coefficients = <named values>), or NULL
# (invisibly) for any other `model_type`.
get_important_features <- function(model_type, model, threshold = 0.01) {
    if (model_type == "lasso") {
        slopes <- as.matrix(coef(model, s="lambda.min"))[-1, 1]
    } else if (model_type == "glm") {
        slopes <- coef(model)[-1]
    } else {
        return(invisible(NULL))
    }
    exceeds <- abs(slopes) > threshold
    list(
        features = names(which(exceeds)),
        coefficients = slopes[exceeds]
    )
}

# Function to calculate correlations with target
# Rank the predictors by the absolute Pearson correlation with the target.
#   X:     data frame of numeric predictors (iterated column by column).
#   y:     numeric response vector.
#   top_n: number of rows to keep in `top_n`; if it exceeds the number of
#          predictors, all predictors are returned (no NA padding rows).
# Returns list(top_n = <first top_n rows>, all_correlations = <all rows>);
# both are data frames with columns variable, pearson_cor, spearman_cor,
# abs_pearson, abs_spearman, sorted by decreasing abs_pearson.
get_top_correlations <- function(X, y, top_n = 10) {
    # vapply guarantees one numeric value per column, whatever the input size
    cor_with_y <- function(method) {
        vapply(X, function(x) cor(x, y, method = method), numeric(1))
    }
    correlations <- cor_with_y("pearson")
    spearman_cors <- cor_with_y("spearman")
    
    cor_df <- data.frame(
        variable = names(correlations),
        pearson_cor = correlations,
        spearman_cor = spearman_cors,
        abs_pearson = abs(correlations),
        abs_spearman = abs(spearman_cors)
    )
    
    cor_df <- cor_df[order(-cor_df$abs_pearson), ]
    list(
        # head() never indexes past the last row, unlike cor_df[1:top_n, ]
        top_n = head(cor_df, top_n),
        all_correlations = cor_df
    )
}

# Lasso with nested CV and detailed feature tracking
# Estimate the out-of-sample MSE of the Lasso with an outer K-fold split and
# record, per fold, which predictors were retained and with what coefficient.
#   X:           predictors, coerced with as.matrix().
#   y:           response vector.
#   outer_folds: number of outer folds (createFolds).
#   inner_folds: number of folds used by cv.glmnet to choose lambda.
# Returns a list with mean_mse, sd_mse, all_scores (per-fold test MSE) and
# feature_summary: one row per predictor retained in at least one fold, with
# frequency (number of folds), mean_coef and sd_coef of |coefficient| across
# folds (a fold that dropped the predictor contributes 0).
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_lasso_nested <- function(X, y, outer_folds=5, inner_folds=5) {
    cat("\nStarting Lasso evaluation\n")
    flush.console()
    
    set.seed(42)
    X <- as.matrix(X)
    outer_cv <- createFolds(y, k=outer_folds, list=TRUE)
    
    outer_scores <- numeric(outer_folds)
    feature_info_list <- vector("list", outer_folds)
    
    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps a matrix even if a fold holds a single row
        X_train <- X[-outer_test_idx, , drop = FALSE]
        y_train <- y[-outer_test_idx]
        X_test <- X[outer_test_idx, , drop = FALSE]
        y_test <- y[outer_test_idx]
        
        cv_fit <- cv.glmnet(X_train, y_train, alpha=1, nfolds=inner_folds)
        final_model <- glmnet(X_train, y_train, alpha=1, lambda=cv_fit$lambda.min)
        
        # final_model is a plain glmnet fit at one lambda, so `s` has to be that
        # numeric lambda; the label "lambda.min" only exists on cv.glmnet objects.
        predictions <- predict(final_model, newx = X_test, s = cv_fit$lambda.min)
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)
        
        feature_info <- get_important_features("lasso", final_model)
        feature_info_list[[outer_fold]] <- feature_info
        
        cat(sprintf("Lasso Outer fold %d/%d: MSE = %.2f, Features = %d\n", 
                   outer_fold, outer_folds, outer_scores[outer_fold],
                   length(feature_info$features)))
        flush.console()
    }
    
    # Aggregate feature information: rows = predictors, columns = folds,
    # cell = |coefficient| in that fold (0 when the predictor was dropped).
    all_features <- unique(unlist(lapply(feature_info_list, function(x) x$features)))
    feature_matrix <- matrix(0, nrow=length(all_features), ncol=outer_folds)
    rownames(feature_matrix) <- all_features
    
    for (i in seq_len(outer_folds)) {
        feature_matrix[feature_info_list[[i]]$features, i] <- 
            abs(feature_info_list[[i]]$coefficients)
    }
    
    feature_summary <- data.frame(
        feature = all_features,
        frequency = rowSums(feature_matrix > 0),
        mean_coef = rowMeans(feature_matrix),
        sd_coef = apply(feature_matrix, 1, sd)
    )
    
    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores,
        feature_summary = feature_summary
    )
}

# Backward selection with nested CV and detailed feature tracking
# Evaluate backward stepwise selection (step() with penalty k = log(n)) of a
# glm fitted with its default family, using an outer K-fold split, and record
# per fold which predictors were retained and with what coefficient.
#   X:           data frame of predictors (names(X) builds the full formula).
#   y:           response vector.
#   outer_folds: number of outer folds (createFolds).
# Returns a list with mean_mse, sd_mse, all_scores and feature_summary (one
# row per predictor retained in at least one fold: frequency, mean_coef and
# sd_coef of |coefficient| across folds, 0 where the predictor was dropped).
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_backward_nested <- function(X, y, outer_folds=5) {
    cat("\nStarting Backward Selection evaluation\n")
    flush.console()
    
    set.seed(42)
    fold_index <- createFolds(y, k=outer_folds, list=TRUE)
    
    # The starting model is the same in every fold: y on all columns of X.
    full_formula <- as.formula(paste("y ~", paste(names(X), collapse = " + ")))
    
    fold_mse <- numeric(outer_folds)
    fold_info <- vector("list", outer_folds)
    
    for (fold in seq_len(outer_folds)) {
        held_out <- fold_index[[fold]]
        train_data <- data.frame(X[-held_out,], y=y[-held_out])
        test_data <- data.frame(X[held_out,])
        
        # step() re-evaluates the glm call in this frame, so `full_formula`
        # and `train_data` must stay visible here under these names.
        full_fit <- glm(full_formula, data=train_data)
        reduced_fit <- step(full_fit,
                            direction="backward",
                            k=log(nrow(train_data)),
                            trace=0)
        
        test_error <- y[held_out] - predict(reduced_fit, newdata=test_data)
        fold_mse[fold] <- mean(test_error^2)
        
        kept <- get_important_features("glm", reduced_fit)
        fold_info[[fold]] <- kept
        
        cat(sprintf("Backward Outer fold %d/%d: MSE = %.2f, Features = %d\n", 
                   fold, outer_folds, fold_mse[fold],
                   length(kept$features)))
        flush.console()
    }
    
    # Predictors x folds matrix of |coefficient|; 0 marks "not retained".
    all_features <- unique(unlist(lapply(fold_info, function(info) info$features)))
    coef_by_fold <- matrix(0, nrow=length(all_features), ncol=outer_folds,
                           dimnames=list(all_features, NULL))
    for (fold in seq_len(outer_folds)) {
        coef_by_fold[fold_info[[fold]]$features, fold] <- 
            abs(fold_info[[fold]]$coefficients)
    }
    
    return(list(
        mean_mse = mean(fold_mse),
        sd_mse = sd(fold_mse),
        all_scores = fold_mse,
        feature_summary = data.frame(
            feature = all_features,
            frequency = rowSums(coef_by_fold > 0),
            mean_coef = rowMeans(coef_by_fold),
            sd_coef = apply(coef_by_fold, 1, sd)
        )
    ))
}

# Optimized feature selection
# Build up to three alternative feature sets from the Lasso / backward
# selection summaries and the correlation ranking.
#   lasso_results, backward_results: lists with `feature_summary` (columns
#       feature, frequency, mean_coef) and `all_scores` (per-fold MSE).
#   cor_results: list with `top_n` and `all_correlations` data frames
#       (columns variable, pearson_cor).
#   selection_methods: any subset of "tier1plus", "weighted", "stable".
# Returns a named list with one character vector per requested method:
#   tier1plus - features kept in 5 folds by either method, or in >= 4 folds by
#               both, plus the top correlated variables;
#   weighted  - the (at most) 50 best features by
#               0.2 * frequency + 0.3 * |mean_coef| summed over both methods,
#               plus 0.5 * |pearson_cor| for top correlated variables;
#   stable    - features whose frequency is at least the number of folds
#               that scored below that method's median MSE.
# NOTE: "stable" only uses the *count* of better-than-median folds as a
# frequency cut-off; it does not check in which folds a feature was kept.
get_optimized_features <- function(lasso_results, backward_results, cor_results, 
                                 selection_methods = c("tier1plus", "weighted", "stable")) {
    lasso_fs <- lasso_results$feature_summary
    backward_fs <- backward_results$feature_summary
    
    # Get correlation information
    top_cors <- cor_results$top_n$variable
    
    results <- list()
    
    # Tier1+ selection
    if ("tier1plus" %in% selection_methods) {
        # Variables in all 5 folds
        lasso_perfect <- lasso_fs$feature[lasso_fs$frequency == 5]
        backward_perfect <- backward_fs$feature[backward_fs$frequency == 5]
        
        # Variables in both methods with freq >= 4
        both_freq4 <- intersect(lasso_fs$feature[lasso_fs$frequency >= 4],
                                backward_fs$feature[backward_fs$frequency >= 4])
        
        results$tier1plus <- unique(c(lasso_perfect, backward_perfect,
                                      both_freq4, top_cors))
    }
    
    # Weighted approach
    if ("weighted" %in% selection_methods) {
        all_features <- unique(c(lasso_fs$feature, backward_fs$feature))
        # One score per feature; sized from the data so an empty feature
        # list gives an empty result instead of an error.
        score <- numeric(length(all_features))
        
        # Add the frequency / coefficient contribution of one method to the
        # features that method retained (vectorised lookup via match()).
        add_method_score <- function(score, fs) {
            idx <- match(all_features, fs$feature)
            hit <- !is.na(idx)
            score[hit] <- score[hit] +
                fs$frequency[idx[hit]] * 0.2 +
                abs(fs$mean_coef[idx[hit]]) * 0.3
            score
        }
        score <- add_method_score(score, lasso_fs)
        score <- add_method_score(score, backward_fs)
        
        # Correlation bonus for the top correlated variables
        in_top <- all_features %in% top_cors
        cor_idx <- match(all_features[in_top], cor_results$all_correlations$variable)
        score[in_top] <- score[in_top] +
            abs(cor_results$all_correlations$pearson_cor[cor_idx]) * 0.5
        
        # Select top features based on weighted score (ties keep input order)
        results$weighted <- head(all_features[order(-score)], 50)
    }
    
    # Stable selection (frequency cut-off derived from cross-validation scores)
    if ("stable" %in% selection_methods) {
        # Number of folds that scored better than the method's median
        n_good_lasso <- sum(lasso_results$all_scores < median(lasso_results$all_scores),
                            na.rm = TRUE)
        n_good_backward <- sum(backward_results$all_scores < median(backward_results$all_scores),
                               na.rm = TRUE)
        
        results$stable <- unique(c(
            lasso_fs$feature[lasso_fs$frequency >= n_good_lasso],
            backward_fs$feature[backward_fs$frequency >= n_good_backward]
        ))
    }
    
    results
}

# GAM evaluation
# Cross-validated MSE of an additive model (mgcv::gam, one s(., bs='cr')
# smooth per selected feature, REML) on the unscaled data.
#   X:                 data frame of predictors.
#   y:                 response vector.
#   selected_features: names of the columns of X to use (at least one).
#   outer_folds:       number of folds (createFolds).
# Returns a list with mean_mse, sd_mse and all_scores (per-fold test MSE).
# Side effect: re-seeds the global RNG with set.seed(42) and prints progress.
evaluate_gam_nested <- function(X, y, selected_features, outer_folds=5) {
    cat(sprintf("\nStarting GAM evaluation with %d features\n", length(selected_features)))
    flush.console()
    
    if (length(selected_features) == 0) {
        stop("selected_features must contain at least one column name",
             call. = FALSE)
    }
    
    set.seed(42)
    outer_cv <- createFolds(y, k=outer_folds, list=TRUE)
    outer_scores <- numeric(outer_folds)
    
    gam_terms <- paste(sprintf("s(%s, bs='cr')", selected_features), collapse=" + ")
    gam_formula <- as.formula(paste("y ~", gam_terms))
    
    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps a data frame (and therefore the column name the
        # formula refers to) even when a single feature is selected.
        train_data <- data.frame(X[-outer_test_idx, selected_features, drop = FALSE], 
                               y=y[-outer_test_idx])
        test_data <- data.frame(X[outer_test_idx, selected_features, drop = FALSE])
        y_test <- y[outer_test_idx]
        
        model <- gam(gam_formula, data=train_data, method="REML")
        predictions <- predict(model, newdata=test_data)
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)
        
        cat(sprintf("GAM Outer fold %d/%d: MSE = %.2f\n", 
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        flush.console()
    }
    
    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores
    )
}

# Main execution
# Run the whole analysis: load the data, rank correlations, run the Lasso and
# backward-selection baselines, derive the optimised feature sets, score a GAM
# on each of them and print a comparative report.
# Reads "a24_reg_app.txt" from the working directory; the column named `y` is
# the response and every column except the last one is used as a predictor.
# Returns a list with baseline (lasso, backward), optimized (GAM results per
# feature set), selected_features, performance_summary and correlations.
main <- function() {
    # Read data
    X.reg <- read.table("a24_reg_app.txt")
    cat("Data loaded successfully\n")
    X <- X.reg[, -ncol(X.reg)]
    y <- X.reg$y
    
    # Get correlation information
    cor_results <- get_top_correlations(X, y)
    cat("\nTop 10 correlated variables with target:\n")
    print(cor_results$top_n)
    
    # Phase 1: Initial feature selection
    lasso_results <- evaluate_lasso_nested(X, y)
    backward_results <- evaluate_backward_nested(X, y)
    
    # Phase 2: Optimized feature selection
    selected_features <- get_optimized_features(lasso_results, backward_results, cor_results)
    
    # Phase 3: Evaluate with different feature sets
    results <- list()
    
    # Print detailed summary
    cat("\n=== COMPREHENSIVE ANALYSIS RESULTS ===\n")
    
    # 1. Baseline Results
    cat("\n1. BASELINE MODELS")
    cat("\n-----------------")
    cat(sprintf("\nLasso - Mean MSE: %.2f (±%.2f)", 
                lasso_results$mean_mse, lasso_results$sd_mse))
    cat("\nLasso feature selection stability:")
    print(table(lasso_results$feature_summary$frequency))
    
    cat(sprintf("\nBackward - Mean MSE: %.2f (±%.2f)", 
                backward_results$mean_mse, backward_results$sd_mse))
    cat("\nBackward feature selection stability:")
    print(table(backward_results$feature_summary$frequency))
    
    # 2. Optimized Results
    cat("\n\n2. OPTIMIZED FEATURE SELECTION RESULTS")
    cat("\n-------------------------------------")
    for (method in names(selected_features)) {
        cat(sprintf("\n\n%s Selection Method:", toupper(method)))
        cat(sprintf("\nNumber of features: %d", length(selected_features[[method]])))
        cat("\nSelected features:", paste(sort(selected_features[[method]]), collapse=", "))
        
        # Evaluate GAM with this feature set
        results[[method]] <- evaluate_gam_nested(X, y, selected_features[[method]])
        
        cat(sprintf("\nGAM Performance - MSE: %.2f (±%.2f)", 
                   results[[method]]$mean_mse, results[[method]]$sd_mse))
        
        # Feature overlap analysis
        cat("\nFeature overlap with top correlations:", 
            length(intersect(selected_features[[method]], cor_results$top_n$variable)))
    }
    
    # 3. Comparative Analysis
    cat("\n\n3. COMPARATIVE ANALYSIS")
    cat("\n----------------------")
    
    # One summary row per model; rows are collected in a list and bound once.
    summary_row <- function(label, n_features, fit) {
        data.frame(
            Method = label,
            Features = n_features,
            MSE = fit$mean_mse,
            SD = fit$sd_mse,
            stringsAsFactors = FALSE
        )
    }
    
    summary_rows <- c(
        # Baseline methods
        list(
            summary_row("Lasso", nrow(lasso_results$feature_summary), lasso_results),
            summary_row("Backward", nrow(backward_results$feature_summary), backward_results)
        ),
        # Optimized methods
        lapply(names(results), function(method) {
            summary_row(paste("GAM", method),
                        length(selected_features[[method]]),
                        results[[method]])
        })
    )
    performance_summary <- do.call(rbind, summary_rows)
    
    # Sort by MSE
    performance_summary <- performance_summary[order(performance_summary$MSE), ]
    
    cat("\nPerformance Summary (sorted by MSE):\n")
    print(performance_summary)
    
    # 4. Recommendations
    cat("\n\n4. RECOMMENDATIONS")
    cat("\n----------------")
    best_method <- as.character(performance_summary$Method[1])
    # A baseline can win too; in that case recommend the features that
    # baseline retained instead of looking it up among the GAM feature sets
    # (where it does not exist and would yield an empty recommendation).
    best_features <- switch(best_method,
        "Lasso" = lasso_results$feature_summary$feature,
        "Backward" = backward_results$feature_summary$feature,
        selected_features[[gsub("GAM ", "", best_method)]]
    )
    
    cat(sprintf("\nBest performing method: %s", best_method))
    cat(sprintf("\nNumber of features: %d", length(best_features)))
    cat(sprintf("\nMean MSE: %.2f (±%.2f)", 
                performance_summary$MSE[1], 
                performance_summary$SD[1]))
    cat("\nRecommended features:", paste(sort(best_features), collapse=", "))
    
    # Return all results for further analysis if needed
    list(
        baseline = list(
            lasso = lasso_results,
            backward = backward_results
        ),
        optimized = results,
        selected_features = selected_features,
        performance_summary = performance_summary,
        correlations = cor_results
    )
}

# Run the analysis; the list returned by main() is kept in `results` so the
# baseline, optimised and summary objects can be inspected afterwards.
results <- main()

Le chargement a nécessité le package : corrplot

corrplot 0.95 loaded



Data loaded successfully

Top 10 correlated variables with target:
    variable pearson_cor spearman_cor abs_pearson abs_spearman
X92      X92   0.3775479    0.3652928   0.3775479    0.3652928
X43      X43   0.3232837    0.3230231   0.3232837    0.3230231
X57      X57  -0.2718024   -0.2617451   0.2718024    0.2617451
X8        X8   0.2676847    0.2717730   0.2676847    0.2717730
X76      X76  -0.2495047   -0.2459881   0.2495047    0.2459881
X44      X44  -0.2283222   -0.2406929   0.2283222    0.2406929
X81      X81  -0.2148017   -0.2068499   0.2148017    0.2068499
X29      X29   0.2060082    0.1970644   0.2060082    0.1970644
X46      X46  -0.1957796   -0.1916278   0.1957796    0.1916278
X21      X21   0.1897579    0.1726961   0.1897579    0.1726961

Starting Lasso evaluation
Lasso Outer fold 1/5: MSE = 109.07, Features = 82
Lasso Outer fold 2/5: MSE = 145.72, Features = 82
Lasso Outer fold 3/5: MSE = 148.38, Features = 68
Lasso Outer fold 4/5: MSE = 128.17, Features = 80
Lasso Outer f

In [1]:
# Package management
# For every dependency: install it when it is not available yet, then attach it.
# requireNamespace() is only used as an availability probe; the attach itself is
# done by library(), which raises an error (instead of returning FALSE) when the
# package still cannot be loaded after the install attempt.
required_packages <- c("glmnet", "caret", "dplyr", "MASS", "mgcv", "earth", "corrplot")
for (package in required_packages) {
    if (!requireNamespace(package, quietly = TRUE)) {
        install.packages(package)
    }
    library(package, character.only = TRUE)
}

# Helper function to extract important features with coefficients
#
# Arguments:
#   model_type: "lasso" (coefficients read with coef(model, s = "lambda.min"))
#               or "glm" (coefficients read with coef(model)).
#   model:      fitted model object understood by coef().
#   threshold:  a coefficient is kept when its absolute value is strictly
#               greater than this value.
#
# Returns a list with two aligned elements:
#   features:     names of the kept coefficients.
#   coefficients: the kept coefficient values (named).
# The first coefficient is dropped in both branches, i.e. it is treated as the
# intercept. Coefficients that are NA are never kept, so `features` and
# `coefficients` always have the same length.
#
# Raises an error for any other model_type.
get_important_features <- function(model_type, model, threshold = 0.01) {
    if (model_type == "lasso") {
        coef_matrix <- as.matrix(coef(model, s = "lambda.min"))
        # Rebuild the named vector explicitly: plain `[-1, 1]` loses the row
        # name when only one non-intercept coefficient is left.
        slopes <- setNames(coef_matrix[-1, 1], rownames(coef_matrix)[-1])
    } else if (model_type == "glm") {
        slopes <- coef(model)[-1]
    } else {
        stop("Unsupported model_type: ", model_type,
             " (expected \"lasso\" or \"glm\")", call. = FALSE)
    }

    # One mask drives both outputs; NA coefficients count as "not important".
    keep <- !is.na(slopes) & abs(slopes) > threshold

    list(
        features = names(slopes)[keep],
        coefficients = slopes[keep]
    )
}

# Function to calculate correlations with target
#
# Arguments:
#   X:     predictors, one variable per column (data frame, or anything
#          as.data.frame() can convert, e.g. a matrix).
#   y:     target vector, one entry per row of X.
#   top_n: maximum number of rows to report in `top_n`.
#
# Returns a list:
#   top_n:            the first `top_n` rows of the table below (fewer when X
#                     has fewer columns; no NA padding rows).
#   all_correlations: data frame with one row per variable and the columns
#                     variable, pearson_cor, spearman_cor, abs_pearson,
#                     abs_spearman, sorted by decreasing abs_pearson.
get_top_correlations <- function(X, y, top_n = 10) {
    # Iterate over columns, not over matrix cells.
    X <- as.data.frame(X)

    # vapply() guarantees a numeric vector even when X has no columns.
    pearson <- vapply(X, function(col) cor(col, y, method = "pearson"), numeric(1))
    spearman <- vapply(X, function(col) cor(col, y, method = "spearman"), numeric(1))

    cor_df <- data.frame(
        variable = names(pearson),
        pearson_cor = pearson,
        spearman_cor = spearman,
        abs_pearson = abs(pearson),
        abs_spearman = abs(spearman)
    )
    cor_df <- cor_df[order(-cor_df$abs_pearson), ]

    list(
        # head() stops at the last available row, whereas 1:top_n indexing
        # would append all-NA rows when top_n > nrow(cor_df).
        top_n = head(cor_df, top_n),
        all_correlations = cor_df
    )
}

# Lasso with nested CV
#
# Outer loop: createFolds() splits the observations into `outer_folds` held-out
# folds. Inner loop: cv.glmnet() selects lambda with `inner_folds`-fold CV on
# the training rows only; the held-out fold is then scored at lambda.min.
#
# Arguments:
#   X:           predictors; coerced with as.matrix().
#   y:           response vector, one entry per row of X.
#   outer_folds: number of outer (evaluation) folds.
#   inner_folds: number of folds passed to cv.glmnet().
#
# Returns a list:
#   mean_mse, sd_mse: mean and standard deviation of the outer-fold MSEs.
#   all_scores:       the MSE of every outer fold.
#   feature_info:     per-fold result of get_important_features("lasso", ...).
#
# Side effects: calls set.seed(42) (resets the global RNG state) and prints
# one progress line per fold.
evaluate_lasso_nested <- function(X, y, outer_folds=5, inner_folds=5) {
    cat("\nStarting Lasso evaluation\n")
    flush.console()

    set.seed(42)
    X <- as.matrix(X)
    outer_cv <- createFolds(y, k = outer_folds, list = TRUE)

    outer_scores <- numeric(outer_folds)
    feature_info_list <- vector("list", outer_folds)

    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps the matrix shape even when a fold holds a single
        # row; without it the subset collapses to a plain vector.
        X_train <- X[-outer_test_idx, , drop = FALSE]
        y_train <- y[-outer_test_idx]
        X_test <- X[outer_test_idx, , drop = FALSE]
        y_test <- y[outer_test_idx]

        cv_fit <- cv.glmnet(X_train, y_train, alpha = 1, nfolds = inner_folds)
        predictions <- predict(cv_fit, X_test, s = "lambda.min")
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)

        feature_info_list[[outer_fold]] <- get_important_features("lasso", cv_fit)

        cat(sprintf("Lasso Outer fold %d/%d: MSE = %.2f\n",
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        # Flush per fold so progress is visible while later folds are fitted.
        flush.console()
    }

    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores,
        feature_info = feature_info_list
    )
}

# Backward selection with nested CV
#
# For every outer fold a full Gaussian glm on all predictors is fitted on the
# training rows and pruned with step(direction = "backward"), using the
# penalty k = log(n_train) (the BIC penalty). The pruned model is scored on
# the held-out fold.
#
# Arguments:
#   X:           predictors, one variable per column (data frame, or anything
#                as.data.frame() can convert, e.g. a matrix).
#   y:           response vector, one entry per row of X.
#   outer_folds: number of outer (evaluation) folds.
#
# Returns a list:
#   mean_mse, sd_mse: mean and standard deviation of the outer-fold MSEs.
#   all_scores:       the MSE of every outer fold.
#   feature_info:     per-fold result of get_important_features("glm", ...).
#
# Side effects: calls set.seed(42) (resets the global RNG state) and prints
# one progress line per fold.
evaluate_backward_nested <- function(X, y, outer_folds=5) {
    cat("\nStarting Backward Selection evaluation\n")
    flush.console()

    set.seed(42)
    # names(X) is NULL for a matrix; a data frame gives the formula its terms.
    X <- as.data.frame(X)
    outer_cv <- createFolds(y, k = outer_folds, list = TRUE)

    outer_scores <- numeric(outer_folds)
    feature_info_list <- vector("list", outer_folds)

    # The starting formula is the same for every fold, so build it once.
    full_formula <- as.formula(paste("y ~", paste(names(X), collapse = " + ")))

    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps the column name when X has a single predictor.
        train_data <- data.frame(X[-outer_test_idx, , drop = FALSE],
                                 y = y[-outer_test_idx])
        test_data <- data.frame(X[outer_test_idx, , drop = FALSE])
        y_test <- y[outer_test_idx]

        model <- step(glm(full_formula, data = train_data),
                     direction = "backward",
                     k = log(nrow(train_data)),
                     trace = 0)

        predictions <- predict(model, newdata = test_data)
        outer_scores[outer_fold] <- mean((y_test - predictions)^2)

        feature_info_list[[outer_fold]] <- get_important_features("glm", model)

        cat(sprintf("Backward Outer fold %d/%d: MSE = %.2f\n",
                   outer_fold, outer_folds, outer_scores[outer_fold]))
        # Flush per fold so progress is visible while later folds are fitted.
        flush.console()
    }

    list(
        mean_mse = mean(outer_scores),
        sd_mse = sd(outer_scores),
        all_scores = outer_scores,
        feature_info = feature_info_list
    )
}

# Function to get optimized features
#
# Builds the candidate feature set as a union (duplicates removed, first
# occurrence order kept) of:
#   1. every feature reported in any fold of `lasso_results$feature_info`,
#   2. every feature reported in any fold of `backward_results$feature_info`,
#   3. the `variable` column of `cor_results$top_n`.
#
# Returns the combined vector of feature names.
get_optimized_features <- function(lasso_results, backward_results, cor_results) {
    # Distinct feature names seen across all folds of one nested-CV result
    features_across_folds <- function(cv_results) {
        per_fold <- lapply(cv_results$feature_info, function(fold) fold$features)
        unique(unlist(per_fold))
    }

    from_lasso <- features_across_folds(lasso_results)
    from_backward <- features_across_folds(backward_results)
    from_correlation <- cor_results$top_n$variable

    unique(c(from_lasso, from_backward, from_correlation))
}

# Function to evaluate GAM with different settings
#
# Cross-validates two additive models built on `selected_features`:
#   cubic:          one s(<feature>, bs='cr') term per feature, method = "REML".
#   default_select: one s(<feature>) term per feature, method = "REML",
#                   select = TRUE.
#
# Arguments:
#   X:                 predictors with named columns.
#   y:                 response vector, one entry per row of X.
#   selected_features: non-empty character vector of column names of X.
#   outer_folds:       number of evaluation folds.
#
# Returns a list with elements `cubic` and `default_select`, each a list of
# mean_mse, sd_mse and all_scores (the per-fold MSEs).
#
# Side effects: calls set.seed(42) (resets the global RNG state) and prints
# the MSE of both variants for every fold.
evaluate_gam_variants <- function(X, y, selected_features, outer_folds=5) {
    if (length(selected_features) == 0) {
        stop("selected_features must contain at least one feature", call. = FALSE)
    }

    set.seed(42)
    outer_cv <- createFolds(y, k = outer_folds, list = TRUE)

    # Results storage
    results <- list(
        cubic = numeric(outer_folds),
        default_select = numeric(outer_folds)
    )

    # Create formulas
    cubic_terms <- paste(sprintf("s(%s, bs='cr')", selected_features), collapse = " + ")
    default_terms <- paste(sprintf("s(%s)", selected_features), collapse = " + ")

    cubic_formula <- as.formula(paste("y ~", cubic_terms))
    default_formula <- as.formula(paste("y ~", default_terms))

    cat("\nStarting GAM evaluations with different settings\n")

    for (outer_fold in seq_len(outer_folds)) {
        outer_test_idx <- outer_cv[[outer_fold]]
        # drop = FALSE keeps the column name when only one feature is selected;
        # otherwise the subset collapses to an unnamed vector and the formula
        # terms can no longer be found in the data.
        train_data <- data.frame(X[-outer_test_idx, selected_features, drop = FALSE],
                                 y = y[-outer_test_idx])
        test_data <- data.frame(X[outer_test_idx, selected_features, drop = FALSE])
        y_test <- y[outer_test_idx]

        # Fit cubic spline model
        model_cubic <- gam(cubic_formula, data = train_data, method = "REML")
        pred_cubic <- predict(model_cubic, newdata = test_data)
        results$cubic[outer_fold] <- mean((y_test - pred_cubic)^2)

        # Fit default spline with selection
        model_default <- gam(default_formula, data = train_data, method = "REML", select = TRUE)
        pred_default <- predict(model_default, newdata = test_data)
        results$default_select[outer_fold] <- mean((y_test - pred_default)^2)

        cat(sprintf("\nFold %d/%d:", outer_fold, outer_folds))
        cat(sprintf("\n  Cubic Spline MSE: %.2f", results$cubic[outer_fold]))
        cat(sprintf("\n  Default+Select MSE: %.2f", results$default_select[outer_fold]))
        flush.console()
    }

    # Same summary shape for both variants
    summarise_scores <- function(scores) {
        list(
            mean_mse = mean(scores),
            sd_mse = sd(scores),
            all_scores = scores
        )
    }

    list(
        cubic = summarise_scores(results$cubic),
        default_select = summarise_scores(results$default_select)
    )
}

# Main execution
#
# Runs the whole pipeline: load the data, rank predictors by correlation with
# the target, evaluate Lasso and backward selection with nested CV, combine
# their features with the top correlated variables, and evaluate two GAM
# variants on that combined feature set.
#
# Arguments:
#   data_path: file read with read.table(); must contain a response column
#              named "y". Every other column is used as a predictor.
#
# Returns a list:
#   performance_summary: data frame (Method, MSE, SD, Features), sorted by MSE.
#   selected_features:   feature names used for the GAM variants.
#   all_results:         list with the raw lasso, backward and gam results.
#
# Side effects: prints progress and the summary tables.
main <- function(data_path = "a24_reg_app.txt") {
    # Read data
    X.reg <- read.table(data_path)
    cat("Data loaded successfully\n")

    # Split response and predictors by name, so the two can never disagree
    # about which column is the target.
    if (!"y" %in% names(X.reg)) {
        stop("Expected a response column named 'y' in ", data_path, call. = FALSE)
    }
    y <- X.reg[["y"]]
    X <- X.reg[, names(X.reg) != "y", drop = FALSE]

    # Get correlation information
    cor_results <- get_top_correlations(X, y)
    cat("\nTop 10 correlated variables with target:\n")
    print(cor_results$top_n)

    # Get baseline results
    lasso_results <- evaluate_lasso_nested(X, y)
    backward_results <- evaluate_backward_nested(X, y)

    # Get optimized features
    selected_features <- get_optimized_features(lasso_results, backward_results, cor_results)

    # Evaluate GAM variants
    gam_results <- evaluate_gam_variants(X, y, selected_features)

    # Print comprehensive results
    cat("\n=== COMPREHENSIVE ANALYSIS RESULTS ===\n")

    # Number of distinct features reported across all folds of a nested-CV result
    count_features <- function(cv_results) {
        per_fold <- lapply(cv_results$feature_info, function(x) x$features)
        length(unique(unlist(per_fold)))
    }

    # Create performance summary
    performance_summary <- data.frame(
        Method = c("Lasso", "Backward", "GAM (cubic)", "GAM (default+select)"),
        MSE = c(
            lasso_results$mean_mse,
            backward_results$mean_mse,
            gam_results$cubic$mean_mse,
            gam_results$default_select$mean_mse
        ),
        SD = c(
            lasso_results$sd_mse,
            backward_results$sd_mse,
            gam_results$cubic$sd_mse,
            gam_results$default_select$sd_mse
        ),
        Features = c(
            count_features(lasso_results),
            count_features(backward_results),
            length(selected_features),
            length(selected_features)
        )
    )

    # Sort by MSE
    performance_summary <- performance_summary[order(performance_summary$MSE), ]

    cat("\nPerformance Summary (sorted by MSE):\n")
    print(performance_summary)

    list(
        performance_summary = performance_summary,
        selected_features = selected_features,
        all_results = list(
            lasso = lasso_results,
            backward = backward_results,
            gam = gam_results
        )
    )
}

# Run the analysis and keep the list returned by main()
# (performance_summary, selected_features, all_results) for later inspection
results <- main()

Le chargement a nécessité le package : glmnet

Le chargement a nécessité le package : Matrix

Loaded glmnet 4.1-8

Le chargement a nécessité le package : caret

Le chargement a nécessité le package : ggplot2

Le chargement a nécessité le package : lattice

Le chargement a nécessité le package : dplyr


Attachement du package : 'dplyr'


Les objets suivants sont masqués depuis 'package:stats':

    filter, lag


Les objets suivants sont masqués depuis 'package:base':

    intersect, setdiff, setequal, union


Le chargement a nécessité le package : MASS


Attachement du package : 'MASS'


L'objet suivant est masqué depuis 'package:dplyr':

    select


Le chargement a nécessité le package : mgcv

Le chargement a nécessité le package : nlme


Attachement du package : 'nlme'


L'objet suivant est masqué depuis 'package:dplyr':

    collapse


This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

Le chargement a nécessité le package : earth

"le package 'earth' a été compilé avec l

Data loaded successfully

Top 10 correlated variables with target:
    variable pearson_cor spearman_cor abs_pearson abs_spearman
X92      X92   0.3775479    0.3652928   0.3775479    0.3652928
X43      X43   0.3232837    0.3230231   0.3232837    0.3230231
X57      X57  -0.2718024   -0.2617451   0.2718024    0.2617451
X8        X8   0.2676847    0.2717730   0.2676847    0.2717730
X76      X76  -0.2495047   -0.2459881   0.2495047    0.2459881
X44      X44  -0.2283222   -0.2406929   0.2283222    0.2406929
X81      X81  -0.2148017   -0.2068499   0.2148017    0.2068499
X29      X29   0.2060082    0.1970644   0.2060082    0.1970644
X46      X46  -0.1957796   -0.1916278   0.1957796    0.1916278
X21      X21   0.1897579    0.1726961   0.1897579    0.1726961

Starting Lasso evaluation
Lasso Outer fold 1/5: MSE = 109.04
Lasso Outer fold 2/5: MSE = 145.76
Lasso Outer fold 3/5: MSE = 148.43
Lasso Outer fold 4/5: MSE = 128.18
Lasso Outer fold 5/5: MSE = 141.68

Starting Backward Selection evaluatio

: 