In [3]:
# 1. Load required libraries
library(caret)
library(randomForest)
library(MASS)
library(mclust)
library(glmnet)
library(mgcv)

###################
# REGRESSION PART #
###################

# Best performing features from analysis
SELECTED_FEATURES <- c("X1", "X10", "X13", "X15", "X19", "X2", "X20", "X21", 
                      "X25", "X27", "X29", "X3", "X30", "X31", "X33", "X4", 
                      "X40", "X42", "X43", "X44", "X45", "X46", "X47", "X48", 
                      "X51", "X54", "X55", "X56", "X57", "X58", "X59", "X6", 
                      "X61", "X62", "X67", "X69", "X71", "X72", "X76", "X77", 
                      "X8", "X80", "X81", "X84", "X87", "X89", "X92", "X93", 
                      "X96", "X99")

# Read and prepare regression data
X.reg <- read.table("a24_reg_app.txt")
X <- X.reg[, -ncol(X.reg)]
y <- X.reg$y

# Train GAM model with selected features
gam_formula <- as.formula(paste("y ~", paste(sprintf("s(%s, bs='cr')", SELECTED_FEATURES), collapse = " + ")))
reg <- gam(gam_formula, data = data.frame(X, y = y), method = "REML",select = TRUE)

######################
# CLASSIFICATION PART #
######################

# Utility functions for classification
get_kernel_features <- function(x, centers, sigma = 1) {
    kernels <- apply(centers, 1, function(c) {
        exp(-rowSums((sweep(x, 2, c))^2) / (2 * sigma^2))
    })
    return(kernels)
}

get_gmm_predictions <- function(model, newdata) {
    kernel_features <- get_kernel_features(newdata, model$centers, model$sigma)
    probs <- predict(model$gmm, kernel_features, what = "z")
    return(probs)
}

train_kernel_gmm <- function(data, G = 3, kernel_sigma = 1) {
    scaled_data <- scale(data)
    init_clusters <- kmeans(scaled_data, centers = G)
    centers <- init_clusters$centers
    kernel_features <- get_kernel_features(scaled_data, centers, kernel_sigma)
    gmm_model <- Mclust(kernel_features, G = G)
    return(list(
        gmm = gmm_model,
        centers = centers,
        sigma = kernel_sigma
    ))
}

get_qda_predictions <- function(model, newdata) {
    pred_probs <- predict(model, newdata)$posterior
    return(pred_probs)
}

get_rf_predictions <- function(model, newdata) {
    pred_probs <- predict(model, newdata, type = "prob")
    return(pred_probs)
}

combine_predictions <- function(qda_pred, skewed_pred, ord_pred) {
    df1 <- as.data.frame(qda_pred)
    names(df1) <- paste0("qda_", 1:ncol(df1))
    
    df2 <- as.data.frame(skewed_pred)
    names(df2) <- paste0("skewed_", 1:ncol(df2))
    
    df3 <- as.data.frame(ord_pred)
    names(df3) <- paste0("ord_", 1:ncol(df3))
    
    combined_df <- cbind(df1, df2, df3)
    return(as.data.frame(combined_df))
}

# Read and prepare classification data
X.clas = read.table("a24_clas_app.txt", header = TRUE, sep=" ")
X.clas$y <- as.factor(X.clas$y)

# Train classification model
clas <- {
    # Define variable groups
    gaussian_vars <- paste0("X", 21:45)
    skewed_vars <- paste0("X", 1:20)
    ordinal_vars <- paste0("X", 46:50)
    
    # Preprocess parameters
    preprocess_params <- preProcess(X.clas[, c(gaussian_vars, skewed_vars, ordinal_vars)], 
                                  method = c("center", "scale"))
    
    data_scaled <- X.clas
    data_scaled[, c(gaussian_vars, skewed_vars, ordinal_vars)] <- 
        predict(preprocess_params, X.clas[, c(gaussian_vars, skewed_vars, ordinal_vars)])
    
    # Train models
    skewed_matrix <- as.matrix(data_scaled[, skewed_vars])
    skewed_model <- train_kernel_gmm(skewed_matrix, G = 3)
    qda_model <- qda(y ~ ., data = data_scaled[, c("y", gaussian_vars)])
    rf_model <- randomForest(y ~ ., 
                           data = data_scaled[, c("y", ordinal_vars)],
                           ntree = 500, 
                           mtry = ceiling(sqrt(length(ordinal_vars))),
                           nodesize = 5)
    
    # Get training predictions
    skewed_pred <- get_gmm_predictions(skewed_model, skewed_matrix)
    qda_pred <- get_qda_predictions(qda_model, data_scaled[, gaussian_vars])
    rf_pred <- get_rf_predictions(rf_model, data_scaled[, ordinal_vars])
    
    # Combine predictions
    meta_features <- combine_predictions(qda_pred, skewed_pred, rf_pred)
    
    # Train meta-learner
    meta_rf <- randomForest(
        x = meta_features,
        y = X.clas$y,
        ntree = 500,
        mtry = ceiling(sqrt(ncol(meta_features))),
        nodesize = 5
    )
    
    list(
        skewed_model = skewed_model,
        qda_model = qda_model,
        rf_model = rf_model,
        meta_rf = meta_rf,
        preprocess_params = preprocess_params,
        var_groups = list(
            gaussian_vars = gaussian_vars,
            skewed_vars = skewed_vars,
            ordinal_vars = ordinal_vars
        )
    )
}

##################
# FINAL FUNCTIONS #
##################

# Regression function
regresseur <- function(test_set) {
    library(mgcv)
    predict(reg, test_set)
}

# Classification function
classifieur <- function(test_set) {
    library(caret)
    library(randomForest)
    library(MASS)
    library(mclust)
    
    # Define utility functions inside classifieur
    get_kernel_features <- function(x, centers, sigma = 1) {
        kernels <- apply(centers, 1, function(c) {
            exp(-rowSums((sweep(x, 2, c))^2) / (2 * sigma^2))
        })
        return(kernels)
    }
    
    get_gmm_predictions <- function(model, newdata) {
        kernel_features <- get_kernel_features(newdata, model$centers, model$sigma)
        probs <- predict(model$gmm, kernel_features, what = "z")
        return(probs)
    }
    
    get_qda_predictions <- function(model, newdata) {
        pred_probs <- predict(model, newdata)$posterior
        return(pred_probs)
    }
    
    get_rf_predictions <- function(model, newdata) {
        pred_probs <- predict(model, newdata, type = "prob")
        return(pred_probs)
    }
    
    combine_predictions <- function(qda_pred, skewed_pred, ord_pred) {
        df1 <- as.data.frame(qda_pred)
        names(df1) <- paste0("qda_", 1:ncol(df1))
        
        df2 <- as.data.frame(skewed_pred)
        names(df2) <- paste0("skewed_", 1:ncol(df2))
        
        df3 <- as.data.frame(ord_pred)
        names(df3) <- paste0("ord_", 1:ncol(df3))
        
        combined_df <- cbind(df1, df2, df3)
        return(as.data.frame(combined_df))
    }
    
    # Get variable groups
    gaussian_vars <- clas$var_groups$gaussian_vars
    skewed_vars <- clas$var_groups$skewed_vars
    ordinal_vars <- clas$var_groups$ordinal_vars
    
    # Scale new data
    scaled_data <- test_set
    scaled_data[, c(gaussian_vars, skewed_vars, ordinal_vars)] <- 
        predict(clas$preprocess_params, test_set[, c(gaussian_vars, skewed_vars, ordinal_vars)])
    
    # Get predictions
    skewed_matrix <- as.matrix(scaled_data[, skewed_vars])
    skewed_pred <- get_gmm_predictions(clas$skewed_model, skewed_matrix)
    qda_pred <- get_qda_predictions(clas$qda_model, scaled_data[, gaussian_vars])
    rf_pred <- get_rf_predictions(clas$rf_model, scaled_data[, ordinal_vars])
    
    # Combine predictions
    meta_features <- combine_predictions(qda_pred, skewed_pred, rf_pred)
    
    # Get final predictions
    predict(clas$meta_rf, meta_features)
}

# Save everything needed
save("clas", "reg", "classifieur", "regresseur", file = "env.Rdata")



In [5]:
# Test the saved models
rm(list = ls())
load("env.Rdata")

# Test regression
test_reg <- read.table("a24_reg_app.txt")
set.seed(123)
test_indices_reg <- sample(1:nrow(test_reg), size = round(0.3 * nrow(test_reg)))
test_data_reg <- test_reg[test_indices_reg, -ncol(test_reg)]
actual_values_reg <- test_reg[test_indices_reg, "y"]

# Make regression predictions
reg_predictions <- regresseur(test_data_reg)
cat("Regression MSE:", mean((reg_predictions - actual_values_reg)^2), "\n")

# Test classification
test_clas <- read.table("a24_clas_app.txt", header = TRUE, sep=" ")
set.seed(123)
test_indices_clas <- sample(1:nrow(test_clas), size = round(0.3 * nrow(test_clas)))
test_data_clas <- test_clas[test_indices_clas, -ncol(test_clas)]
actual_labels_clas <- test_clas[test_indices_clas, "y"]

# Make classification predictions
clas_predictions <- classifieur(test_data_clas)
cat("Classification Accuracy:", mean(clas_predictions == actual_labels_clas) * 100, "%\n")
print("Classification Confusion Matrix:")
print(table(clas_predictions, actual_labels_clas))

Regression MSE: 77.08018 
Classification Accuracy: 98.66667 %
[1] "Classification Confusion Matrix:"
                actual_labels_clas
clas_predictions  1  2  3
               1 29  0  0
               2  0 62  0
               3  1  1 57


In [2]:
# Load required packages
library(glmnet)
library(caret)
library(dplyr)
library(mgcv)

# Best performing features from previous analysis
SELECTED_FEATURES <- c("X1", "X10", "X13", "X15", "X19", "X2", "X20", "X21", 
                      "X25", "X27", "X29", "X3", "X30", "X31", "X33", "X4", 
                      "X40", "X42", "X43", "X44", "X45", "X46", "X47", "X48", 
                      "X51", "X54", "X55", "X56", "X57", "X58", "X59", "X6", 
                      "X61", "X62", "X67", "X69", "X71", "X72", "X76", "X77", 
                      "X8", "X80", "X81", "X84", "X87", "X89", "X92", "X93", 
                      "X96", "X99")

# Nested cross-validation evaluation
evaluate_models <- function(X, y, outer_folds = 5, inner_folds = 5) {
    set.seed(42)
    outer_cv <- createFolds(y, k = outer_folds, list = TRUE)
    
    # Results storage
    lasso_scores <- numeric(outer_folds)
    gam_scores <- numeric(outer_folds)
    
    cat("\nStarting nested cross-validation evaluation\n")
    
    for(fold in 1:outer_folds) {
        # Split data
        test_idx <- outer_cv[[fold]]
        X_train <- X[-test_idx, ]
        y_train <- y[-test_idx]
        X_test <- X[test_idx, ]
        y_test <- y[test_idx]
        
        # 1. LASSO
        cv_lasso <- cv.glmnet(as.matrix(X_train), y_train, alpha = 1, nfolds = inner_folds)
        lasso_pred <- predict(cv_lasso, newx = as.matrix(X_test), s = "lambda.min")
        lasso_scores[fold] <- mean((y_test - lasso_pred)^2)
        
        # 2. GAM with pre-selected features
        gam_formula <- as.formula(paste("y ~", 
            paste(sprintf("s(%s, bs='cr')", SELECTED_FEATURES), collapse = " + ")))
        gam_model <- gam(gam_formula, 
                        data = data.frame(X_train, y = y_train), 
                        method = "REML",select=TRUE)
        
        gam_pred <- predict(gam_model, 
                          newdata = data.frame(X_test))
        gam_scores[fold] <- mean((y_test - gam_pred)^2)
        
        cat(sprintf("Fold %d/%d - LASSO MSE: %.2f, GAM MSE: %.2f\n", 
                   fold, outer_folds, lasso_scores[fold], gam_scores[fold]))
        flush.console()
    }
    
    # Summarize results
    results <- data.frame(
        Model = c("LASSO", "GAM (Pre-selected features)"),
        Mean_MSE = c(mean(lasso_scores), mean(gam_scores)),
        SD_MSE = c(sd(lasso_scores), sd(gam_scores))
    )
    
    return(list(
        summary = results,
        lasso_scores = lasso_scores,
        gam_scores = gam_scores
    ))
}

# Main execution
main <- function() {
    # Read data
    X.reg <- read.table("a24_reg_app.txt")
    X <- X.reg[, -ncol(X.reg)]
    y <- X.reg$y
    
    # Run evaluation
    results <- evaluate_models(X, y)
    
    # Print results
    cat("\n=== FINAL RESULTS ===\n")
    print(results$summary)
    
    return(results)
}

# Run analysis
results <- main()


Starting nested cross-validation evaluation
Fold 1/5 - LASSO MSE: 109.04, GAM MSE: 109.91


: 

In [7]:
# Load required libraries
library(caret)
library(randomForest)
library(MASS)

# Function to evaluate classifier with nested CV
evaluate_classifier <- function(data, outer_folds=5, inner_folds=5) {
    set.seed(123)
    
    # Define variable groups
    gaussian_vars <- paste0("X", 21:45)
    ordinal_vars <- paste0("X", 46:50)
    
    # Create outer folds
    outer_cv <- createFolds(data$y, k=outer_folds, list=TRUE)
    outer_scores <- numeric(outer_folds)
    
    cat("\nStarting nested cross-validation evaluation\n")
    
    confusion_matrices <- list()
    
    for(fold in 1:outer_folds) {
        # Split data
        test_idx <- outer_cv[[fold]]
        train_data <- data[-test_idx, ]
        test_data <- data[test_idx, ]
        
        # Train QDA on continuous variables
        qda_model <- qda(y ~ ., data = train_data[, c("y", gaussian_vars)])
        
        # Train RF on ordinal variables
        rf_model <- randomForest(y ~ ., 
                               data = train_data[, c("y", ordinal_vars)],
                               ntree = 500,
                               mtry = ceiling(sqrt(length(ordinal_vars))),
                               nodesize = 5)
        
        # Get predictions for training meta-learner
        train_qda_pred <- predict(qda_model, train_data[, gaussian_vars])$posterior
        train_rf_pred <- predict(rf_model, train_data[, ordinal_vars], type = "prob")
        
        # Combine predictions for meta-learner
        meta_features_train <- cbind(train_qda_pred, train_rf_pred)
        
        # Train meta-learner
        meta_rf <- randomForest(
            x = meta_features_train,
            y = train_data$y,
            ntree = 500,
            mtry = ceiling(sqrt(ncol(meta_features_train))),
            nodesize = 5
        )
        
        # Make predictions on test set
        test_qda_pred <- predict(qda_model, test_data[, gaussian_vars])$posterior
        test_rf_pred <- predict(rf_model, test_data[, ordinal_vars], type = "prob")
        
        # Combine test predictions
        meta_features_test <- cbind(test_qda_pred, test_rf_pred)
        
        # Get final predictions
        predictions <- predict(meta_rf, meta_features_test)
        
        # Calculate accuracy
        accuracy <- mean(predictions == test_data$y)
        outer_scores[fold] <- accuracy
        
        # Store confusion matrix for this fold
        confusion_matrices[[fold]] <- table(Predicted = predictions, 
                                          Actual = test_data$y)
        
        cat(sprintf("Fold %d/%d - Accuracy: %.4f\n", 
                   fold, outer_folds, accuracy))
    }
    
    # Calculate overall metrics
    mean_accuracy <- mean(outer_scores)
    sd_accuracy <- sd(outer_scores)
    
    # Combine confusion matrices
    total_conf_matrix <- Reduce('+', confusion_matrices)
    
    # Return results
    return(list(
        fold_scores = outer_scores,
        mean_accuracy = mean_accuracy,
        sd_accuracy = sd_accuracy,
        confusion_matrix = total_conf_matrix
    ))
}

# Read and prepare data
X.clas <- read.table("a24_clas_app.txt", header = TRUE, sep=" ")
X.clas$y <- as.factor(X.clas$y)

# Run nested CV evaluation
results <- evaluate_classifier(X.clas)

# Print comprehensive results
cat("\n=== NESTED CV RESULTS ===\n")
cat("Individual fold accuracies:\n")
print(results$fold_scores)
cat(sprintf("\nMean Accuracy: %.4f (±%.4f)\n", 
            results$mean_accuracy, results$sd_accuracy))
cat("\nConfusion Matrix:\n")
print(results$confusion_matrix)



Starting nested cross-validation evaluation
Fold 1/5 - Accuracy: 0.7000
Fold 2/5 - Accuracy: 0.6337
Fold 3/5 - Accuracy: 0.7030
Fold 4/5 - Accuracy: 0.7172
Fold 5/5 - Accuracy: 0.6970

=== NESTED CV RESULTS ===
Individual fold accuracies:
[1] 0.7000000 0.6336634 0.7029703 0.7171717 0.6969697

Mean Accuracy: 0.6902 (±0.0325)

Confusion Matrix:
         Actual
Predicted   1   2   3
        1  27  14  17
        2  30 190  46
        3  25  23 128


In [10]:
# Load required libraries
library(caret)
library(randomForest)
library(MASS)
library(mgcv)

###################
# REGRESSION PART #
###################

# Best performing features from analysis
SELECTED_FEATURES <- c("X1", "X10", "X13", "X15", "X19", "X2", "X20", "X21", 
                      "X25", "X27", "X29", "X3", "X30", "X31", "X33", "X4", 
                      "X40", "X42", "X43", "X44", "X45", "X46", "X47", "X48", 
                      "X51", "X54", "X55", "X56", "X57", "X58", "X59", "X6", 
                      "X61", "X62", "X67", "X69", "X71", "X72", "X76", "X77", 
                      "X8", "X80", "X81", "X84", "X87", "X89", "X92", "X93", 
                      "X96", "X99")

# Read and prepare regression data
X.reg <- read.table("a24_reg_app.txt")
X <- X.reg[, -ncol(X.reg)]
y <- X.reg$y

# Train GAM model with selected features
gam_formula <- as.formula(paste("y ~", paste(sprintf("s(%s, bs='cr')", SELECTED_FEATURES), collapse = " + ")))
reg <- gam(gam_formula, data = data.frame(X, y = y), method = "REML",select=TRUE)

######################
# CLASSIFICATION PART #
######################

# Read classification data
X.clas <- read.table("a24_clas_app.txt", header = TRUE, sep=" ")
X.clas$y <- as.factor(X.clas$y)

# Define variable groups
gaussian_vars <- paste0("X", 21:45)
ordinal_vars <- paste0("X", 46:50)

# Train QDA on gaussian variables
qda_model <- qda(y ~ ., data = X.clas[, c("y", gaussian_vars)])

# Train RF on ordinal variables
rf_model <- randomForest(y ~ ., 
                        data = X.clas[, c("y", ordinal_vars)],
                        ntree = 500,
                        mtry = ceiling(sqrt(length(ordinal_vars))),
                        nodesize = 5)

# Get predictions for training meta-learner
train_qda_pred <- predict(qda_model, X.clas[, gaussian_vars])$posterior
train_rf_pred <- predict(rf_model, X.clas[, ordinal_vars], type = "prob")

# Combine predictions for meta-learner
meta_features <- cbind(train_qda_pred, train_rf_pred)

# Train meta-learner
meta_rf <- randomForest(
    x = meta_features,
    y = X.clas$y,
    ntree = 500,
    mtry = ceiling(sqrt(ncol(meta_features))),
    nodesize = 5
)

# Store classification model and parameters
clas <- list(
    qda_model = qda_model,
    rf_model = rf_model,
    meta_rf = meta_rf,
    var_groups = list(
        gaussian_vars = gaussian_vars,
        ordinal_vars = ordinal_vars
    )
)

##################
# FINAL FUNCTIONS #
##################

# Regression function
regresseur <- function(test_set) {
    library(mgcv)
    predict(reg, test_set)
}

# Classification function
classifieur <- function(test_set) {
    library(caret)
    library(randomForest)
    library(MASS)
    
    # Get variable groups
    gaussian_vars <- clas$var_groups$gaussian_vars
    ordinal_vars <- clas$var_groups$ordinal_vars
    
    # Get predictions from both models
    qda_pred <- predict(clas$qda_model, test_set[, gaussian_vars])$posterior
    rf_pred <- predict(clas$rf_model, test_set[, ordinal_vars], type = "prob")
    
    # Combine predictions for meta-learner
    meta_features <- cbind(qda_pred, rf_pred)
    
    # Get final predictions
    predict(clas$meta_rf, meta_features)
}

# Save everything needed
save("clas", "reg", "classifieur", "regresseur", file = "env.Rdata")


In [11]:
# Test the saved models
rm(list = ls())
load("env.Rdata")

# Test regression
test_reg <- read.table("a24_reg_app.txt")
set.seed(123)
test_indices_reg <- sample(1:nrow(test_reg), size = round(0.1 * nrow(test_reg)))
test_data_reg <- test_reg[test_indices_reg, -ncol(test_reg)]
actual_values_reg <- test_reg[test_indices_reg, "y"]

# Make regression predictions
reg_predictions <- regresseur(test_data_reg)
cat("Regression MSE:", mean((reg_predictions - actual_values_reg)^2), "\n")

# Test classification
test_clas <- read.table("a24_clas_app.txt", header = TRUE, sep=" ")
set.seed(123)
test_indices_clas <- sample(1:nrow(test_clas), size = round(0.1 * nrow(test_clas)))
test_data_clas <- test_clas[test_indices_clas, -ncol(test_clas)]
actual_labels_clas <- test_clas[test_indices_clas, "y"]

# Make classification predictions
clas_predictions <- classifieur(test_data_clas)
cat("Classification Accuracy:", mean(clas_predictions == actual_labels_clas) * 100, "%\n")
print("Classification Confusion Matrix:")
print(table(clas_predictions, actual_labels_clas))

Regression MSE: 77.71846 
Classification Accuracy: 88 %
[1] "Classification Confusion Matrix:"
                actual_labels_clas
clas_predictions  1  2  3
               1 10  1  1
               2  0 23  4
               3  0  0 11
