In [21]:
library(dplyr)
library(ggplot2)
library(caret)
library(keras)
library(tensorflow)
library(randomForest)

"le package 'randomForest' a été compilé avec la version R 4.3.3"
randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attachement du package : 'randomForest'


L'objet suivant est masqué depuis 'package:ggplot2':

    margin


L'objet suivant est masqué depuis 'package:dplyr':

    combine




## Step 1: Data Loading

In [2]:
# Function to process training data
process_train_data <- function(file_path) {
  train_data <- read.csv(file_path)
  # Drop the specified columns
  train_data <- train_data %>% 
    select(-target_min, -target_max, -target_variance, -target_count, -Place_ID.X.Date)
  return(train_data)
}

# Function to process test data
process_test_data <- function(file_path) {
  test_data <- read.csv(file_path)
  # Drop the specified column
  test_data <- test_data %>%
    select(-"Place_ID.X.Date")
  return(test_data)
}

## Step 2: Data Exploration
Omitted for now as it has been done in the previous file

## Step 3: Missing Value Imputation

In [3]:
# Function to remove columns with more than 30% missing values
clean_data <- function(data, threshold = 30) {
  # Calculate the percentage of missing values for each column
  missing_percent <- colSums(is.na(data)) / nrow(data) * 100
    
  # Identify columns to retain (those with less than the threshold percentage of missing data)
  cols_to_keep <- names(data)[missing_percent < threshold]
  
  # Identify columns to drop
  cols_to_drop <- names(data)[missing_percent >= threshold]
  
  # Select these columns from the data
  data_cleaned <- data[, cols_to_keep]
  
  # Return a list containing the cleaned data and the names of the dropped columns
  return(list(cleaned_data = data_cleaned, dropped_columns = cols_to_drop))
}

#df_no30 <- clean_data(train_data)
#dim(df_no30$cleaned_data)
#head(df_no30$dropped_columns)


In [4]:
# Function to drop columns based on a list of column names, will be used to remove all columsn that will have been dropped from train data frame
drop_columns <- function(data, columns_to_drop) {
  # Drop the specified columns
  data_cleaned <- data %>%
    select(-all_of(columns_to_drop))
  return(data_cleaned)
}

In [5]:
#zero_to_na <- function(data) {
  # Apply transformation only to numeric columns
#  data_no0 <- data
#  data_no0[data == 0] <- NA
#  return(data_no0)
#}

zero_to_na <- function(data) {
  # Apply transformation only to numeric columns
  data <- data %>%
    mutate(across(where(is.numeric), ~ replace(., . == 0, NA)))
  return(data)
}

### Feature Imputation using the mean

In [6]:
# Function to impute missing values with the mean of each column
impute_with_mean <- function(data) {
  data <- as.data.frame(data)
  for (col in names(data)) {
    if (is.numeric(data[[col]])) {
      mean_value <- mean(data[[col]], na.rm = TRUE)
      data[[col]][is.na(data[[col]])] <- mean_value
    }
  }
  return(data)
}

### Normalisation

In [7]:
# function for min-max scaling and applying it to all numeric columns
min_max_scale_data <- function(data) {
  # Function to perform min-max scaling
  min_max_scale <- function(x) {
    return ((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
  }
  
  # Apply min-max scaling to all numeric columns
  data_min_max_scaled <- data %>%
    mutate(across(where(is.numeric), min_max_scale))
  
  return(data_min_max_scaled)
}

In [8]:
# Randomise Stations for test and train
get_train_test <- function(data,rep) {
    unique_place_ids <- unique(data$Place_ID)
    shuffled_place_ids <- sample(unique_place_ids)
    unique_place_ids <- unique(data$Place_ID)
    shuffled_place_ids <- sample(unique_place_ids)
    num_train <- round(length(shuffled_place_ids) * rep)
    train_ids <- shuffled_place_ids[1:num_train]
    validation_ids <- shuffled_place_ids[(num_train + 1):length(shuffled_place_ids)]
    train <- data[data$Place_ID %in% train_ids, ] %>% 
      select(-"Place_ID")
    validation <- data[data$Place_ID %in% validation_ids, ] %>% 
      select(-"Place_ID")
    return(list(train = train, validation = validation))
}

### Preparation of the data

## Model Selection and Model training

### Neural Network

In [18]:
# Define a function to create and train the neural network
train_neural_network <- function(train_data, test_data, target_column, epochs = 100, batch_size = 32) {
  # Split the training data into training and validation sets
  set.seed(123)  # for reproducibility
  train_index <- createDataPartition(y = train_data[[target_column]], p = 0.8, list = FALSE)
  train_set <- train_data[train_index, ]
  val_set <- train_data[-train_index, ]
  
  # Extract features and target
  train_x <- as.matrix(train_set %>% select(-all_of(target_column)))
  train_y <- as.matrix(train_set[[target_column]])
  val_x <- as.matrix(val_set %>% select(-all_of(target_column)))
  val_y <- as.matrix(val_set[[target_column]])
  
  # Define the neural network model
  model <- keras_model_sequential() %>%
    layer_dense(units = 64, activation = 'relu', input_shape = ncol(train_x)) %>%
    layer_dense(units = 32, activation = 'relu') %>%
    layer_dense(units = 1)
  
  # Compile the model
  model %>% compile(
    loss = 'mean_squared_error',
    optimizer = optimizer_adam(),
    metrics = c('mean_absolute_error')
  )
  
  # Train the model
  history <- model %>% fit(
    train_x, train_y,
    epochs = epochs,
    batch_size = batch_size,
    validation_data = list(val_x, val_y)
  )
  
  # Evaluate the model on the test data
  test_x <- as.matrix(test_data %>% select(-all_of(target_column)))
  test_y <- as.matrix(test_data[[target_column]])
  evaluation <- model %>% evaluate(test_x, test_y)
  
  # Make predictions on the test data
  predictions <- model %>% predict(test_x)
  
  # Return the model, evaluation metrics, and predictions
  return(list(model = model, evaluation = evaluation, predictions = predictions))
}


In [14]:
train_df <- process_train_data("Train.csv")
test_df <- process_test_data("Test.csv")

train_df <- zero_to_na(train_df)
test_df <- zero_to_na(test_df)

train_df <- clean_data(train_df)
test_df <- drop_columns(test_df, train_df$dropped_columns )

train_df <- impute_with_mean(train_df$cleaned_data)
test_df <- impute_with_mean(test_df)

train_df <- impute_with_mean(train_df)
test_df <- impute_with_mean(test_df)

#skip normalisation
pre_train <- get_train_test(test_df, 0,8)
train_df <- pre_train$train
validation_df <- pre_train$validation

In [26]:
head(test_df)

Unnamed: 0_level_0,Date,Place_ID,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,⋯,L3_AER_AI_solar_zenith_angle,L3_SO2_SO2_column_number_density,L3_SO2_SO2_column_number_density_amf,L3_SO2_SO2_slant_column_number_density,L3_SO2_absorbing_aerosol_index,L3_SO2_cloud_fraction,L3_SO2_sensor_azimuth_angle,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2020-01-02,0OS9LVX,11.6,30.2,0.00409,14.65682,3.956377,0.7126049,5.338188e-05,0.0001081872,⋯,22.94202,0.0002207098,0.7844364,0.0001835919,-0.1404579,0.032070799,68.09937,1.445658,-95.98498,22.94202
2,2020-01-03,0OS9LVX,18.3,42.9,0.00595,15.02654,4.23043,0.6618921,5.044761e-05,0.0001090962,⋯,18.53952,3.386942e-05,0.6789883,1.353507e-05,-0.8427128,0.040803427,75.93681,34.641758,-95.01491,18.53912
3,2020-01-04,0OS9LVX,17.6,41.3,0.0059,15.51104,5.245728,1.6405591,5.035383e-05,0.0001344593,⋯,14.14082,0.0001839346,0.6677681,0.0001219163,-0.7167696,0.007112971,75.55244,55.872276,-94.01542,14.14082
4,2020-01-05,0OS9LVX,15.01195,53.1,0.00709,14.44186,5.454001,-0.1905322,5.499153e-05,0.0001546277,⋯,32.73075,0.0002008748,0.6967723,0.0001328754,-0.7301036,0.062076021,-102.28513,59.174188,-97.2476,32.73055
5,2020-01-06,0OS9LVX,9.7,71.6,0.00808,11.8963,3.511787,-0.2794409,5.508028e-05,0.0001308539,⋯,28.32053,9.338975e-05,0.6773054,6.526173e-05,-0.1083527,0.042776815,-102.13396,40.925873,-96.05727,28.32053
6,2020-01-07,0OS9LVX,13.4,69.3,0.00828156,12.74487,3.945603,-0.2409253,5.809714e-05,0.0001141703,⋯,23.90786,8.664031e-05,0.7463363,4.455912e-05,0.25067,0.03998701,-102.89442,10.836973,-94.79964,23.90786


In [19]:
nn_model <- train_neural_network(train_df, validation_df, "target")

ERROR: Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: <Sequential name=sequential, built=False> (of type <class 'keras.src.models.sequential.Sequential'>)

### Random Forest

In [22]:
rf_model <- randomForest(target ~ ., data = train_df, ntree = 200)

In [24]:
predictions <- predict(rf_model, train_df)
rmse <- sqrt(mean((predictions - train_df$target)^2))
print(paste("Root Mean Squared Error (RMSE):", rmse))

[1] "Root Mean Squared Error (RMSE): 12.4870265015593"


Apply to test data

In [27]:
# Define a function to apply the model to new data and save predictions
apply_model_to_data <- function(model, new_data, sample_submission_path, output_path) {
  # Read the sample submission file to get the structure
  sample_submission <- read.csv(sample_submission_path)
  
  # Extract features from new data (assume Place_ID column exists and exclude it)
  new_data_x <- new_data %>% select(-Place_ID)
  
  # Generate predictions using the trained model
  predictions <- predict(model, new_data_x)
  
  # Create a new data frame with the predictions
  submission <- sample_submission
  submission$Prediction <- predictions
  
  # Save the predictions to a CSV file
  write.csv(submission, file = output_path, row.names = FALSE)
}

# Example usage:
# Assuming `rf_model` is your trained Random Forest model and `test_data_scaled` is your new data


# Specify the path to the sample submission file and the output path for the predictions
sample_submission_path <- "SampleSubmission.csv"
output_path <- "Predictions.csv"

# Apply the model to the new data and save predictions
apply_model_to_data(rf_model, test_df, sample_submission_path, output_path)

# Verify that the predictions file has been created correctly
print("Predictions saved to Predictions.csv")

[1] "Predictions saved to Predictions.csv"
