# Data Cleaning for Model Fitting
The data we have are problematic - we have 8 categorical variables that need to be split into model matrices, and some of those variables are incompatible with the test data, as there are extra factor levels.  This script will clean the training/test data into two separate data frames each: One for the base run, and one for the run without X0, X2,and X5, so that the remaining test observations can be properly fit.

In [8]:
options(stringsAsFactors = FALSE)
suppressWarnings(suppressMessages({
    library(dplyr)
    library(grplasso)
}))

In [9]:
train <- read.csv('../data/train.csv')
test <- read.csv('../data/test.csv')

In [10]:
# Extract covariates
covars <- train %>% select(starts_with('X'))
cat_names <- colnames(covars)[sapply(covars, function(x) class(x) == 'character')]
num_names <- colnames(covars)[sapply(covars, function(x) class(x) != 'character')]

# Categorical variables
cat_vars_train <- train[cat_names]
cat_vars_test <- test[cat_names]

# Quantitative variables
num_vars_train <- train[num_names]
num_vars_test <- test[num_names]

### Prune Categorical Variables
Create model matrices and indicesthat will be used in the grouped lasso

In [17]:
# Produce categorical model matrices, remove bad cols in test data
cat_mm_train <- as.data.frame(model.matrix(~ . - 1, data = cat_vars_train))
cat_mm_test <- as.data.frame(model.matrix(~ . -1, data = cat_vars_test))
cat_mm_test <- cat_mm_test %>% 
    select(-starts_with('X0'),
           -starts_with('X2'),
           -starts_with('X5'))

In [63]:
# Create indices for grouped lasso
ind_dict_1 <- list()
ind_dict_2 <- list()

for (cn in colnames(cat_vars_train)){
    ind_dict_1[[cn]] <- grep(cn, colnames(cat_mm_train))
    ind_dict_2[[cn]] <- grep(cn, colnames(cat_mm_test))
}

grp_ind_1 <- rep(NA, ncol(cat_mm_train))
grp_ind_2 <- rep(NA, ncol(cat_mm_test))
for (i in 1:ncol(cat_vars_train)) {
    cn  <- colnames(cat_vars_train)[i]
    grp_ind_1[ind_dict_1[[cn]]] <- i    
    grp_ind_2[ind_dict_2[[cn]]] <- i
}

### Prune Quantitative Variables
Remove quantitative variables that are "too" unary

In [73]:
# "Too unary" = 1% away from being all zeros or all ones
cut_tol <- 0.01

# Look at means (purity) of each variable
bin_purity <- unname(sapply(num_vars_train, mean))
summary(bin_purity)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.004217 0.022330 0.157700 0.195800 0.999800 

In [74]:
# Trim based on this tolerance cut.  Keeping 221 variables.
num_keep <- (bin_purity < 1 - cut_tol) & (bin_purity > cut_tol)
table(num_keep)

num_keep
FALSE  TRUE 
  147   221 

In [75]:
# Remove variables we don't want
num_vars_train <- num_vars_train[num_keep]
num_vars_test <- num_vars_test[num_keep]

In [76]:
# Make final grouped lasso indices - NAs for ungrouped obs
grp_ind_1 <- c(grp_ind_1, rep(NA, ncol(num_vars_train)))
grp_ind_2 <- c(grp_ind_2, rep(NA, ncol(num_vars_test)))

In [83]:
y  <- log(train$y)
df <- cbind(y, cat_mm_train, num_vars_train)

In [87]:
m1 <- grplasso(y ~ ., data = df, lambda = 10, index = grp_ind_1, model = LinReg())

ERROR: Error in grplasso.default(x = l$x, y = l$y, index = l$index, weights = l$w, : formal argument "index" matched by multiple actual arguments


In [None]:
# Problem!  Fix later