# Digit recognizer

In [1]:
# Initial session settings.
options(
    scipen = 999,           # strongly prefer fixed over scientific notation when printing
    repr.plot.width = 4,    # plot width read by the notebook's repr machinery
    repr.plot.height = 4,   # plot height read by the notebook's repr machinery
    warn = -1               # negative value: warnings are suppressed
)

In [2]:
# Install (if missing) and load the required packages.
# Names of the packages this notebook attaches below.
list.of.packages <- c('fields', 'ggmap', 'gridExtra', 'lbfgsb3')

# Keep only the names that are not already in the installed-package table,
# then install just those from CRAN (nothing is installed when the list is empty).
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos = "https://cran.r-project.org")

# Attach every package named in list.of.packages.
library(fields)
library(ggmap)
library(gridExtra)
library(lbfgsb3)

Loading required package: spam
Loading required package: grid
Spam version 1.4-0 (2016-08-29) is loaded.
Type 'help( Spam)' or 'demo( spam)' for a short introduction 
and overview of this package.
Help for individual functions is also obtained by adding the
suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

Attaching package: 'spam'

The following objects are masked from 'package:base':

    backsolve, forwardsolve

Loading required package: maps
Loading required package: ggplot2
Loading required package: numDeriv


In [3]:
# Load the training set from the working directory into a data frame.
df.train <- read.csv(file = "train.csv")

In [100]:
# Feature matrix: every column from the second one onwards, divided by 255
# to scale the values.
X <- as.matrix(df.train[, 2:ncol(df.train)]) / 255

# Targets come from the `label` column; label 0 is recoded as 10.
y <- replace(df.train$label, df.train$label == 0, 10)

# Show the dimensions of the feature matrix.
dim(X)

In [101]:
# Logistic sigmoid activation, applied element-wise.
#
# Args:
#   z: numeric scalar, vector or matrix.
# Returns:
#   An object with the same shape as `z` holding 1 / (1 + exp(-z)).
sigmoid <- function(z) {
    # exp(-z) is the direct form of exp(1) ^ (-z). The expression is the
    # function's last value (not an assignment), so the result is returned
    # visibly and prints when the function is called interactively.
    1 / (1 + exp(-z))
}

# Gradient of the sigmoid function, evaluated element-wise at z.
#
# Args:
#   z: numeric scalar, vector or matrix.
# Returns:
#   sigmoid(z) * (1 - sigmoid(z)), with the same shape as `z`.
sigmoidGradient <- function(z) {
    activation <- sigmoid(z)
    activation * (1 - activation)
}

In [102]:
# Regularized cost of the two-layer network.
#
# Args:
#   nn_params: numeric vector holding Theta1 followed by Theta2, each
#              unrolled column by column.
# Returns:
#   The scalar cost: mean cross-entropy over the examples plus an L2 penalty
#   on every weight outside the first (bias) column.
#
# Reads X, y, lambda, input_layer_size, hidden_layer_size and num_labels from
# the enclosing environment; they are not parameters.
nnCostFunction <- function(nn_params) {

    # Split the flat parameter vector back into the two weight matrices.
    theta1_len <- hidden_layer_size * (input_layer_size + 1)
    Theta1 <- matrix(nn_params[1:theta1_len],
                     hidden_layer_size, input_layer_size + 1)
    Theta2 <- matrix(nn_params[(1 + theta1_len):length(nn_params)],
                     num_labels, hidden_layer_size + 1)

    n_examples <- nrow(X)

    # Forward pass; a column of ones is prepended before each layer.
    hidden_act <- cbind(1, sigmoid(cbind(1, X) %*% t(Theta1)))
    output_act <- sigmoid(hidden_act %*% t(Theta2))

    # One row per example, with a 1 in the column given by that example's label.
    targets <- t(diag(num_labels)[, y])

    cross_entropy <- -targets * log(output_act) -
        (1 - targets) * log(1 - output_act)
    data_cost <- (1 / n_examples) * sum(cross_entropy)

    # The first column of each Theta is left out of the penalty.
    squared_weights <- sum(Theta1[, 2:ncol(Theta1)] ^ 2) +
        sum(Theta2[, 2:ncol(Theta2)] ^ 2)
    penalty <- (lambda / (2 * n_examples)) * squared_weights

    data_cost + penalty
}

In [103]:
# Gradient of the regularized network cost, obtained by backpropagation.
#
# Args:
#   nn_params: numeric vector holding Theta1 followed by Theta2, each
#              unrolled column by column.
# Returns:
#   A numeric vector of the same length and layout as `nn_params` with the
#   partial derivatives of the cost with respect to every weight.
#
# Reads X, y, lambda, input_layer_size, hidden_layer_size and num_labels from
# the enclosing environment; they are not parameters.
nnGradFunction <- function(nn_params) {

    # Split the flat parameter vector back into the two weight matrices.
    theta1_len <- hidden_layer_size * (input_layer_size + 1)
    Theta1 <- matrix(nn_params[1:theta1_len],
                     hidden_layer_size, input_layer_size + 1)
    Theta2 <- matrix(nn_params[(1 + theta1_len):length(nn_params)],
                     num_labels, hidden_layer_size + 1)

    m <- nrow(X)

    # Forward pass; a column of ones is prepended before each layer.
    a1 <- cbind(1, X)
    z2 <- a1 %*% t(Theta1)
    a2 <- cbind(1, sigmoid(z2))
    a3 <- sigmoid(a2 %*% t(Theta2))

    # One column per example, with a 1 in the row given by that example's label.
    y_mat <- diag(num_labels)[, y, drop = FALSE]

    # Backward pass. The hidden-layer error uses only the non-bias columns of
    # Theta2, so no bias column has to be added to z2 and dropped afterwards.
    delta3 <- a3 - t(y_mat)
    delta2 <- (delta3 %*% Theta2[, -1, drop = FALSE]) * sigmoidGradient(z2)

    # Zero the first (bias) column so that it receives no regularization term.
    Theta1[, 1] <- 0
    Theta2[, 1] <- 0

    Theta1_grad <- (1 / m) * (t(delta2) %*% a1 + lambda * Theta1)
    Theta2_grad <- (1 / m) * (t(delta3) %*% a2 + lambda * Theta2)

    # Unroll in the same order as nn_params. This is the last expression, so
    # the gradient is returned visibly.
    c(c(Theta1_grad), c(Theta2_grad))
}

In [104]:
# Neural network training ----------------------------------------------------

# Random initialization of one layer's weights.
#
# Args:
#   L_in:  number of incoming connections (the bias column is added here).
#   L_out: number of outgoing units.
# Returns:
#   An L_out x (1 + L_in) matrix of values drawn uniformly and rescaled to lie
#   between -epsilon_init and epsilon_init.
randInitializeWeights <- function(L_in, L_out) {

    epsilon_init <- 0.12
    n_weights <- L_out * (1 + L_in)

    uniform_draws <- matrix(runif(n_weights), L_out, 1 + L_in)

    # Map the [0, 1] draws onto [-epsilon_init, epsilon_init].
    uniform_draws * 2 * epsilon_init - epsilon_init
}

# Network dimensions and hyper-parameters.
n <- ncol(X)
m <- nrow(X)

input_layer_size <- n
hidden_layer_size <- 25
num_labels <- 10
lambda <- 1

# Start from small weights centred on zero (randInitializeWeights).
# Plain runif() draws lie in [0, 1), so every weight would be positive; summed
# over all the inputs this can push the hidden units deep into the flat part
# of the sigmoid, where sigmoidGradient() is close to zero and training stalls.
initial_Theta1 <- randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 <- randInitializeWeights(hidden_layer_size, num_labels)

# Unroll both matrices into the single vector expected by the cost and
# gradient functions (Theta1 first, then Theta2).
initial_nn_params <- c(c(initial_Theta1), c(initial_Theta2))

In [None]:
# Minimize the cost with BFGS, supplying the analytic gradient. The run is
# capped at 100 iterations and progress is reported at every iteration.
results <- optim(
    par = initial_nn_params,
    fn = nnCostFunction,
    gr = nnGradFunction,
    method = "BFGS",
    control = list(maxit = 100, trace = 1, REPORT = 1)
)

# Fitted parameter vector and the cost reached at that point.
theta <- results[["par"]]
J <- results[["value"]]

cat("Costo final:", J)

In [None]:
# Rebuild the two weight matrices from the optimized parameter vector.
# The upper bound of the second slice is length(theta): `nn_params` is only a
# parameter name inside the cost/gradient functions and is not defined here.
Theta1 <- matrix(theta[1:(hidden_layer_size * (input_layer_size + 1))],
                 hidden_layer_size, input_layer_size + 1)
Theta2 <- matrix(theta[(1 + hidden_layer_size * (input_layer_size + 1)):length(theta)],
                 num_labels, hidden_layer_size + 1)

In [None]:
# Predicted class for every example, using the trained network.
#
# Args:
#   Theta1: weight matrix of the hidden layer (first column = bias weights).
#   Theta2: weight matrix of the output layer (first column = bias weights).
#   X:      matrix with one example per row, without the bias column.
# Returns:
#   For each row of X, the index of the output unit with the largest
#   activation.
#
# Note: defining `predict` at top level masks the stats::predict generic for
# the rest of the session.
predict <- function(Theta1, Theta2, X) {

    # Forward pass; a column of ones is prepended before each layer.
    h1 <- sigmoid(cbind(1, X) %*% t(Theta1))
    h2 <- sigmoid(cbind(1, h1) %*% t(Theta2))

    # Row-wise position of the maximum activation.
    apply(h2, 1, which.max)
}

In [None]:
# Load the test set and build its feature matrix.
df.test <- read.csv("test.csv")

# Use exactly the columns the network was trained on. When the file already
# has input_layer_size columns there is no leading column to discard, and
# dropping the first one would remove a feature and make X non-conformable
# with Theta1. Otherwise keep the previous behaviour of skipping column 1.
# NOTE(review): presumably test.csv holds only pixel columns (no label) - confirm.
if (ncol(df.test) == input_layer_size) {
    X <- as.matrix(df.test)
} else {
    X <- as.matrix(df.test[, 2:ncol(df.test)])
}

# Same scaling as the training features.
X <- X / 255

In [None]:
# Classify every test example with the trained network.
predictions <- predict(Theta1, Theta2, X)

# Class 10 stands for digit 0 (the training labels were recoded the same way).
predictions[predictions == 10] <- 0

# One sequential id per test row. seq_len() yields an empty vector for an
# empty data frame, where 1:m would give c(1, 0).
m <- nrow(df.test)
ids <- seq_len(m)

In [None]:
# Submission table: one row per test example with its id and predicted digit.
# The original call ended in a dangling comma (an empty argument), which makes
# data.frame() fail; the prediction column completes it.
# NOTE(review): the column name `Label` is assumed - confirm against the
# required submission format.
submit <- data.frame(ImageId = ids, Label = predictions)