# Instalación de paquetes

In [None]:
# Install whichever of the required packages are not available yet.
# system.file(package = ...) returns "" when a package cannot be found, which
# answers "is it installed?" without the full library scan that
# installed.packages() performs on every call.
missing_packages <- Filter(
    function(pkg) !nzchar(system.file(package = pkg)),
    c("tidyverse", "kernlab", "nnet")
)
if (length(missing_packages) > 0) {
    install.packages(missing_packages)
}

# Conjunto de Datos

##  Obtención de datos

In [None]:
library("kernlab")
library("dplyr")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [None]:
# Load the `spam` data set and replace its `type` label with two 0/1
# indicator columns: Y0 marks rows whose type is not "spam", Y1 marks rows
# whose type is "spam". The original `type` column is then dropped.
data(spam)
data <- spam %>%
    mutate(
        Y0 = as.integer(type != "spam"),
        Y1 = as.integer(type == "spam")
    ) %>%
    select(-type)

## División en conjuntos de entrenamiento y prueba




In [None]:
# Split the data into disjoint training (75%) and test (25%) sets.
# sample_frac() is applied to the ungrouped data frame, so this is a plain
# random sample; class balance is only checked afterwards, not enforced.
sample_proportion <- 0.75
# Temporary row identifier, used only to find the rows left out of training.
data <- mutate(data, id = row_number())
# Fixed seed so the sampled training rows are reproducible.
set.seed(1234)
train <- sample_frac(data, sample_proportion)
# The test set is every row whose id does not appear in the training sample.
test <- anti_join(data, train, by = "id")
# Drop the helper column from all three data frames.
train <- train %>% select(-id)
test <- test %>% select(-id)
data <- data %>% select(-id)

### Proporción de clases

In [None]:
# Share of each Y1 value (1 = spam, 0 = not spam) in the full data set and in
# each partition. The three tables should be close to one another if the
# random split kept the class balance.
print("Data size")
table(data[["Y1"]]) / nrow(data)

print("Train size")
table(train[["Y1"]]) / nrow(train)

print("Test size")
table(test[["Y1"]]) / nrow(test)

[1] "Data size"



        0         1 
0.6059552 0.3940448 

[1] "Train size"



        0         1 
0.6079397 0.3920603 

[1] "Test size"



  0   1 
0.6 0.4 

# Redes Neuronales

In [None]:
library("nnet")

## Usando todas las variables

In [None]:
# Fit a softmax network with 15 hidden units on every predictor in `train`.
# The RNG is seeded first so the fit is reproducible.
set.seed(1234)
nn <- nnet(
    cbind(Y0, Y1) ~ .,
    data = train,
    size = 15,
    maxit = 1000,
    softmax = TRUE
)

# predict() yields one row per test message with one column per response, in
# the cbind(Y0, Y1) order. A message is labelled spam (1) when its Y1 value is
# at least as large as its Y0 value, and non-spam (0) otherwise.
# The comparison is vectorised: nothing is grown element by element, and an
# empty prediction matrix simply gives an empty result.
pred <- predict(nn, newdata = test)
pred <- as.integer(pred[, 1] <= pred[, 2])

# weights:  902
initial  value 2908.828317 
iter  10 value 2042.076423
iter  20 value 1794.346297
iter  30 value 1456.353596
iter  40 value 855.859131
iter  50 value 518.644080
iter  60 value 459.269415
iter  70 value 453.857965
iter  80 value 441.100441
iter  90 value 418.528704
iter 100 value 374.834785
iter 110 value 343.338827
iter 120 value 317.229615
iter 130 value 286.459046
iter 140 value 264.251530
iter 150 value 245.907965
iter 160 value 228.777736
iter 170 value 226.305623
iter 180 value 225.545695
iter 190 value 223.474032
iter 200 value 220.505736
iter 210 value 218.095927
iter 220 value 215.229497
iter 230 value 212.600099
iter 240 value 207.006950
iter 250 value 203.718977
iter 260 value 198.925206
iter 270 value 192.327694
iter 280 value 189.989411
iter 290 value 187.595063
iter 300 value 183.427916
iter 310 value 179.384052
iter 320 value 174.117134
iter 330 value 171.748344
iter 340 value 166.900975
iter 350 value 163.310695
iter 360 value 160.813526
iter 370 value 15

In [None]:
"Matriz de confusion"
# Confusion matrix: rows are the true Y1 labels, columns the predicted ones.
# Both sides are factors with fixed levels 0/1 so the matrix is always 2x2.
# Without that, a model predicting a single class would give a one-column
# table and diag() would no longer select the correctly classified cells.
# NOTE: the result keeps the name `table`; later table(...) calls still reach
# base::table() because R skips non-function objects when resolving a call.
table <- table(
    factor(test$Y1, levels = 0:1),
    pred = factor(pred, levels = 0:1)
)
table
"Precision sobre conjunto de prueba"
# Accuracy on the test set: correctly classified messages over all messages.
accuracy <- sum(diag(table)) / sum(table)
accuracy

   pred
      0   1
  0 657  33
  1  47 413

In [None]:
# Fit a softmax network with 10 hidden units on every predictor in `train`.
# The RNG is seeded first so the fit is reproducible.
set.seed(1234)
nn <- nnet(
    cbind(Y0, Y1) ~ .,
    data = train,
    size = 10,
    maxit = 1000,
    softmax = TRUE
)

# One row per test message, one column per response in cbind(Y0, Y1) order.
# Label a message spam (1) when its Y1 value is at least its Y0 value, else 0.
# The vectorised comparison avoids growing a vector inside a loop and copes
# with an empty prediction matrix.
pred <- predict(nn, newdata = test)
pred <- as.integer(pred[, 1] <= pred[, 2])

# weights:  602
initial  value 2303.511575 
iter  10 value 1312.739347
iter  20 value 823.809494
iter  30 value 684.941742
iter  40 value 647.432974
iter  50 value 620.620015
iter  60 value 523.073696
iter  70 value 457.120510
iter  80 value 416.115628
iter  90 value 382.935239
iter 100 value 365.154950
iter 110 value 350.990205
iter 120 value 341.496466
iter 130 value 331.674918
iter 140 value 324.010490
iter 150 value 317.267326
iter 160 value 315.222048
iter 170 value 311.943835
iter 180 value 309.435248
iter 190 value 307.711964
iter 200 value 305.825835
iter 210 value 304.383399
iter 220 value 300.961263
iter 230 value 300.745975
iter 240 value 300.703008
iter 250 value 300.674657
iter 260 value 300.623749
iter 270 value 300.575657
iter 280 value 300.559787
iter 290 value 300.551787
iter 300 value 300.547991
iter 310 value 300.544914
iter 320 value 300.537820
iter 330 value 300.533460
iter 340 value 300.530506
iter 350 value 300.522879
iter 360 value 300.503035
iter 370 value 300.

In [None]:
# Confusion matrix: rows are the true Y1 labels, columns the predicted ones.
# Both sides are factors with fixed levels 0/1 so the matrix is always 2x2.
# Without that, a model predicting a single class would give a one-column
# table and diag() would no longer select the correctly classified cells.
# NOTE: the result keeps the name `table`; later table(...) calls still reach
# base::table() because R skips non-function objects when resolving a call.
table <- table(
    factor(test$Y1, levels = 0:1),
    pred = factor(pred, levels = 0:1)
)
table
# Accuracy on the test set: correctly classified messages over all messages.
accuracy <- sum(diag(table)) / sum(table)
accuracy

   pred
      0   1
  0 663  27
  1  55 405

## Seleccionando variables

In [None]:
# Fit a softmax network with 50 hidden units on a hand-picked subset of 14
# predictors. The RNG is seeded first so the fit is reproducible.
set.seed(1234)
nn <- nnet(
    cbind(Y0, Y1) ~ hp + hpl + num650 + lab + labs +
        telnet + num857 + num415 + num85 + technology +
        direct + capitalLong + capitalAve + capitalTotal,
    data = train,
    softmax = TRUE,
    maxit = 1000,
    size = 50
)

# One row per test message, one column per response in cbind(Y0, Y1) order.
# Label a message spam (1) when its Y1 value is at least its Y0 value, else 0.
# The vectorised comparison avoids growing a vector inside a loop and copes
# with an empty prediction matrix.
pred <- predict(nn, newdata = test)
pred <- as.integer(pred[, 1] <= pred[, 2])

# weights:  852
initial  value 4220.461887 
iter  10 value 2071.159523
iter  20 value 1819.621284
iter  30 value 1478.594260
iter  40 value 1365.946785
iter  50 value 1259.499232
iter  60 value 1230.994010
iter  70 value 1221.298125
iter  80 value 1209.197045
iter  90 value 1172.575258
iter 100 value 1154.758139
iter 110 value 1137.255218
iter 120 value 1128.085311
iter 130 value 1117.070411
iter 140 value 1114.230972
iter 150 value 1112.037065
iter 160 value 1110.229777
iter 170 value 1109.433826
iter 180 value 1107.507716
iter 190 value 1107.205115
iter 200 value 1105.660404
iter 210 value 1099.204694
iter 220 value 1089.075282
iter 230 value 1088.909808
iter 240 value 1088.051291
iter 250 value 1084.630346
iter 260 value 1080.359611
iter 270 value 1072.595802
iter 280 value 1065.812217
iter 290 value 1060.528271
iter 300 value 1055.079176
iter 310 value 1054.495660
iter 320 value 1052.850607
iter 330 value 1050.333221
iter 340 value 1047.562975
iter 350 value 1045.294652
iter 360 va

In [None]:
# Confusion matrix: rows are the true Y1 labels, columns the predicted ones.
# Both sides are factors with fixed levels 0/1 so the matrix is always 2x2.
# Without that, a model predicting a single class would give a one-column
# table and diag() would no longer select the correctly classified cells.
# NOTE: the result keeps the name `table`; later table(...) calls still reach
# base::table() because R skips non-function objects when resolving a call.
table <- table(
    factor(test$Y1, levels = 0:1),
    pred = factor(pred, levels = 0:1)
)
table
# Accuracy on the test set: correctly classified messages over all messages.
accuracy <- sum(diag(table)) / sum(table)
accuracy

   pred
      0   1
  0 607  83
  1  76 384

In [None]:
# Fit a softmax network with 15 hidden units on a hand-picked subset of 14
# predictors. The RNG is seeded first so the fit is reproducible.
set.seed(1234)
nn <- nnet(
    cbind(Y0, Y1) ~ hp + hpl + num650 + lab + labs +
        telnet + num857 + num415 + num85 + technology +
        direct + capitalLong + capitalAve + capitalTotal,
    data = train,
    softmax = TRUE,
    maxit = 1000,
    size = 15
)

# One row per test message, one column per response in cbind(Y0, Y1) order.
# Label a message spam (1) when its Y1 value is at least its Y0 value, else 0.
# The vectorised comparison avoids growing a vector inside a loop and copes
# with an empty prediction matrix.
pred <- predict(nn, newdata = test)
pred <- as.integer(pred[, 1] <= pred[, 2])

# weights:  257
initial  value 2478.135962 
iter  10 value 2154.292478
iter  20 value 2018.803881
iter  30 value 1783.632248
iter  40 value 1552.297753
iter  50 value 1396.279031
iter  60 value 1359.655937
iter  70 value 1302.398496
iter  80 value 1250.981195
iter  90 value 1206.506859
iter 100 value 1172.692908
iter 110 value 1168.249911
iter 120 value 1156.571874
iter 130 value 1143.874499
iter 140 value 1139.455709
iter 150 value 1138.136261
iter 160 value 1137.742831
iter 170 value 1136.429076
iter 180 value 1132.764369
iter 190 value 1130.312357
iter 200 value 1124.783165
iter 210 value 1123.917753
iter 220 value 1122.336715
iter 230 value 1116.844685
iter 240 value 1111.440980
iter 250 value 1107.747532
iter 260 value 1106.090475
iter 270 value 1103.833419
iter 280 value 1102.659770
iter 290 value 1101.238712
iter 300 value 1099.120931
iter 310 value 1097.695273
iter 320 value 1097.272217
iter 330 value 1097.174786
iter 340 value 1096.957114
iter 350 value 1096.863155
iter 360 va

In [None]:
# Confusion matrix: rows are the true Y1 labels, columns the predicted ones.
# Both sides are factors with fixed levels 0/1 so the matrix is always 2x2.
# Without that, a model predicting a single class would give a one-column
# table and diag() would no longer select the correctly classified cells.
# NOTE: the result keeps the name `table`; later table(...) calls still reach
# base::table() because R skips non-function objects when resolving a call.
table <- table(
    factor(test$Y1, levels = 0:1),
    pred = factor(pred, levels = 0:1)
)
table
# Accuracy on the test set: correctly classified messages over all messages.
accuracy <- sum(diag(table)) / sum(table)
accuracy

   pred
      0   1
  0 603  87
  1  78 382

In [None]:
# Fit a softmax network with 10 hidden units on a second hand-picked subset of
# 11 predictors. The RNG is seeded first so the fit is reproducible.
set.seed(1234)
nn <- nnet(
    cbind(Y0, Y1) ~ charDollar + num000 + your + remove + free + business +
        our + hp + charExclamation + capitalTotal + receive,
    data = train,
    softmax = TRUE,
    maxit = 1000,
    size = 10
)

# One row per test message, one column per response in cbind(Y0, Y1) order.
# Label a message spam (1) when its Y1 value is at least its Y0 value, else 0.
# The vectorised comparison avoids growing a vector inside a loop and copes
# with an empty prediction matrix.
pred <- predict(nn, newdata = test)
pred <- as.integer(pred[, 1] <= pred[, 2])

# weights:  142
initial  value 2612.197262 
iter  10 value 2087.202127
iter  20 value 1127.403177
iter  30 value 892.132654
iter  40 value 806.947697
iter  50 value 776.346038
iter  60 value 767.025751
iter  70 value 758.214741
iter  80 value 741.390407
iter  90 value 739.277247
iter 100 value 738.953564
iter 110 value 738.917244
iter 120 value 738.228172
iter 130 value 737.137894
iter 140 value 731.333779
iter 150 value 719.290561
iter 160 value 707.624600
iter 170 value 706.444213
iter 180 value 705.365462
iter 190 value 703.490010
iter 200 value 703.334510
iter 210 value 703.327218
iter 220 value 703.299480
iter 230 value 703.260936
iter 240 value 703.243752
iter 250 value 703.238124
iter 260 value 703.233124
iter 270 value 703.228104
iter 280 value 703.225076
iter 290 value 703.222357
iter 300 value 703.217221
iter 310 value 703.202578
iter 320 value 703.187507
iter 330 value 703.174136
iter 340 value 703.160331
iter 350 value 703.134108
iter 360 value 703.099037
iter 370 value 703

In [None]:
# Confusion matrix: rows are the true Y1 labels, columns the predicted ones.
# Both sides are factors with fixed levels 0/1 so the matrix is always 2x2.
# Without that, a model predicting a single class would give a one-column
# table and diag() would no longer select the correctly classified cells.
# NOTE: the result keeps the name `table`; later table(...) calls still reach
# base::table() because R skips non-function objects when resolving a call.
table <- table(
    factor(test$Y1, levels = 0:1),
    pred = factor(pred, levels = 0:1)
)
table
# Accuracy on the test set: correctly classified messages over all messages.
accuracy <- sum(diag(table)) / sum(table)
accuracy

   pred
      0   1
  0 654  36
  1  53 407