# Load Library

In [None]:
# preprocessing 
library(dplyr)
library(data.table)
library(tidyverse)
library(devtools)
if(!require('naniar'))(install.packages('naniar'))
library(naniar)
library(corrplot)
library(gridExtra)
library(mlr)

# modeling
library(lightgbm)
library(xgboost)
library(ranger)
library(caret)

# Load Dataset

In [None]:
# Read the train and test CSV files; character columns stay character.
train <- fread("../input/train.csv", stringsAsFactors = FALSE)
test <- fread("../input/test.csv", stringsAsFactors = FALSE)

# Print a compact, column-by-column overview of both tables.
glimpse(train)
glimpse(test)

In [None]:
# The test table has no target, so give it a placeholder `login` column.
test$login <- NA

# Tag every row with its origin so the stacked table can be split again.
test$Train_or_Test <- "Test"
train$Train_or_Test <- "Train"

# Stack train on top of test for joint preprocessing.
all <- rbind(train, test)

- person_id: 유저별 고유 아이디
- Sex: 성별
- past_login_total: 과거(5월 8일 이전)에 로그인한 총 횟수
- past_1_month_login: 과거 1달간 로그인한 총 횟수
- past_1_week_login: 과거 1주간 로그인한 총 횟수
- sub_size: 과거(5월 8일 이전)에 데이콘 대회에서의 총 제출 수
- email_type: 가입한 이메일 종류
- phone_rat: 폰으로 접속한 비율
- apple_rat: 애플 기기로 접속한 비율
- login: 로그인 여부 

In [None]:
# Per-column summary statistics of the combined train + test table.
summary(object = all)

모든 변수들은 음수 관측값을 가지지 않아야 한다. 확인 결과, 음수값은 존재하지 않았다.

# EDA

## Target Variable(login)

In [None]:
# Bar chart of the target classes.
# `login` is converted to a factor so that `fill` is a discrete aesthetic and
# each class gets its own bar colour; the labels are reset to the plain
# column name.
ggplot(train, aes(x = factor(login), fill = factor(login))) +
  geom_bar() +
  labs(x = "login", fill = "login")

## Correlation
- 결측값을 제외한 수치형 변수들의 상관관계를 확인해본 결과는 다음과 같다.

In [None]:
# Pairwise correlations of the numeric columns (and the target) on the
# training data, using only rows without missing values.
cc <- cor(
  train[, c(
    "past_login_total",
    "past_1_month_login",
    "past_1_week_login",
    "sub_size",
    "phone_rat",
    "apple_rat",
    "login"
  )],
  use = "complete.obs"
)

# Draw the correlation matrix with the coefficients printed in each cell.
corrplot(cc, method = "number")

- 과거 1주일간 로그인 횟수와 과거 1달간 로그인 횟수간 관계는 0.74의 높은 양의 상관관계를 보인다.
    - 즉, 1주일간 로그인 횟수가 많다면, 1달간 로그인 횟수도 많다는 의미이다.
- 과거 로그인 총 횟수와 제출 횟수간 관계는 0.55로 양의 상관관계를 보인다. 
    - 제출을 하기 위해서는 로그인이 필수이기 때문에, 이런 결과를 보이는 듯하다.
- Target variable(login)은 과거 1주일간 로그인 횟수, 과거 1달간 로그인 횟수, 과거 로그인 총 횟수, 제출 횟수들과 약한 양의 상관 관계를 보인다.

In [None]:
# Number of missing values in each column of the combined table.
all %>%
  is.na() %>%
  colSums()

In [None]:
# Visual map of where the missing values sit in the combined table.
vis_miss(x = all)

- 전체 데이터의 6.2%가 결측값으로 나타났다. 6.2%의 1/3은 Test data의 label value로 나타났다.
- 나머지 결측값은 past_login_total, past_1_month_login, past_1_week_login에서 동시에 등장했다. 

In [None]:
# Derived feature: the complement of `phone_rat`.
all$computer_rat <- 1 - all[["phone_rat"]]

In [None]:
# Replace missing login counts with 0.
# Rows are selected by a missing `past_login_total`; the three login-count
# columns are addressed by name instead of by position (3, 4, 5), and all rows
# are filled in one assignment instead of element by element.
login_count_cols <- c("past_login_total", "past_1_month_login", "past_1_week_login")
past_login_loc <- which(is.na(all$past_login_total))

if (length(past_login_loc) > 0) {
  all[past_login_loc, login_count_cols] <- 0
}

multiplot function : http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

In [None]:
# Multiple plot function
#
# Draws several plot objects on one page using a grid layout.
#
# Arguments:
# - ...:      plot objects (e.g. ggplot objects)
# - plotlist: optional list of further plot objects, appended after `...`
# - file:     unused; kept so existing calls keep working
# - cols:     number of columns in the layout
# - layout:   a matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# With no plots the function returns NULL invisibly and draws nothing.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: 1:0 would otherwise iterate over c(1, 0) and fail
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, derive it from 'cols': as many rows as are needed to
  # hold every plot
  if (is.null(layout)) {
    n_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(num_plots)) {
      # Get the row/column positions of the layout cells holding subplot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

In [None]:
# Scatter plots of each login-count feature against the target; naniar's
# geom_miss_point() also displays the rows whose value is missing.
p1 <- ggplot(train, aes(x = past_login_total, y = login)) +
  geom_miss_point()

p2 <- ggplot(train, aes(x = past_1_month_login, y = login)) +
  geom_miss_point()

p3 <- ggplot(train, aes(x = past_1_week_login, y = login)) +
  geom_miss_point()

# Stack the three panels in a single column.
multiplot(plotlist = list(p1, p2, p3), cols = 1)

Training Data를 이용하여 과거에 로그인한 총 횟수와 로그인 여부간의 관계를 확인해보았다.
- 과거에 로그인한 총 횟수가 높을수록 로그인 여부도 1인 경향을 보였다. 
- 과거 1달 동안 로그인 총 횟수가 높을수록 로그인 여부도 1인 경향을 보였다.
- 과거 1주일 동안 로그인 총 횟수가 높을수록 로그인 여부도 1인 경향을 보였다. 

In [None]:
# Encode the two categorical columns as factors.
all$email_type <- factor(all$email_type)
all$Sex <- factor(all$Sex)

# The user identifier is not a feature, so remove it.
all <- all %>%
  select(-person_id)

In [None]:
# Split the preprocessed table back into train and test using the origin tag,
# then drop the tag.
train <- all %>%
  filter(Train_or_Test == "Train") %>%
  select(-Train_or_Test)

test <- all %>%
  filter(Train_or_Test == "Test") %>%
  select(-Train_or_Test)

## Modeling

In [None]:
# Target vectors of the train and test tables.
train_label <- train[["login"]]
test_label <- test[["login"]]

In [None]:
# Build the design matrices for XGBoost.
# The target is dropped by name rather than by position (column 9), so the
# code keeps working if the column order changes. `~ . - 1` expands the
# factors into indicator columns without an intercept.
x_train <- model.matrix(~ . - 1, data = dplyr::select(train, -login)) %>%
  data.frame()
x_test <- model.matrix(~ . - 1, data = dplyr::select(test, -login)) %>%
  data.frame()

# xgboost's own matrix container; only the training matrix carries labels.
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = train_label)
dtest <- xgb.DMatrix(data = as.matrix(x_test))

In [None]:
# 5-fold cross-validation with early stopping to choose the number of
# boosting rounds. `verbose` is spelled out in full instead of relying on
# partial argument matching of `verbos`.
set.seed(9302)
xgb_cv <- xgboost::xgb.cv(
  data = dtrain,
  objective = "binary:logistic",
  verbose = 1,
  nrounds = 1000,
  subsample = 0.67,
  colsample_bytree = 0.8,
  nfold = 5,
  eta = 0.01,
  gamma = 0,
  early_stopping_rounds = 100,
  max_depth = 4,
  min_child_weight = 5,
  alpha = 0.5
)

# Round at which the cross-validated metric was best.
bst <- xgb_cv$best_iteration

In [None]:
# Parameters for the final XGBoost fit, kept in line with the values passed to
# xgb.cv: the row-subsampling parameter is spelled `subsample` (the previous
# `sub_sample` is not an xgboost parameter name) and the objective matches the
# one used during cross-validation.
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  eta = 0.01,
  gamma = 0,
  max_depth = 4,
  subsample = 0.67,
  colsample_bytree = 0.8,
  nthread = -1,
  min_child_weight = 5,
  alpha = 0.5
)

# Fit on the full training set for the number of rounds chosen by CV, then
# score the test set.
fitting_xgb <- xgb.train(data = dtrain, params = params, nrounds = bst)
prediction_xgb <- predict(fitting_xgb, dtest)

In [None]:
# Plot the ten most important features of the fitted XGBoost model.
xgb.plot.importance(
  xgb.importance(colnames(dtrain), model = fitting_xgb),
  top_n = 10
)

# LightGBM

In [None]:
# Design matrices for LightGBM. The target is dropped by name rather than by
# position (column 9); `~ . - 1` expands the factors into indicator columns
# without an intercept.
x_train <- model.matrix(~ . - 1, data = dplyr::select(train, -login))
x_test <- model.matrix(~ . - 1, data = dplyr::select(test, -login))

In [None]:
# LightGBM training container. The label is taken by name as a plain vector:
# `train[, 9]` yields a one-column table, not a vector, when `train` still
# carries the data.table class.
train.lgb <- lgb.Dataset(data = x_train, label = train$login)

In [None]:
# LightGBM parameters. `learning_rate` is spelled correctly here (it was
# `learing_rate`), so the grid itself carries the intended value.
lgb.grid <- list(
  objective = "binary",
  metric = "auc",
  learning_rate = 0.01,
  num_leaves = 15,
  feature_fraction = 0.7,
  bagging_fraction = 0.7,
  bagging_freq = 5
)

# 5-fold cross-validation with early stopping. All model parameters go through
# `params` (spelled in full) rather than through extra call arguments; the
# thread count is added only for this call.
# The result keeps the name `lgb.cv` because later cells read it under that
# name; the function is called with its namespace to avoid ambiguity.
set.seed(0514)
lgb.cv <- lightgbm::lgb.cv(
  params = c(lgb.grid, list(num_threads = 4)),
  data = train.lgb,
  nrounds = 5000,
  early_stopping_rounds = 50,
  eval_freq = 20,
  nfold = 5,
  stratified = FALSE
)



In [None]:
# Number of boosting rounds selected by cross-validation.
best_iter <- lgb.cv$best_iter

# Final LightGBM fit on the full training set. The learning rate is supplied
# inside `params` rather than as an extra call argument.
fitting_lgb <- lgb.train(
  params = modifyList(lgb.grid, list(learning_rate = 0.01)),
  data = train.lgb,
  nrounds = best_iter,
  eval_freq = 100
)

# Score the test design matrix.
prediction_lgb <- predict(fitting_lgb, x_test)

# Logistic Regression

In [None]:
# Working copy of the combined table for logistic regression, with the target
# stored as numeric.
glm_data <- all
glm_data$login <- as.numeric(all$login)

# Inspect the column types.
str(glm_data)

In [None]:
# Build indicator (dummy) columns for the Sex factor with
# createDummyFeatures().
# NOTE(review): the pipe already supplies glm_data$Sex as the first argument,
# so `tmp_sex` (not yet defined the first time this runs) is passed
# positionally as the second argument — confirm this is intended.
tmp_sex <- glm_data$Sex %>%createDummyFeatures(tmp_sex,cols='Sex')

# Same for the email_type factor.
# NOTE(review): `cols='email'` does not match the column name `email_type` —
# verify whether `cols` has any effect here.
tmp_email <- glm_data$email_type %>%createDummyFeatures(tmp_email,cols='email')

# Append the indicator columns and drop the original factor columns.
# NOTE(review): the model formulas further down refer to columns such as
# `male`, `female`, `gmail`; presumably those are the names produced here —
# TODO confirm.
glm_data <- cbind(glm_data,tmp_sex,tmp_email) %>% select(-c(Sex,email_type))

In [None]:
# Split the regression table back into train and test by the origin tag, then
# drop the tag.
train_set <- glm_data %>%
  filter(Train_or_Test == "Train") %>%
  select(-Train_or_Test)

test_set <- glm_data %>%
  filter(Train_or_Test == "Test") %>%
  select(-Train_or_Test)

In [None]:
# Inspect the structure of the regression training table.
str(object = train_set)

In [None]:
set.seed(930217)

# Full logistic model: standardised count features, the two ratio features and
# every indicator column. Used as the upper bound of the stepwise search.
fitting_logistic <- glm(
  login ~ scale(past_login_total) + scale(past_1_month_login) +
    scale(past_1_week_login) + scale(sub_size) +
    phone_rat + apple_rat + male + female +
    gmail + hanmail + nate + naver + other,
  data = train_set,
  family = binomial
)

# Intercept-only model: the starting point of the stepwise search.
nothing <- glm(login ~ 1, data = train_set, family = binomial)

In [None]:
# Forward stepwise selection from the intercept-only model towards the full
# model.
forwards <- step(
  nothing,
  scope = list(
    lower = formula(nothing),
    upper = formula(fitting_logistic)
  ),
  direction = "forward"
)

In [None]:
set.seed(930217)

# Refit the logistic regression on a reduced set of four predictors.
fitting_logistic <- glm(
  login ~ scale(past_1_month_login) + scale(past_1_week_login) +
    scale(sub_size) + male,
  data = train_set,
  family = binomial
)

In [None]:
# Predicted probabilities on the test set (response scale).
prediction_logit <- predict(fitting_logistic, test_set, type = "response")

# Coefficient table and fit statistics of the final model.
summary(fitting_logistic)

In [None]:
# Kernel density estimates of the three models' test predictions.
d1 <- density(prediction_xgb)
d2 <- density(prediction_lgb)
d3 <- density(prediction_logit)

# Overlay the three curves, one colour and line type per model.
plot(d1, col = "green", lwd = 1, lty = 1, main = "Density plot")
lines(d2, col = "blue", lwd = 1, lty = 2)
lines(d3, col = "red", lwd = 1, lty = 3)
legend(
  "topright",
  legend = c("XGB", "LGBM", "Logit"),
  col = c("green", "blue", "red"),
  lty = 1:3
)

# Stacking Model

In [None]:
# Re-read the test file only to recover the person_id column, which was
# dropped during preprocessing.
test_id <- fread("../input/test.csv", stringsAsFactors = FALSE) %>%
  select(person_id)

# One column per model, one row per test observation.
prediction <- as.data.frame(
  cbind(prediction_xgb, prediction_lgb, prediction_logit)
)

In [None]:
# Sum of the three models' predictions, and the rank of that sum.
prediction$sum_prediction <- with(
  prediction,
  prediction_xgb + prediction_lgb + prediction_logit
)
prediction$rank <- rank(prediction[["sum_prediction"]])

In [None]:
# Min-max normalization function
#
# Rescales a numeric vector linearly so that its minimum maps to 0 and its
# maximum maps to 1.
#
# Arguments:
# - x:     numeric vector
# - na.rm: ignore NA values when computing the range? Defaults to FALSE, in
#          which case any NA in `x` makes the whole result NA.
#
# Returns a numeric vector of the same length as `x`. An empty input is
# returned unchanged; a constant input maps to 0 (NA positions stay NA)
# instead of dividing by zero.
normalized <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(x)
  }

  lower <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lower

  # Constant input: the range is zero, so (x - min) / 0 would give NaN
  if (!is.na(span) && span == 0) {
    flat <- rep(0, length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }

  (x - lower) / span
}

In [None]:
# Rank of the summed prediction rescaled with normalized().
prediction$normailzed_rank <- normalized(prediction[["rank"]])

# Plain average of the three models' predictions.
prediction$mean_stacking <- with(
  prediction,
  (prediction_xgb + prediction_lgb + prediction_logit) / 3
)

# Output 

In [None]:
# Pair the test ids with each prediction variant.
df_mean_stacking <- data.frame(test_id, prediction$mean_stacking)
df_normalized_rank <- data.frame(test_id, prediction$normailzed_rank)
df_xgb_pred <- data.frame(test_id, prediction$prediction_xgb)
df_lgb_pred <- data.frame(test_id, prediction$prediction_lgb)
df_logistic_pred <- data.frame(test_id, prediction$prediction_logit)

# Write one CSV per variant, without row names.
write.csv(df_mean_stacking, "mean_stacking_submissuon.csv", row.names = FALSE)
write.csv(df_normalized_rank, "rank_submission.csv", row.names = FALSE)
write.csv(df_xgb_pred, "xgb_prediction.csv", row.names = FALSE)
write.csv(df_lgb_pred, "lgb_prediction.csv", row.names = FALSE)
write.csv(df_logistic_pred, "logistic_prediction.csv", row.names = FALSE)