In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Show what is available under the read-only input directory.
list.files("../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Smoke test that the kernel is alive.
print("lets get started")

Create splits

In [None]:
# Materialise the five train/test splits on disk.
#
# Reads the full review table and the table of held-out row indices, then for
# each split j writes three quoted, tab-separated files into ./split_j/:
#   train.tsv  - id, sentiment, review for every row NOT listed in column j
#   test.tsv   - id, review            for the rows listed in column j
#   test_y.tsv - id, sentiment, score  for the rows listed in column j
data <- read.table("../input/imdb-movies/alldata.tsv",
                   stringsAsFactors = FALSE,
                   header = TRUE)
testIDs <- read.csv("../input/imdb-movies/project3_splits.csv", header = TRUE)

# Write one data frame as a quoted, tab-separated file without row names.
write_split_tsv <- function(df, path) {
  write.table(df,
              file = path,
              quote = TRUE,
              row.names = FALSE,
              sep = "\t")
}

for (j in 1:5) {
  split_dir <- paste0("split_", j)
  # Only create the directory when it is missing, so re-running the cell does
  # not warn about an existing directory while real failures still surface.
  if (!dir.exists(split_dir)) {
    dir.create(split_dir)
  }

  # Column j of testIDs holds the row positions held out for split j.
  test_rows <- testIDs[, j]
  train <- data[-test_rows, c("id", "sentiment", "review")]
  test <- data[test_rows, c("id", "review")]
  test.y <- data[test_rows, c("id", "sentiment", "score")]

  write_split_tsv(train, file.path(split_dir, "train.tsv"))
  write_split_tsv(test, file.path(split_dir, "test.tsv"))
  write_split_tsv(test.y, file.path(split_dir, "test_y.tsv"))
}

In [None]:
# load training data, clean html tags

# j = 1
# setwd(paste("split_", j, sep=""))
# train = read.table("train.tsv",
#                    stringsAsFactors = FALSE,
#                    header = TRUE)
# train$review = gsub('<.*?>', ' ', train$review)

In [2]:
# Use all words: load the complete review table (no train/test split) so the
# vocabulary below is built from every review.
train <- read.table(
  file = "../input/imdb-movies/alldata.tsv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Replace every HTML-like tag with a single space.
train$review <- gsub(pattern = "<.*?>", replacement = " ", x = train$review)

In [4]:
library(rsparse)
library(Rcpp)
library(text2vec)
library(glmnet)
library(pROC)

In [5]:
# Words dropped before n-grams are counted.
stop_words <- c(
  "i", "me", "my", "myself",
  "we", "our", "ours", "ourselves",
  "you", "your", "yours",
  "their", "they", "his", "her",
  "she", "he", "a", "an", "and",
  "is", "was", "are", "were",
  "him", "himself", "has", "have",
  "it", "its", "the", "us"
)

# Lower-case and word-tokenize every review.
it_train <- itoken(
  train$review,
  preprocessor = tolower,
  tokenizer = word_tokenizer
)

# Candidate vocabulary: 1- to 4-grams, stop words excluded.
tmp.vocab <- create_vocabulary(
  it_train,
  stopwords = stop_words,
  ngram = c(1L, 4L)
)

# Discard terms seen fewer than 10 times overall, in more than half of the
# documents, or in fewer than 0.1% of them.
tmp.vocab <- prune_vocabulary(
  tmp.vocab,
  term_count_min = 10,
  doc_proportion_max = 0.5,
  doc_proportion_min = 0.001
)

# Document-term matrix over the pruned vocabulary.
dtm_train <- create_dtm(it_train, vocab_vectorizer(tmp.vocab))

In [6]:
set.seed(9021)
# Logistic regression with a pure lasso penalty (alpha = 1) over the whole
# lambda path; used only to screen terms.
tmpfit <- glmnet(
  x = dtm_train,
  y = train$sentiment,
  family = "binomial",
  alpha = 1
)
# Print the `df` value stored for each lambda on the path.
tmpfit$df

In [13]:
# Position(s) on the lasso path whose df equals 976.
which(tmpfit$df == 976)

In [11]:
# Keep the terms whose coefficient in column 36 of the lasso path is non-zero.
myvocab <- colnames(dtm_train)[tmpfit$beta[, 36] != 0]

In [14]:
# Number of terms selected into `myvocab`.
length(myvocab)

In [15]:
# Inspect entry 70 of the lasso path's `df` vector.
# Original note: "for split 1 words" - presumably the path position used when
# the vocabulary was built from split 1 only; TODO confirm.
tmpfit$df[70]

### For testing on first split

In [16]:
# Load the split-1 training reviews and build their document-term matrix
# restricted to the terms in `myvocab`.
#
# The split files live under /kaggle/working/split_1/ (the same base directory
# the five-split loop uses). The path is spelled out because the only setwd()
# into split_1 is commented out, so a bare "train.tsv" is not found when the
# notebook is run top to bottom.
train <- read.table(file.path("/kaggle/working", "split_1", "train.tsv"),
                    stringsAsFactors = FALSE,
                    header = TRUE)
# Replace every HTML-like tag with a single space.
train$review <- gsub('<.*?>', ' ', train$review)
it_train <- itoken(train$review,
                   preprocessor = tolower,
                   tokenizer = word_tokenizer)
# NOTE(review): `myvocab` was selected from a vocabulary built with
# ngram = c(1L, 4L), but the vectorizer here declares c(1L, 2L); any 3- or
# 4-gram in `myvocab` would presumably never be matched. Confirm this is
# intended before changing it, since it alters the fitted model.
vectorizer <- vocab_vectorizer(create_vocabulary(myvocab,
                                                 ngram = c(1L, 2L)))
dtm_train <- create_dtm(it_train, vectorizer)

In [None]:
# Ridge-penalised (alpha = 0) logistic regression on the split-1 training
# matrix, with the penalty strength chosen by cross-validation.
# cv.glmnet assigns observations to folds at random, so the seed is fixed
# (same value as used for the lasso screening fit) to make lambda.min and the
# resulting AUC reproducible across runs.
set.seed(9021)
fit1 <- cv.glmnet(x = dtm_train,
                  y = train$sentiment,
                  alpha = 0,
                  family = "binomial")

In [None]:
# Load the split-1 test reviews and build their document-term matrix over the
# same `myvocab` terms used for training.
#
# The path is spelled out (base directory as in the five-split loop) because
# nothing earlier changes the working directory into split_1, so a bare
# "test.tsv" is not found when the notebook is run top to bottom.
test <- read.table(file.path("/kaggle/working", "split_1", "test.tsv"),
                   stringsAsFactors = FALSE,
                   header = TRUE)
# Replace every HTML-like tag with a single space.
test$review <- gsub('<.*?>', ' ', test$review)
it_test <- itoken(test$review,
                  preprocessor = tolower,
                  tokenizer = word_tokenizer)
# NOTE(review): ngram = c(1L, 2L) here versus c(1L, 4L) when the vocabulary
# was built - confirm the narrower range is intended.
vectorizer <- vocab_vectorizer(create_vocabulary(myvocab,
                                                 ngram = c(1L, 2L)))
dtm_test <- create_dtm(it_test, vectorizer)

In [None]:
# Show the `lambda.min` value stored on the cross-validated fit; the
# prediction step passes this same value as `s`.
fit1$lambda.min

In [None]:
# Predicted probabilities for the test reviews at the cross-validated lambda.
predicted <- predict(
  fit1,
  newx = dtm_test,
  s = fit1$lambda.min,
  type = "response"
)

# Hard 0/1 labels obtained by thresholding the probabilities at 0.5.
pred <- factor(ifelse(test = predicted > 0.5, yes = 1, no = 0))

In [None]:
# Load the held-out labels (id, sentiment, score) for split 1.
# The path is spelled out (base directory as in the five-split loop) because
# nothing earlier changes the working directory into split_1, so a bare
# "test_y.tsv" is not found when the notebook is run top to bottom.
test_y <- read.table(file.path("/kaggle/working", "split_1", "test_y.tsv"),
                     stringsAsFactors = FALSE,
                     header = TRUE)

In [None]:
# Area under the ROC curve for the split-1 predictions.
# predict() on a glmnet fit hands back a one-column matrix, while pROC wants a
# plain numeric vector as predictor, so the matrix is flattened first.
# direction = "<" states that higher predicted probabilities belong to the
# higher sentiment level; letting pROC pick the direction from the data would
# hide a model that performs worse than chance.
auc(test_y$sentiment, as.vector(predicted), direction = "<")

### Test on first split done

In [17]:
# Evaluate the `myvocab` ridge model on each of the five splits.
#
# For every split j the loop reads train.tsv / test.tsv / test_y.tsv from
# /kaggle/working/split_j/, fits a cross-validated ridge logistic regression
# on the training document-term matrix, predicts probabilities for the test
# reviews at lambda.min, and stores the test AUC in auc_for_splits[j].
auc_for_splits <- rep(0, 5)

# Train and test matrices share the same fixed vocabulary, so the vectorizer
# is built once rather than twice per split.
# NOTE(review): `myvocab` was selected from a vocabulary built with
# ngram = c(1L, 4L), but the vectorizer declares c(1L, 2L); any 3- or 4-gram
# in `myvocab` would presumably never be matched. Confirm this is intended
# before changing it, since it alters every reported AUC.
vectorizer <- vocab_vectorizer(create_vocabulary(myvocab,
                                                 ngram = c(1L, 2L)))

# Lower-case and word-tokenize `reviews`, then count `vectorizer` terms.
reviews_to_dtm <- function(reviews, vectorizer) {
  tokens <- itoken(reviews,
                   preprocessor = tolower,
                   tokenizer = word_tokenizer)
  create_dtm(tokens, vectorizer)
}

for (j in seq_along(auc_for_splits)) {
  # Address the split files by full path instead of setwd(), so the session's
  # working directory is not left pointing at the last split.
  split_dir <- file.path("/kaggle/working", paste0("split_", j))

  train <- read.table(file.path(split_dir, "train.tsv"),
                      stringsAsFactors = FALSE,
                      header = TRUE)
  # Replace every HTML-like tag with a single space.
  train$review <- gsub('<.*?>', ' ', train$review)
  dtm_train <- reviews_to_dtm(train$review, vectorizer)

  # cv.glmnet assigns folds at random; fixing the seed makes lambda.min and
  # therefore the reported AUC reproducible.
  set.seed(9021)
  fit1 <- cv.glmnet(x = dtm_train,
                    y = train$sentiment,
                    alpha = 0,
                    family = "binomial")

  test <- read.table(file.path(split_dir, "test.tsv"),
                     stringsAsFactors = FALSE,
                     header = TRUE)
  test$review <- gsub('<.*?>', ' ', test$review)
  dtm_test <- reviews_to_dtm(test$review, vectorizer)

  predicted <- predict(fit1, dtm_test, s = fit1$lambda.min, type = "response")

  test_y <- read.table(file.path(split_dir, "test_y.tsv"),
                       stringsAsFactors = FALSE,
                       header = TRUE)

  # pROC wants a numeric vector, not the one-column matrix predict() returns.
  # direction = "<" keeps a worse-than-chance model from being silently
  # flipped to an AUC above 0.5.
  auc_for_splits[j] <- as.numeric(
    auc(test_y$sentiment, as.vector(predicted), direction = "<")
  )
}

print(auc_for_splits)