 This project contains voting data provided by [YouGov](https://www.voterstudygroup.org/publication/2019-voter-survey-full-data-set) which contains information on peronal characteristics and answers to survey questions. Most of the surveyed people participated in ealrier surveys available on the companies hompage.<br>
This project tough only focuses on the cross-section. Extensions are possible. <br>
Since the survey contains questions and personal characteristics in dimensions greater than 1000 the aim of this project is to twofold: <br>
First to reduce the dimension in order to obtain voter categories determined by a small number of parameters. <br>
Second to check wheter membership to a cretain group can reliable predict voting behavior.

In [1]:
library(tidyverse, quietly = TRUE)
library(data.table, quietly = TRUE)
# library(glmnet) # LASSO
library(factoextra, quietly = TRUE) # PCA
library(nnet, quietly = TRUE)
library(ade4, quietly = TRUE)
library(rpart, quietly = TRUE)
library(rpart.plot, quietly = TRUE)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.5     [32m✔[39m [34mdplyr  [39m 1.0.3
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose


Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa



In [2]:
path_sven <- "~/06_data/VOTER_Survey_Jan217_Release1-csv.csv"
data <- fread(file = path_sven, header = TRUE, sep = ",", na.strings = c("__NA__","."), stringsAsFactors = TRUE)

the data set contains 1282 variables where the first 11 columns are population weights and not of interest in further analysis.

# 1. Data Cleaning

Since the dataset contains quite a lot of variables in further analysis we will focus on only a subset for reasons of tractability

In [3]:
data <- data[vote2020_2019 == 2 | vote2020_2019 == 1] # only Trump or Dem
# weights not of interest
weight_idx <- grepl(".*weight.*", colnames(data)) 
data <- data[,!weight_idx, with = FALSE]
# name of candidate not of interest
name_idx <- grepl(".*Cand.Name.*", colnames(data)) 
data <- data[,!name_idx, with = FALSE]
# response to july 2017 survey
survey2017_idx <- grepl(".*_2017", colnames(data)) 
data <- data[,!survey2017_idx, with = FALSE]
#response to may 2018 survey
survey2018_idx <- grepl(".*_2018", colnames(data)) 
data <- data[,!survey2018_idx, with = FALSE]
# cassfullcd
cassfullcd_idx <- grepl("cassfullcd", colnames(data))
data <- data[,!cassfullcd_idx, with = FALSE]
# send candidate
send_idx <- grepl("Sen.*", colnames(data))
data <- data[, !send_idx,with = FALSE]

In [4]:
drop_names <- c("trumpapp_2019", "turnout18post_2019", "housevote_2019", "senatevote_2019", "senatevote2_2019", "governorvote_2019",
                "GovCand1Party_2019", "GovCand2Party_2019", "GovCand3Party_2019",
                "Clinton_Rubio_rnd_2016", "Clinton_Cruz_rnd_2016", "Sanders_Trump_rnd_2016" ,"post_HouseCand1Party_2012", "post_HouseCand2Party_2012",
                "post_HouseCand3Party_2012")
drop_names_idx <- colnames(data)  %in% drop_names
data <- data[, !drop_names_idx,with = FALSE]

If there are more than 8 possible answers one could specify a range and 98 indicates "skipped"
Otherwise 8 was used to refrence to "skipped"

In [5]:
idx_na <- colSums(is.na(data)) > 0
data <- data[, !idx_na, with = FALSE]
col_names <- colnames(data)
for(i in 2:ncol(data)){
  if(nlevels(data[[col_names[i]]]) > 8 ){
    for(j in 1: nrow(data)){
      if(data[[j,i]] == 8){
        data[[j,i]] <- NA
      }
    }
  }else{
    for(j in 1:nrow(data)){
      if(data[[j,i]] == 98){
        data[[j,i]] <- NA
      }
      
    }
  }
}

In [6]:
idx_na <- colSums(is.na(data)) > 0
data <- data[, !idx_na, with = FALSE]

All answers are factor variables due to survey design

In [7]:
col_names <- colnames(data)
data <- data[, (col_names):= lapply(.SD, as.factor), .SDcols = col_names]
str(data)


Classes ‘data.table’ and 'data.frame':	5896 obs. of  158 variables:
 $ vote2020_2019                 : Factor w/ 2 levels "1","2": 2 2 2 1 2 2 2 2 2 2 ...
 $ add_confirm_2019              : Factor w/ 3 levels "1","2","9": 1 1 1 1 1 1 1 3 1 2 ...
 $ izip_2019                     : Factor w/ 4520 levels "1001","1020",..: 4206 2600 3178 3441 4207 1989 197 1477 3754 3057 ...
 $ votereg_2019                  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
 $ votereg_f_2019                : Factor w/ 3 levels "1","2","9": 1 1 1 1 1 1 1 1 1 1 ...
 $ regzip_2019                   : Factor w/ 162 levels "4358","5770",..: 162 162 162 162 162 162 162 162 162 162 ...
 $ region_2019                   : Factor w/ 4 levels "1","2","3","4": 4 2 3 3 4 3 1 3 4 2 ...
 $ inst_court_2019               : Factor w/ 5 levels "1","2","3","4",..: 3 3 4 1 3 3 3 2 3 3 ...
 $ inst_media_2019               : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 4 2 4 4 1 1 1 ...
 $ inst_congress_2019            : 

In [9]:
cat("number of na's is now: ", sum(is.na(data)), "\n")
cat("dimension of new dataset is now: ", dim(data))

number of na's is now:  0 
dimension of new dataset is now:  5896 158

In [15]:
data_dummy <- acm.disjonctif(data_new)

In [None]:
dat <- data_new[, colSums()]

In [13]:
head(data_dummy)
cat("dimension of data dummy:", dim(data_dummy))

Unnamed: 0_level_0,trumpapp_2019.1,trumpapp_2019.2,trumpapp_2019.3,trumpapp_2019.4,trumpapp_2019.5,trumpapp_2019.8,fav_trump_2019.1,fav_trump_2019.2,fav_trump_2019.3,fav_trump_2019.4,⋯,statefips_baseline.47,statefips_baseline.48,statefips_baseline.49,statefips_baseline.50,statefips_baseline.51,statefips_baseline.53,statefips_baseline.54,statefips_baseline.55,statefips_baseline.56,statefips_baseline.78
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,1,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,1,0,0,⋯,0,1,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


dimension of data dummy: 5896 34298

In [61]:
data_new <- data.frame(cbind(y, data_new))

In [62]:
glimpse(data_new)

Rows: 5,896
Columns: 329
$ y                              [3m[38;5;246m<fct>[39m[23m Democrat, Democrat, Democrat, Donald_T…
$ trumpapp_2019                  [3m[38;5;246m<fct>[39m[23m 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 1,…
$ fav_trump_2019                 [3m[38;5;246m<fct>[39m[23m 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 1,…
$ fav_obama_2019                 [3m[38;5;246m<fct>[39m[23m 1, 1, 1, 2, 1, 1, 98, 1, 1, 1, 1, 1, 4…
$ fav_hrc_2019                   [3m[38;5;246m<fct>[39m[23m 1, 1, 4, 4, 1, 3, 1, 1, 1, 1, 2, 1, 4,…
$ fav_sanders_2019               [3m[38;5;246m<fct>[39m[23m 1, 2, 4, 3, 3, 3, 2, 2, 2, 2, 3, 2, 4,…
$ fav_putin_2019                 [3m[38;5;246m<fct>[39m[23m 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3,…
$ fav_schumer_2019               [3m[38;5;246m<fct>[39m[23m 1, 1, 3, 4, 2, 2, 2, 2, 1, 1, 2, 2, 8,…
$ fav_pelosi_2019                [3m[38;5;246m<fct>[39m[23m 1, 1, 3, 4, 1, 2, 1, 2, 2, 1, 1, 2, 4,…
$ fav_comey_2019                 [3m[

# 2. 

In [63]:
set.seed(123)
n <- nrow(data_new)
p = ncol(data_new) - 1
ind_train <- sample(x = 1:n, size = ceiling(0.8 * n))
set_train <- data_new[ind_train,]
ind_test <- setdiff(x = 1:n, ind_train)
set_test <- data_new[ind_test,]

In [110]:
X <- data_new[-1]