 This project contains voting data provided by [YouGov](https://www.voterstudygroup.org/publication/2019-voter-survey-full-data-set) which contains information on peronal characteristics and answers to survey questions. Most of the surveyed people participated in ealrier surveys available on the companies hompage.<br>
This project tough only focuses on the cross-section. Extensions are possible. <br>
Since the survey contains questions and personal characteristics in dimensions greater than 1000 the aim of this project is to twofold: <br>
First to reduce the dimension in order to obtain voter categories determined by a small number of parameters. <br>
Second to check wheter membership to a cretain group can reliable predict voting behavior.

In [34]:
options(messages = -1) # does not include errors and warnings
library(tidyverse)
library(data.table)
# library(glmnet) # LASSO
library(factoextra) # PCA
library(nnet)

In [35]:
path_sven <- "~/06_data/VOTER_Survey_Jan217_Release1-csv.csv"
data <- read.csv(file = path_sven, header = TRUE, sep = ",")
head(data)
str(data)

Unnamed: 0_level_0,weight_2016,weight_2017,weight_panel_2018,weight_latino_2018,weight_18_24_2018,weight_overall_2018,weight_2019,weight1_2018,weight1_2019,weight2_2019,⋯,phone_baseline,faminc_baseline,cand_baseline,cdid_baseline,demnom2_baseline,demnom_t_baseline,izip_baseline,lookupzip_baseline,prim12_baseline,statefips_baseline
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<fct>,<int>,<fct>,<fct>,<fct>,<int>,<fct>
1,0.3582134,0.4381949,0.5025437,,,0.35951,0.361358,0.2290783,0.2100387,0.6184318,⋯,1,6,258342,3,1,,89052,89052,1,32
2,0.5628674,0.3659623,0.388691,,,0.7363869,,0.4530128,,,⋯,1,8,317635,6,3,,85298,85298,2,4
3,0.5521375,0.5498009,0.6841928,,,0.6316563,0.594643,0.5321627,0.4652224,0.6123074,⋯,1,5,102825,6,2,Hillery Clinton,54904,54904,1,55
4,0.2075908,,,,,,0.2738895,,,0.1869269,⋯,1,9,45897,1,1,,74104,74104,3,40
5,0.3337291,0.3459162,0.3224876,,,0.4880045,0.5207127,0.3515442,0.3609241,0.3694292,⋯,1,10,330742,21,3,,78148,78148,2,48
6,0.2071856,0.1476409,0.5941112,,,0.5138021,0.5191906,0.5205669,0.7431791,0.7727754,⋯,3,4,101855,12,1,,94061,94061,1,6


'data.frame':	9548 obs. of  1282 variables:
 $ weight_2016                     : num  0.358 0.563 0.552 0.208 0.334 ...
 $ weight_2017                     : num  0.438 0.366 0.55 NA 0.346 ...
 $ weight_panel_2018               : num  0.503 0.389 0.684 NA 0.322 ...
 $ weight_latino_2018              : num  NA NA NA NA NA NA NA NA NA NA ...
 $ weight_18_24_2018               : num  NA NA NA NA NA NA NA NA NA NA ...
 $ weight_overall_2018             : num  0.36 0.736 0.632 NA 0.488 ...
 $ weight_2019                     : num  0.361 NA 0.595 0.274 0.521 ...
 $ weight1_2018                    : num  0.229 0.453 0.532 NA 0.352 ...
 $ weight1_2019                    : num  0.21 NA 0.465 NA 0.361 ...
 $ weight2_2019                    : num  0.618 NA 0.612 0.187 0.369 ...
 $ weight3_2019                    : num  0.255 NA 0.598 NA 0.476 ...
 $ cassfullcd                      : int  602 405 5506 4001 4828 602 2106 5501 613 5001 ...
 $ vote2020_2019                   : int  2 NA 2 2 1 2 2 NA N

the data set contains 1282 variables where the first 11 columns are population weights and not of interest in further analysis.

# 1. Data Cleaning

Since the dataset contains quite a lot of variables in further analysis we will focus on only a subset for reasons of tractability

In [36]:
# weights not of interest
weight_idx <- grepl(".*weight.*", colnames(data)) 
data <- data[,!weight_idx]
# name of candidate not of interest
name_idx <- grepl(".*Cand.Name.*", colnames(data)) 
data <- data[,!name_idx]
# response to july 2017 survey
survey2017_idx <- grepl(".*_2017", colnames(data)) 
data <- data[,!survey2017_idx]
#response to may 2018 survey
survey2018_idx <- grepl(".*_2018", colnames(data)) 
data <- data[,!survey2018_idx]
# cassfullcd
cassfullcd_idx <- grepl("cassfullcd", colnames(data))
data <- data[,!cassfullcd_idx]
# to be continued

In [37]:
glimpse(data)

Rows: 9,548
Columns: 866
$ vote2020_2019                  [3m[38;5;246m<int>[39m[23m 2, NA, 2, 2, 1, 2, 2, NA, NA, 2, NA, 2…
$ trumpapp_2019                  [3m[38;5;246m<int>[39m[23m 4, NA, 4, 4, 2, 4, 4, NA, NA, 4, NA, 4…
$ fav_trump_2019                 [3m[38;5;246m<int>[39m[23m 4, NA, 4, 4, 2, 4, 4, NA, NA, 4, NA, 4…
$ fav_obama_2019                 [3m[38;5;246m<int>[39m[23m 1, NA, 1, 1, 2, 1, 1, NA, NA, 98, NA, …
$ fav_hrc_2019                   [3m[38;5;246m<int>[39m[23m 1, NA, 1, 4, 4, 1, 3, NA, NA, 1, NA, 1…
$ fav_sanders_2019               [3m[38;5;246m<int>[39m[23m 1, NA, 2, 4, 3, 3, 3, NA, NA, 2, NA, 2…
$ fav_putin_2019                 [3m[38;5;246m<int>[39m[23m 4, NA, 4, 4, 3, 4, 4, NA, NA, 4, NA, 4…
$ fav_schumer_2019               [3m[38;5;246m<int>[39m[23m 1, NA, 1, 3, 4, 2, 2, NA, NA, 2, NA, 2…
$ fav_pelosi_2019                [3m[38;5;246m<int>[39m[23m 1, NA, 1, 3, 4, 1, 2, NA, NA, 1, NA, 2…
$ fav_comey_2019                 [3m[

In [26]:
# response variabe of interest
# 1 -> Donald Trump
# 2 -> Dem
# 3 -> no vote
# 4 -> not sure
# 8 -> skipped
str(data$vote2020_2019)
table(data$vote2020_2019)
y_col <- grep("vote2020_2019", colnames(data)) 
y <- data[,y_col]
y <- factor(y)
levels(y) <-  c("Donald_Trump", "Democrat", "No_Vote", "Not_Sure", "Skipped")    
data <- data[,-y_col]
col <- colnames(data)
data[col] <- lapply(data[col], as.factor) # due to survey design all variables are factor
data <- data.frame(data)

 NULL


< table of extent 0 >

In [23]:
head(data)
head(y)

Unnamed: 0_level_0,trumpapp_2019,fav_trump_2019,fav_obama_2019,fav_hrc_2019,fav_sanders_2019,fav_putin_2019,fav_schumer_2019,fav_pelosi_2019,fav_comey_2019,fav_mueller_2019,⋯,phone_baseline,faminc_baseline,cand_baseline,cdid_baseline,demnom2_baseline,demnom_t_baseline,izip_baseline,lookupzip_baseline,prim12_baseline,statefips_baseline
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,4.0,4.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,⋯,1,6,258342,3,1,,89052,89052,1,32
2,,,,,,,,,,,⋯,1,8,317635,6,3,,85298,85298,2,4
3,4.0,4.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,1.0,⋯,1,5,102825,6,2,Hillery Clinton,54904,54904,1,55
4,4.0,4.0,1.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,⋯,1,9,45897,1,1,,74104,74104,3,40
5,2.0,2.0,2.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,⋯,1,10,330742,21,3,,78148,78148,2,48
6,4.0,4.0,1.0,1.0,3.0,4.0,2.0,1.0,2.0,1.0,⋯,3,4,101855,12,1,,94061,94061,1,6


In [24]:
str(data)

'data.frame':	9548 obs. of  1269 variables:
 $ trumpapp_2019                   : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 2 4 4 NA NA 4 ...
 $ fav_trump_2019                  : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 2 4 4 NA NA 4 ...
 $ fav_obama_2019                  : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 1 2 1 1 NA NA 6 ...
 $ fav_hrc_2019                    : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 4 4 1 3 NA NA 1 ...
 $ fav_sanders_2019                : Factor w/ 6 levels "1","2","3","4",..: 1 NA 2 4 3 3 3 NA NA 2 ...
 $ fav_putin_2019                  : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 3 4 4 NA NA 4 ...
 $ fav_schumer_2019                : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 3 4 2 2 NA NA 2 ...
 $ fav_pelosi_2019                 : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 3 4 1 2 NA NA 1 ...
 $ fav_comey_2019                  : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 2 4 2 2 NA NA 3 ...
 $ fav_mueller_2019          

In [25]:
model.matrix(data)

ERROR: Error in terms.default(object): no terms component nor attribute


# 2. 

In [13]:
set.seed(123)
n <- nrow(data)
p = ncol(data) - 1
ind_train <- sample(x = 1:n, size = ceiling(0.8 * n))
set_train <- data[ind_train,]
ind_test <- setdiff(x = 1:n, ind_train)
set_test <- data[ind_test,]

In [79]:
X <- data
cols <- colnames(X)
X[cols] <- lapply(X[cols], as.factor)
str(X)

'data.frame':	9548 obs. of  1270 variables:
 $ vote2020_2019                   : Factor w/ 5 levels "1","2","3","4",..: 2 NA 2 2 1 2 2 NA NA 2 ...
 $ trumpapp_2019                   : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 2 4 4 NA NA 4 ...
 $ fav_trump_2019                  : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 2 4 4 NA NA 4 ...
 $ fav_obama_2019                  : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 1 2 1 1 NA NA 6 ...
 $ fav_hrc_2019                    : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 4 4 1 3 NA NA 1 ...
 $ fav_sanders_2019                : Factor w/ 6 levels "1","2","3","4",..: 1 NA 2 4 3 3 3 NA NA 2 ...
 $ fav_putin_2019                  : Factor w/ 6 levels "1","2","3","4",..: 4 NA 4 4 3 4 4 NA NA 4 ...
 $ fav_schumer_2019                : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 3 4 2 2 NA NA 2 ...
 $ fav_pelosi_2019                 : Factor w/ 6 levels "1","2","3","4",..: 1 NA 1 3 4 1 2 NA NA 1 ...
 $ fav_comey_2019            

# 3. Neural Net

In [56]:
nnet::nnet(x = data, y = y)

ERROR: Error in nnet.default(x = data, y = y): missing values in 'x'


# 4. LASSO

In [57]:
glmnet::glmnet(x = as.matrix(data), y = y, alpha = 1) # Elastic Net with alpha = 1

“NAs introduced by coercion”


ERROR: Error in storage.mode(y) <- "double": invalid to change the storage mode of a factor


# 5. Latent Drichilet Allocation

In [5]:
options(messages = 1)