## Finding missing geocodes in a voter file using Opencage for GA voter file.

In [1]:
#Load libraries/packages
library(tidyverse)
library(readr)
library(stringr)
library(base)
install.packages("opencage")
require(opencage)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.0
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Loading required package: opencage



In [5]:
#Set path for voter file data
path <- setwd("/srv/shared/BISG_datasets")

In [6]:
#Opencage key
opencage_key <- "faed2ce713894d20a5758e6b1ecdd62a"

In [7]:
#Load the voter file
voter_file <-read_csv(paste0(path, "/", "georgia_100k_sample.csv"))

Parsed with column specification:
cols(
  .default = col_character(),
  COUNTY_CODE = [32mcol_double()[39m,
  REGISTRATION_NUMBER = [32mcol_double()[39m,
  RESIDENCE_HOUSE_NUMBER = [32mcol_double()[39m,
  RESIDENCE_ZIPCODE = [32mcol_double()[39m,
  BIRTHDATE = [32mcol_double()[39m,
  REGISTRATION_DATE = [32mcol_double()[39m,
  DATE_ADDED = [32mcol_double()[39m,
  DATE_CHANGED = [32mcol_double()[39m,
  DISTRICT_COMBO = [32mcol_double()[39m,
  LAST_CONTACT_DATE = [32mcol_double()[39m,
  MAIL_HOUSE_NBR = [32mcol_double()[39m,
  MAIL_ZIPCODE = [32mcol_double()[39m,
  MAIL_ADDRESS_3 = [33mcol_logical()[39m,
  MAIL_COUNTRY = [33mcol_logical()[39m,
  lat = [32mcol_double()[39m,
  lon = [32mcol_double()[39m
)

See spec(...) for full column specifications.

“248 parsing failures.
 row                    col               expected actual                                                file
1573 RESIDENCE_HOUSE_NUMBER no trailing characters      B '/srv/shared/BISG_

In [8]:
# Subset a dataframe for voters with no longitude or latitude.
miss_latlon<-which(is.na(voter_file$lat) & is.na(voter_file$lon))

In [9]:
miss_latlon_df <- voter_file[miss_latlon,]

In [10]:
num_miss <- length(miss_latlon)
total_voters<-nrow(voter_file)-num_miss

In [11]:
total_geocoded <- round((total_voters-num_miss)/total_voters*100, 2)
message(paste(total_geocoded, "% of your voter file is now geocoded.", sep=" "))

99.95 % of your voter file is now geocoded.



In [14]:
if(num_miss>0){       
            message(paste("There are", num_miss, "voters missing latitude and longitude values in your voter geocoded voter file.\nWe will use Opencage to attempt to find them now.", sep=" "))
    
            #Let's take out the 47 people that missing geocodes and put in a dataframe
            miss_latlon_df$state <- str_sub(miss_latlon_df$full_addy,-2,-1)

            #Make final address for Opencage to identify

            miss_latlon_df$streetnum_name<- paste(miss_latlon_df$RESIDENCE_HOUSE_NUMBER, 
                                                      miss_latlon_df$RESIDENCE_STREET_NAME, sep=" ")

            miss_latlon_df$final_address<- paste(miss_latlon_df$streetnum_name,
                                                    miss_latlon_df$RESIDENCE_CITY,
                                                    miss_latlon_df$state,
                                                    miss_latlon_df$RESIDENCE_ZIPCODE, sep = ", ")
            
                      
            #Run these addresses through Opencage to identify latitude and longitude
            #Use opencage to find lat and lon for voters without geocodes and put back into ram dataframe.
                    suppressWarnings(
                        for(m in 1:nrow(miss_latlon_df)){
                        opencage_latlon<-opencage_forward(placename = miss_latlon_df$final_address[m], country="US", key=opencage_key)
                        miss_latlon_df$lon[m] <-opencage_latlon$results$geometry.lng
                        miss_latlon_df$lat[m] <-opencage_latlon$results$geometry.lat
                      }
                        )
            #Join the new_lon_lat_df dataframe and the missing_geo_city_zip dataframe.
            voter_file$lon[match(miss_latlon_df$REGISTRATION_NUMBER, voter_file$REGISTRATION_NUMBER)] <- miss_latlon_df$lon
            voter_file$lat[match(miss_latlon_df$REGISTRATION_NUMBER, voter_file$REGISTRATION_NUMBER)] <- miss_latlon_df$lat

            #Check for missing geocodes again
            miss_latlon2<-which(is.na(voter_file$lat) & is.na(voter_file$lon==0))
            num_miss2<-length(miss_latlon2)
            total_geocoded2<- round((total_voters-num_miss2)/total_voters*100, 2)
            
            message(paste(total_geocoded2, "% of your voter file is now geocoded using the Opencage API.", sep=" "))
            new_voter_file <- voter_file
        
    return(new_voter_file)  
} else {
    message(paste("There are no missing geocodes (i.e. latitudes and longitudes) in the voter file.", sep=" "))
}
    
    
 

There are 47 voters missing latitude and longitude values in your voter geocoded voter file.
We will use Opencage to attempt to find them now.

100 % of your voter file is now geocoded using the Opencage API.



full_addy,COUNTY_CODE,REGISTRATION_NUMBER,VOTER_STATUS,LAST_NAME,FIRST_NAME,MIDDLE_MAIDEN_NAME,NAME_SUFFIX,RESIDENCE_HOUSE_NUMBER,RESIDENCE_STREET_NAME,⋯,MAIL_STREET_NAME,MAIL_APT_UNIT_NBR,MAIL_CITY,MAIL_STATE,MAIL_ZIPCODE,MAIL_ADDRESS_2,MAIL_ADDRESS_3,MAIL_COUNTRY,lat,lon
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<lgl>,<lgl>,<dbl>,<dbl>
"MISSING ADDRESS , , GA",60,10356431,A,BOONE,DONTE,,,,,⋯,,,,,,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",25,12264902,A,MYERS,JOSH,DAVID,,8112,WHITE BLUFF RD,⋯,WHITE BLUFF RD,,SAVANNAH,GA,314063406,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",8,7764707,A,MCLENDON,ANTHONY,LEON,,269,MARKET PLACE BLVD,⋯,MARKET PLACE BLVD,#186,CARTERSVILLE,GA,301212235,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",48,11584512,A,MACK,NADIE',GENENE,,,,⋯,,,,,,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",137,2648120,A,LOUIS,ROBERT,LEE,,205,JOHN HOWARD WAY,⋯,JOHN HOWARD WAY,UNIT 599,TIFTON,GA,317936032,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",25,8041842,A,MYRICK,MYRA,REBECCA ANN,,250,MARTIN LUTHER KING BLVD,⋯,MARTIN LUTHER KING BLVD,,SAVANNAH,GA,314014247,,,,32.16562,-82.90008
"MISSING ADDRESS , , GA",47,11080050,A,THOMAS,JOHN,T,,,MISSING ADDRESS,⋯,,,,,,,,,32.16562,-82.90008
"SAPELO IS 31327, SAPELO ISLAND, GA",98,2315982,A,GOEKJIAN,VIRGINIA,H,,,SAPELO IS,⋯,PO BOX 95,,SAPELO ISLAND,GA,313270095,,,,31.41519,-81.25739
"1 14TH AVE NW 39828, CAIRO, GA",65,608907,A,HAMMETT,TINA,P,,1,14TH AVE NW,⋯,,,,,,,,,30.89177,-84.20798
"1 7TH ST 30901, AUGUSTA, GA",121,1437129,A,WOODRING,BARBARA,C,,1,7TH ST,⋯,,,,,,,,,33.47687,-81.96218
