## R script for OMOP to load Toxic Release Inventory data

### Setup

In [8]:
library(DBI)

# make db connection
# for format of database file see db/env/db_conf.txt
db <- read.delim( '../../db/env/feta.txt', header=TRUE, sep=' ' )
con <- dbConnect(RPostgres::Postgres(),
                 dbname = toString(db$database[1]),
                 host = toString(db$host[1]),
                 port = 5432,
                 user = toString(db$user),
                 password = toString(db$pass))

# check the connection
dbListTables(con)

### load TRI data

In [4]:
tridata <- read.csv('../../../../tri_2018_us.csv')

In [None]:
#tridata[1:50,10:30]

In [9]:
# function to populate names, types and units in respective attribute tables
loadvocab <- function(table,words) {
    sql <- paste("select count(*) from ",table," where name like $1")
    res <- dbSendQuery(con,sql)
    dbBind(res, list(words[1]))
    found <- dbFetch(res)[1,1]
    dbClearResult(res)
    if (found == 0) {
        for (word in words) {
            sql <- paste("insert into ",table," (name) values($1) returning id")
            res <- dbSendQuery(con,sql)
            dbBind(res, list(word))
            dbClearResult(res)
        }
    }
    sql <- paste("select * from ",table)
    res <- dbSendQuery(con,sql)
    vocab <- dbFetch(res)
    dbClearResult(res)
    return (vocab)
}

In [10]:
# get all units, location types, and so on
attunits <- unique(tridata[,39])
levels(attunits) = c(levels(attunits),c('Boolean'))
attunits[3] <- "Boolean"
attunits <- loadvocab('hz_att_unit',attunits)
#attunits
loctypes <- unique(tridata[,16])
loctypes <- loadvocab('hz_type',loctypes)
#loctypes
attnames <- unique(tridata[,30])
attnames <- loadvocab('hz_att_name',attnames)
#attnames
attcats <- c('Carcinogen','Metal','Carcinogen|Metal')
attcats <- loadvocab('hz_att_category',attcats)
#attcats

"Factors converted to character"
"Factors converted to character"
"Factors converted to character"


### Put TRI data into database 

In [20]:
# get TRIFDs only for Florida TRI data
flIDs <- subset(tridata, X8..ST=="FL",select=X2..TRIFD)
locIDs <- unique(flIDs[,1])

In [None]:
# this is for first schema by UM group
for (locID in locIDs) {
    
    # get all attributes for one location
    locAttrs <- subset(tridata, X2..TRIFD==locID)
    print(locID)
    
    # get location and description from first line of TRI attributes where the TRIFD is constrained
    hzpname <- paste(locAttrs[1,]$X4..FACILITY.NAME)
    hzpdesc <- paste(locAttrs[1,]$X5..STREET.ADDRESS,locAttrs[1,]$X6..CITY,locAttrs[1,]$X8..ST,locAttrs[1,]$X9..ZIP)
    hzplat <- locAttrs[1,][1,12]
    hzplon <- locAttrs[1,][1,13]
    hzptype <- loctypes[which(loctypes$name==locAttrs[1,][16]$X16..INDUSTRY.SECTOR),]$id
    hzpsource <- 1

    # create a point from the first line of TRI attributes where the TRIFD is constrained
    sql <- 'insert into hazard_point ("name", "desc", "geom", "hz_type_id", "hz_source_id") 
            values ($1,$2,ST_MakePoint($3,$4),$5,$6)
            returning id'
    res <- dbSendQuery(con,sql)
    dbBind(res, list(hzpname,hzpdesc,hzplon,hzplat,hzptype,hzpsource))
    hazard_point_ID = dbFetch(res)[1,1]
    dbClearResult(res)
    #hazard_point_ID = 1
    #print(paste(hzpname,hzpdesc,hzplon,hzplat,hzpsource))

    # step through attributes from this TRIFD and add to attributes table
    apply(locAttrs, 1, function(vec) {
        hz_category_id <- NA
        if (vec[35] == "YES" & vec[37] == "YES") {
            hz_category_id <- 3
        } else if (vec[35] == "YES") {
            hz_category_id <- 2
        } else if (vec[37] == "YES") {
            hz_category_id <- 1
        }
        hz_unit_id = attunits[which(attunits$name==vec[39]),]$id
        hz_name_id = attnames[which(attnames$name==vec[30]),]$id
        sql <- "insert into hz_attribute (hz_point_id,hz_category_id,hz_unit_id,hz_name_id,value) 
                    values($1,$2,$3,$4,$5)"
        res <- dbSendQuery(con,sql)
        #NOTE this is a major simplication of the TRI data - only the On-Site Relese Total as the value
        dbBind(res, list(hazard_point_ID,hz_category_id,hz_unit_id,hz_name_id,vec[54]))
        dbClearResult(res)
        #print(paste(hazard_point_ID,hz_category_id,hz_unit_id,hz_name_id,vec[54]))
    })
}

In [22]:
# this is for second schema by OMOP group
geo_table <- 'geo_florida_tri_2018'
att_table <- 'attr_florida_tri_2018'
local_epsg <- 32618
for (locID in locIDs) {
    
    # get all attributes for one location
    locAttrs <- subset(tridata, X2..TRIFD==locID)
    print(locID)
    
    # get location and description from first line of TRI attributes where the TRIFD is constrained
    name <- paste(locAttrs[1,]$X4..FACILITY.NAME)
    source_id_coding <- 'EPA Address Geocode'
    source_id_value <- paste(locAttrs[1,]$X5..STREET.ADDRESS,locAttrs[1,]$X6..CITY,locAttrs[1,]$X8..ST,locAttrs[1,]$X9..ZIP)
    lat <- locAttrs[1,][1,12]
    lon <- locAttrs[1,][1,13]
    type <- loctypes[which(loctypes$name==locAttrs[1,][16]$X16..INDUSTRY.SECTOR),]$id # industry sector
    hzpsource <- 1

    # create a point from the first line of TRI attributes where the TRIFD is constrained
    sql <- paste('insert into ',geo_table,' ("name", "source_id_coding", "source_id_value", "geom_wgs84", "geom_local") 
            values ($1,$2,$3,ST_SetSRID(ST_MakePoint($4,$5),4326),ST_SetSRID(ST_MakePoint($4,$5),',local_epsg,'))
            returning geo_record_id')
    res <- dbSendQuery(con,sql)
    dbBind(res, list(name,source_id_coding,source_id_value,lon,lat))
    geo_record_ID = dbFetch(res)[1,1]
    dbClearResult(res)
    #hazard_point_ID = 1
    #print(paste(hzpname,hzpdesc,hzplon,hzplat,hzpsource))
    
    # insert location industry sector as atttribute
    sql <- paste("insert into ",att_table," (geo_record_id,attr_concept_id,value_as_concept_id) 
                    values($1,$2,$3)")
    attr_concept_id <- 507
    res <- dbSendQuery(con,sql)
    dbBind(res, list(geo_record_ID,attr_concept_id,type))
    dbClearResult(res)
    
    # set up insert sql for rest of attributes
    sql <- paste("insert into ",att_table," (geo_record_id,attr_concept_id,value_as_number,unit_concept_id,qualifier_concept_id) 
                    values($1,$2,$3,$4,$5)")

    # step through attributes from this TRIFD and add to attributes table
    apply(locAttrs, 1, function(vec) {
        att_qualifier_id <- NA #carcinogen, metal, or both
        if (vec[35] == "YES" & vec[37] == "YES") { # both
            att_qualifier_id <- 3
        } else if (vec[35] == "YES") { # metal
            att_qualifier_id <- 2
        } else if (vec[37] == "YES") { # carcinogen
            att_qualifier_id <- 1
        }
        #NOTE this is a major simplication of the TRI data - only the On-Site Relese Total as the value
        att_unit_id <- attunits[which(attunits$name==vec[39]),]$id # unit of measure mapped to id in hz_att_unit table
        attr_concept_id <- attnames[which(attnames$name==vec[30]),]$id # name of chemical mapped to id in hz_att_name table
        attr_value_as_number <- vec[54] # An estimate of the total quantity of the chemical released to on-site landfills
        res <- dbSendQuery(con,sql)
        dbBind(res, list(geo_record_ID,attr_concept_id,attr_value_as_number,att_unit_id,att_qualifier_id))
        dbClearResult(res)
        #print(paste(hazard_point_ID,hz_category_id,hz_unit_id,hz_name_id,vec[54]))
    })
}

[1] "33801SNJNF2302L"
[1] "32771CRPPP31BRW"
[1] "33566CNTRL10MIL"
[1] "32206WNSCR1035T"
[1] "32333CSTLLHIGHW"
[1] "33567SPRBR3304S"
[1] "3388WPLYGL1231A"
[1] "3383WFLRKN244CM"
[1] "32347BCKYCROUTE"
[1] "33686PRTTM6500C"
[1] "34761WHTKR280EN"
[1] "32653GNSVL10001"
[1] "32403SDDTYUSHWY"
[1] "32226STJHN11201"
[1] "3217WPRTCN11WAS"
[1] "32831STNTN5100S"
[1] "34946SYCHT3901S"
[1] "3340WGRNCH222CL"
[1] "32824MRCNC685RO"
[1] "32218NHSRB111BU"
[1] "32055PRNML1575L"
[1] "32674PNMTC4647S"
[1] "34208TRPCN10011"
[1] "33605TMPFL425S2"
[1] "32713FLRDP176WH"
[1] "33810KYMRK2540K"
[1] "32205MTLCN1100N"
[1] "32533MNSNT3000O"
[1] "33556NCCHM1725G"
[1] "33534NWNGC12949"
[1] "33605NTRNT1616P"
[1] "33860WSTVCPOBOX"
[1] "32544SRFRC16CES"
[1] "33865FLRDFHWY64"
[1] "34232PTRSN155CA"
[1] "3380WTHLNC335RE"
[1] "34950PRTCN121NJ"
[1] "33430SGRSP1281S"
[1] "33069WRGRC1200N"
[1] "3331WMTVNT15SE2"
[1] "32083PRDFRRIVER"
[1] "32034CNTNRNORTH"
[1] "32501RCHHL407SO"
[1] "32935DRSPT100NB"
[1] "33873VNDLH2394V"
[1] "32571