Replicate Brooks et al 2003

Gopi Goteti edited this page Jan 21, 2014 · 5 revisions

Harold Brooks and others used data from the Storm Prediction Center to estimate the probability of occurrence of a tornado day near any location in the mainland United States (Brooks, Harold E., Charles A. Doswell III, and Michael P. Kay. "Climatological estimates of local daily tornado probability for the United States." Weather and Forecasting 18.4 (2003): 626-640.).

The below code produces numbers consistent, if not identical, to those from Brooks et al. The discrepancy could be due to the differences in Tornado data used (due to regular quality control the latest data used here may not be identical to the one used by SPC).

Read the raw data and add column names similar to those used by Elsner et al 2013 and consistent with the documentation.

library(plyr)
library(ggplot2)
library(chron)
library(raster)
## Loading required package: sp
library(maptools)
## Checking rgeos availability: TRUE
library(rgdal)
## rgdal: version: 0.8-10, (SVN revision 478) Geospatial Data Abstraction
## Library extensions to R successfully loaded Loaded GDAL runtime: GDAL
## 1.9.2, released 2012/10/08 Path to GDAL shared files:
## C:/Users/ggoteti/Documents/R/win-library/3.0/rgdal/gdal GDAL does not use
## iconv for recoding strings. Loaded PROJ.4 runtime: Rel. 4.7.1, 23
## September 2009, [PJ_VERSION: 470] Path to PROJ.4 shared files:
## C:/Users/ggoteti/Documents/R/win-library/3.0/rgdal/proj
library(maps)

# Read the raw SPC tornado csv; the file has no header row, and as.is = TRUE
# keeps character columns as character rather than converting them to factors.
torn <- read.csv("data/1950-2012_torn.csv", header = FALSE, sep = ",", as.is = TRUE)

# Column names follow Elsner et al 2013 and the SPC documentation
colnames(torn) <- c("OM", "YEAR", "MONTH", "DAY", "DATE", "TIME", "TIMEZONE", 
    "STATE", "FIPS", "STATENUMBER", "FSCALE", "INJURIES", "FATALITIES", "LOSS", 
    "CROPLOSS", "SLAT", "SLON", "ELAT", "ELON", "LENGTH", "WIDTH", "NS", "SN", 
    "SG", "F1", "F2", "F3", "F4")

Tornadoes spanning multiple counties are listed separately for each county. Thus, a single tornado could appear multiple times. Identify unique tornadoes using YEAR, MONTH, OM and NS. Check for uniqueness; in particular, tornadoes spanning multiple years (i.e., those which begin on 12/31 and end on 1/1) could break this id scheme; only those with NS > 1 need to be checked.

# A multi-segment tornado beginning on Dec 31 and ending on Jan 1 would get
# two different YEAR-MONTH-OM-NS ids; abort if multi-segment (NS > 1) events
# occur on both of those days.
dec31 <- subset(torn, MONTH == 12 & DAY == 31 & NS != 1)
jan01 <- subset(torn, MONTH == 1 & DAY == 1 & NS != 1)
# scalar if-condition: use short-circuit && rather than elementwise &
if (nrow(dec31) > 0 && nrow(jan01) > 0) {
    stop("check! unique id assignment may not be accurate!", call. = FALSE)
}
# unique tornado id: YEAR-MONTH-OM-NS
torn$id <- paste(torn$YEAR, torn$MONTH, torn$OM, torn$NS, sep = "-")

Brooks et al used data from 1955-99 and excluded Alaska, Hawaii and Puerto Rico.

# Brooks et al used only the conterminous US; drop Alaska, Hawaii, Puerto Rico.
# torn <- subset(torn, YEAR %in% seq(1955, 1999))
torn <- torn[!(torn$STATE %in% c("AK", "HI", "PR")), ]

# structure of the cleaned table (with the new id column)
str(torn)
## 'data.frame':    58101 obs. of  29 variables:
##  $ OM         : int  1 1 1 2 3 4 5 6 7 8 ...
##  $ YEAR       : int  1950 1950 1950 1950 1950 1950 1950 1950 1950 1950 ...
##  $ MONTH      : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ DAY        : int  3 3 3 3 3 13 25 25 26 11 ...
##  $ DATE       : chr  "1950-01-03" "1950-01-03" "1950-01-03" "1950-01-03" ...
##  $ TIME       : chr  "11:00:00" "11:00:00" "11:10:00" "11:55:00" ...
##  $ TIMEZONE   : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ STATE      : chr  "MO" "MO" "IL" "IL" ...
##  $ FIPS       : int  29 29 17 17 39 5 29 17 48 48 ...
##  $ STATENUMBER: int  1 1 1 2 1 1 2 3 1 2 ...
##  $ FSCALE     : int  3 3 3 3 1 3 2 2 2 2 ...
##  $ INJURIES   : int  3 3 0 3 1 1 5 0 2 0 ...
##  $ FATALITIES : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ LOSS       : num  6 6 5 5 4 3 5 5 0 4 ...
##  $ CROPLOSS   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SLAT       : num  38.8 38.8 38.8 39.1 40.9 ...
##  $ SLON       : num  -90.2 -90.2 -90.1 -89.3 -84.6 ...
##  $ ELAT       : num  38.8 38.8 38.8 39.1 0 ...
##  $ ELON       : num  -90 -90.1 -90 -89.2 0 ...
##  $ LENGTH     : num  9.5 6.2 3.3 3.6 0.1 0.6 2.3 0.1 4.7 9.9 ...
##  $ WIDTH      : int  150 150 100 130 10 17 300 100 133 400 ...
##  $ NS         : int  2 2 2 1 1 1 1 1 1 1 ...
##  $ SN         : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ SG         : int  1 2 2 1 1 1 1 1 1 1 ...
##  $ F1         : int  0 189 119 135 161 113 93 91 47 39 ...
##  $ F2         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ F3         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ F4         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ id         : chr  "1950-1-1-2" "1950-1-1-2" "1950-1-1-2" "1950-1-2-1" ...
# first few records
head(torn)
##   OM YEAR MONTH DAY       DATE     TIME TIMEZONE STATE FIPS STATENUMBER
## 1  1 1950     1   3 1950-01-03 11:00:00        3    MO   29           1
## 2  1 1950     1   3 1950-01-03 11:00:00        3    MO   29           1
## 3  1 1950     1   3 1950-01-03 11:10:00        3    IL   17           1
## 4  2 1950     1   3 1950-01-03 11:55:00        3    IL   17           2
## 5  3 1950     1   3 1950-01-03 16:00:00        3    OH   39           1
## 6  4 1950     1  13 1950-01-13 05:25:00        3    AR    5           1
##   FSCALE INJURIES FATALITIES LOSS CROPLOSS  SLAT   SLON  ELAT   ELON
## 1      3        3          0    6        0 38.77 -90.22 38.83 -90.03
## 2      3        3          0    6        0 38.77 -90.22 38.82 -90.12
## 3      3        0          0    5        0 38.82 -90.12 38.83 -90.03
## 4      3        3          0    5        0 39.10 -89.30 39.12 -89.23
## 5      1        1          0    4        0 40.88 -84.58  0.00   0.00
## 6      3        1          1    3        0 34.40 -94.37  0.00   0.00
##   LENGTH WIDTH NS SN SG  F1 F2 F3 F4         id
## 1    9.5   150  2  0  1   0  0  0  0 1950-1-1-2
## 2    6.2   150  2  1  2 189  0  0  0 1950-1-1-2
## 3    3.3   100  2  1  2 119  0  0  0 1950-1-1-2
## 4    3.6   130  1  1  1 135  0  0  0 1950-1-2-1
## 5    0.1    10  1  1  1 161  0  0  0 1950-1-3-1
## 6    0.6    17  1  1  1 113  0  0  0 1950-1-4-1
# last few records
tail(torn)
##           OM YEAR MONTH DAY       DATE     TIME TIMEZONE STATE FIPS
## 58164 425349 2012    12  25 2012-12-25 20:09:00        3    AL    1
## 58165 425351 2012    12  25 2012-12-25 20:26:00        3    AL    1
## 58166 425352 2012    12  25 2012-12-25 20:32:00        3    AL    1
## 58167 425358 2012    12  25 2012-12-25 22:24:00        3    AL    1
## 58168 425359 2012    12  25 2012-12-25 22:29:00        3    AL    1
## 58169 421430 2012    12  26 2012-12-26 13:58:00        3    NC   37
##       STATENUMBER FSCALE INJURIES FATALITIES LOSS CROPLOSS  SLAT   SLON
## 58164           0      1        0          0 0.00        0 32.05 -86.85
## 58165           0      0        0          0 0.00        0 32.15 -86.68
## 58166           0      2        0          0 0.00        0 32.20 -86.60
## 58167           0      2        2          0 0.00        0 31.73 -86.15
## 58168           0      1        0          0 0.00        0 32.35 -86.05
## 58169           0      1        0          0 0.01        0 34.79 -76.67
##        ELAT   ELON LENGTH WIDTH NS SN SG  F1 F2 F3 F4               id
## 58164 32.07 -86.82   2.03   300  1  1  1  85  0  0  0 2012-12-425349-1
## 58165 32.16 -86.67   0.62    75  1  1  1  85  0  0  0 2012-12-425351-1
## 58166 32.31 -86.51   9.55   900  1  1  1  85  0  0  0 2012-12-425352-1
## 58167 31.88 -85.96  15.70   600  1  1  1 109  0  0  0 2012-12-425358-1
## 58168 32.36 -86.04   0.72   100  1  1  1 101  0  0  0 2012-12-425359-1
## 58169 34.80 -76.67   0.91   100  1  1  1  31  0  0  0 2012-12-421430-1

Function to summarize counts of unique number of tornadoes by year and month.

# Summarize tornado counts for a data frame of tornado records.
#
# Args:
#   in_df: data frame with an "id" column (unique tornado id); must also
#          have a "MONTH" column when monthly_stats is TRUE
#   monthly_stats: if TRUE, also report unique-tornado counts per calendar
#                  month (months absent from in_df come out as NA)
# Returns: a 1-row matrix with columns N_total (rows), N_unique (distinct
#          ids) and, when requested, Jan..Dec
count_unique_tornadoes <- function(in_df, monthly_stats = TRUE) {

    n_total <- nrow(in_df)
    n_unique <- length(unique(in_df$id))

    if (monthly_stats) {
        # plyr is only needed for the monthly breakdown; fail loudly if absent
        # (require() would merely return FALSE)
        if (!requireNamespace("plyr", quietly = TRUE)) {
            stop("package 'plyr' is required for monthly_stats = TRUE", 
                call. = FALSE)
        }

        # some months dont have data; merging against all 12 gives them NA
        mon_totals <- expand.grid(MONTH = seq_len(12), stringsAsFactors = FALSE)

        # number of unique tornadoes per month (.drop = FALSE keeps empty groups)
        mon_torn <- plyr::ddply(.data = in_df, .variables = plyr::.(MONTH), 
            .fun = function(x_df) length(unique(x_df$id)), .drop = FALSE)

        mon_totals <- merge(mon_totals, mon_torn, by = "MONTH", all = TRUE)

        # output matrix: totals followed by the 12 monthly counts
        out_mat <- matrix(c(n_total, n_unique, mon_totals$V1), nrow = 1)
        colnames(out_mat) <- c("N_total", "N_unique", month.abb)
    } else {
        out_mat <- matrix(c(n_total, n_unique), nrow = 1)
        colnames(out_mat) <- c("N_total", "N_unique")
    }

    return(out_mat)
}

Figure 1

Counts of tornadoes by year, similar to those in Figure 1 of Brooks et al.

# per-year totals and unique-tornado counts (data behind Figure 1)
event_stats <- ddply(.data = torn, .variables = .(YEAR), .fun = count_unique_tornadoes, 
    monthly_stats = FALSE)

# inspect and print the yearly counts
str(event_stats)
## 'data.frame':    63 obs. of  3 variables:
##  $ YEAR    : int  1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 ...
##  $ N_total : int  206 261 246 443 570 604 518 867 576 612 ...
##  $ N_unique: int  201 260 240 422 550 591 504 857 564 602 ...
event_stats
##    YEAR N_total N_unique
## 1  1950     206      201
## 2  1951     261      260
## 3  1952     246      240
## 4  1953     443      422
## 5  1954     570      550
## 6  1955     604      591
## 7  1956     518      504
## 8  1957     867      857
## 9  1958     576      564
## 10 1959     612      602
## 11 1960     619      616
## 12 1961     721      697
## 13 1962     666      656
## 14 1963     474      461
## 15 1964     719      704
## 16 1965     916      895
## 17 1966     593      585
## 18 1967     937      924
## 19 1968     668      657
## 20 1969     615      606
## 21 1970     671      653
## 22 1971     899      884
## 23 1972     746      740
## 24 1973    1114     1101
## 25 1974     972      943
## 26 1975     923      918
## 27 1976     848      834
## 28 1977     861      852
## 29 1978     801      788
## 30 1979     865      855
## 31 1980     875      865
## 32 1981     786      782
## 33 1982    1063     1044
## 34 1983     938      930
## 35 1984     925      907
## 36 1985     703      683
## 37 1986     777      763
## 38 1987     662      656
## 39 1988     702      700
## 40 1989     871      855
## 41 1990    1151     1133
## 42 1991    1144     1132
## 43 1992    1312     1297
## 44 1993    1186     1173
## 45 1994    1094     1082
## 46 1995    1252     1235
## 47 1996    1180     1172
## 48 1997    1154     1148
## 49 1998    1440     1424
## 50 1999    1361     1336
## 51 2000    1077     1073
## 52 2001    1231     1214
## 53 2002     955      931
## 54 2003    1415     1374
## 55 2004    1836     1811
## 56 2005    1263     1259
## 57 2006    1139     1100
## 58 2007    1116     1097
## 59 2008    1737     1690
## 60 2009    1180     1154
## 61 2010    1315     1280
## 62 2011    1775     1690
## 63 2012     955      938

Graphic similar to Figure 1 of Brooks et al.

# Figure 1: unique tornadoes per year, with a linear trend line.
# The ylim of 1600 clips a few high-count years (hence the warnings below).
gfx_line <- ggplot(data = event_stats, aes(x = YEAR, y = N_unique)) +
    geom_point() +
    ylim(c(0, 1600)) +
    stat_smooth(method = lm, se = FALSE) +
    xlab("Year") +
    ylab("Tornadoes")
png("gfx_brooks2003_fig1.png")
plot(gfx_line)
## Warning: Removed 3 rows containing missing values (stat_smooth). Warning:
## Removed 3 rows containing missing values (geom_point).
garbage <- dev.off()

Figure 1

Figure 2

Number of tornado days per year, similar to those in Figure 2 of Brooks et al.

# one row per distinct tornado date; split "YYYY-MM-DD" into year/month/day
day_stats <- unique(torn$DATE)
day_stats <- do.call("rbind", strsplit(day_stats, "-"))
head(day_stats)
##      [,1]   [,2] [,3]
## [1,] "1950" "01" "03"
## [2,] "1950" "01" "13"
## [3,] "1950" "01" "25"
## [4,] "1950" "01" "26"
## [5,] "1950" "02" "11"
## [6,] "1950" "02" "12"
# tornado days per year = count of distinct dates within each year
day_stats <- as.data.frame(table(day_stats[, 1]), stringsAsFactors = FALSE)
colnames(day_stats) <- c("YEAR", "Days")

head(day_stats)
##   YEAR Days
## 1 1950   91
## 2 1951  108
## 3 1952   98
## 4 1953  136
## 5 1954  161
## 6 1955  150

Graphic similar to Figure 2 of Brooks et al.

# Figure 2: tornado days per year, with a linear trend line.
# group = 1 is needed because YEAR is character here.
gfx_line <- ggplot(data = day_stats, aes(x = YEAR, y = Days, group = 1)) +
    geom_point() +
    ylim(c(0, 250)) +
    stat_smooth(method = lm, se = FALSE) +
    xlab("Year") +
    ylab("Tornado Days")
png("gfx_brooks2003_fig2.png")
plot(gfx_line)
garbage <- dev.off()

Figure 2

Figure 3, Temporal Smoothing

Estimate the probability of a tornado day for a leap year, consistent with Brooks et al. Estimate the number of times each day of the year (doy) occurs in the period 1980-1999. Use 2012 as a reference leap year for the inclusion of Feb 29 (it does not matter which leap year is chosen).

# Brooks et al analysis period; drop records whose start coords are
# missing (coded as zero in the SPC data)
beg_year <- 1980
end_year <- 1999
torn_brooks <- subset(torn, YEAR >= beg_year & YEAR <= end_year)
torn_brooks <- subset(torn_brooks, !(SLAT == 0 | SLON == 0))

# day-of-year (1-366) for each unique tornado date, computed against the
# 2012 reference leap-year calendar
doy <- unique(torn_brooks$DATE)
doy <- substr(doy, 6, 10)
doy <- paste0("2012-", doy)
ref_Dec31_2011 <- as.numeric(as.Date("2011-12-31"))
doy_freq <- as.numeric(as.Date(doy)) - ref_Dec31_2011
str(doy_freq)
##  num [1:3587] 11 14 23 28 38 46 48 50 51 54 ...

Convert "doy _ freq" to probability and check for probs outside of 0-1 interval. Also prob for Feb 29 should be based on number of leap years during the data time period and not the entire time span, consistent with Brooks et al.

# Convert day-of-year frequencies into daily probabilities.
time_span <- length(beg_year:end_year)
doy_prob <- as.data.frame(table(doy_freq), stringsAsFactors = FALSE)
colnames(doy_prob) <- c("doy", "freq")
# fix for Feb 29 (doy "60"): its denominator is the number of leap years in
# 1980-1999, not the full span. Note "doy" is character after table(), so
# compare against the string "60" explicitly.
num_leap <- sum(leap.year(beg_year:end_year))
doy_prob$span <- ifelse(doy_prob$doy != "60", time_span, num_leap)
doy_prob$prob <- doy_prob$freq/doy_prob$span

# add 0s for missing days (full 366-day calendar of the reference year 2012)
miss_days <- data.frame(doy = as.character(seq_len(366)), DATE = seq(as.Date("2012-01-01"), 
    as.Date("2012-12-31"), by = "day"), stringsAsFactors = FALSE)
doy_prob <- merge(doy_prob, miss_days, by = "doy", all = TRUE, sort = FALSE)
doy_prob$prob[is.na(doy_prob$prob)] <- 0
doy_prob$freq[is.na(doy_prob$freq)] <- 0
doy_prob$span[is.na(doy_prob$span)] <- time_span

# discard the '2012' prefix; keep "MM-DD"
doy_prob$DATE <- substr(doy_prob$DATE, 6, 10)

head(doy_prob)
##   doy freq span prob  DATE
## 1   1    3   20 0.15 01-01
## 2   2    2   20 0.10 01-02
## 3   3    4   20 0.20 01-03
## 4   4    6   20 0.30 01-04
## 5   5    2   20 0.10 01-05
## 6   6    3   20 0.15 01-06
summary(doy_prob)
##      doy                 freq           span         prob     
##  Length:366         Min.   : 0.0   Min.   : 5   Min.   :0.00  
##  Class :character   1st Qu.: 5.0   1st Qu.:20   1st Qu.:0.25  
##  Mode  :character   Median : 9.0   Median :20   Median :0.45  
##                     Mean   : 9.8   Mean   :20   Mean   :0.49  
##                     3rd Qu.:15.0   3rd Qu.:20   3rd Qu.:0.75  
##                     Max.   :20.0   Max.   :20   Max.   :1.00  
##      DATE          
##  Length:366        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Probability of a tornado day, unsmoothed, similar to Figure 3 of Brooks et al.

# unsmoothed daily probability (built but not rendered here)
gfx_line <- ggplot(data = doy_prob, aes(x = as.numeric(doy), y = prob, group = 1)) +
    geom_point() +
    xlab("Day of Year") +
    ylab("Probability")
# print(gfx_line)

Temporal smoothing, similar to Equation 1. Test temporal smoothing using sigma values of 5 and 15 days. Make the data periodic to avoid problems at the beginning and end of the record, consistent with Brooks et al. Split the 366 days into two 183-day long portions. Add the bottom 183-day portion to the top of the data and the top 183-day portion to the bottom of the data. The result is 732 days of data. Apply frequency smoothing to the middle 366 days of this 732-day record.

# make the series periodic: prepend the last 183 days and append the first
# 183 days, giving 732 rows; only the middle 366 rows get smoothed
doy_prob <- rbind(doy_prob[184:366, ], doy_prob, doy_prob[1:183, ])

sigma_t <- 5  # temporal smoother
# gaussian weights over a +/- 183-day window (equation 1 of Brooks et al)
time_wts5 <- exp(-0.5 * (c(-183:182)/sigma_t)^2)/(sqrt(2 * pi) * sigma_t)
# ensure weights sum to 1
sum(time_wts5)
## [1] 1
sigma_t <- 15  # temporal smoother
time_wts15 <- exp(-0.5 * (c(-183:182)/sigma_t)^2)/(sqrt(2 * pi) * sigma_t)
# ensure weights sum to 1
sum(time_wts15)
## [1] 1
doy_prob$freq_smooth5 <- 0
doy_prob$freq_smooth15 <- 0
# rows 184..549 are the original 366 days within the padded frame; each day
# gets the weighted sum of the surrounding 366-day window
for (eachDay in c(184:549)) {
    beg_index <- eachDay - 183
    end_index <- eachDay + 182
    doy_prob$freq_smooth5[eachDay] <- sum(doy_prob$freq[beg_index:end_index] * 
        time_wts5)
    doy_prob$freq_smooth15[eachDay] <- sum(doy_prob$freq[beg_index:end_index] * 
        time_wts15)
}

# discard half cycles at the top and bottom, added previously
doy_prob <- doy_prob[c(184:549), ]
# feb 29 span of 5 days does not apply after smoothing
doy_prob$prob_smooth5 <- doy_prob$freq_smooth5/time_span
doy_prob$prob_smooth15 <- doy_prob$freq_smooth15/time_span

# summary of smoothed and unsmoothed daily probabilities
summary(doy_prob)
##      doy                 freq           span         prob     
##  Length:366         Min.   : 0.0   Min.   : 5   Min.   :0.00  
##  Class :character   1st Qu.: 5.0   1st Qu.:20   1st Qu.:0.25  
##  Mode  :character   Median : 9.0   Median :20   Median :0.45  
##                     Mean   : 9.8   Mean   :20   Mean   :0.49  
##                     3rd Qu.:15.0   3rd Qu.:20   3rd Qu.:0.75  
##                     Max.   :20.0   Max.   :20   Max.   :1.00  
##      DATE            freq_smooth5   freq_smooth15    prob_smooth5  
##  Length:366         Min.   : 2.66   Min.   : 3.18   Min.   :0.133  
##  Class :character   1st Qu.: 4.83   1st Qu.: 4.82   1st Qu.:0.241  
##  Mode  :character   Median : 8.94   Median : 8.81   Median :0.447  
##                     Mean   : 9.80   Mean   : 9.80   Mean   :0.490  
##                     3rd Qu.:14.88   3rd Qu.:14.99   3rd Qu.:0.744  
##                     Max.   :18.54   Max.   :18.11   Max.   :0.927  
##  prob_smooth15  
##  Min.   :0.159  
##  1st Qu.:0.241  
##  Median :0.440  
##  Mean   :0.490  
##  3rd Qu.:0.750  
##  Max.   :0.905

Probability of a tornado day, similar to Figure 3 of Brooks et al.

# Figure 3: unsmoothed daily probabilities (points) with the 5-day and
# 15-day smoothed curves overlaid.
gfx_line <- ggplot(data = doy_prob, aes(x = as.numeric(doy), y = prob, group = 1)) +
    geom_point() +
    geom_line(aes(y = prob_smooth5, colour = "red")) +
    geom_line(aes(y = prob_smooth15, colour = "blue")) +
    theme(legend.position = "none") +
    xlab("Day of Year") +
    ylab("Probability")
png("gfx_brooks2003_fig3.png")
plot(gfx_line)
garbage <- dev.off()

Figure 3

Figure 4, Spatial Smoothing

Select relevant data for spatial smoothing and subsequent analysis. Add temporal smoothed info to the torn 1980-99 dataset.

# keep only "MM-DD" so records merge onto the day-of-year table by DATE
torn_brooks$DATE <- substr(torn_brooks$DATE, 6, 10)  # discard year
doy_prob <- doy_prob[, c("doy", "freq", "DATE", "freq_smooth15")]
torn_brooks <- merge(torn_brooks, doy_prob, by = "DATE", all = TRUE, sort = FALSE)
# keep only the columns needed for the spatial analysis
torn_brooks <- torn_brooks[, c("DATE", "SLAT", "SLON", "ELAT", "ELON", "LENGTH", 
    "id", "doy", "freq", "freq_smooth15")]
# convert doy from char to int
torn_brooks$doy <- as.numeric(torn_brooks$doy)

str(torn_brooks)
## 'data.frame':    20562 obs. of  10 variables:
##  $ DATE         : chr  "01-11" "01-11" "01-11" "01-11" ...
##  $ SLAT         : num  41.6 31.3 30.1 33 33.1 ...
##  $ SLON         : num  -83.6 -86.1 -96.9 -94.2 -95 ...
##  $ ELAT         : num  0 31.3 30.2 33 33.1 ...
##  $ ELON         : num  0 -86.1 -96.9 -94 -94.9 ...
##  $ LENGTH       : num  0.3 1 2.5 7 7.1 3.9 2.7 5.8 1 6.6 ...
##  $ id           : chr  "1980-1-1-1" "1996-1-15-1" "1998-1-697-1" "1998-1-6-1" ...
##  $ doy          : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ freq         : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ freq_smooth15: num  3.48 3.48 3.48 3.48 3.48 ...

Create a uniform grid of 80-km spanning the mainland US, consistent with Brooks et al. This 80-km grid will be in azimuthal equidistant projection. Assume 35 lat and -95 lon to be the reference for this new projection (matching the +lat_0/+lon_0 used in the code below).

# lat-lon bounds of the lower 48 states
lat_seq <- c(20, 50)
lon_seq <- c(-125, -65)
# the four corner points of the bounding box (2 lats x 2 lons)
ll_df <- expand.grid(lat = lat_seq, lon = lon_seq, KEEP.OUT.ATTRS = TRUE)

# lat-lon and azimuthal equidistant projection info
ll_proj <- "+proj=longlat +datum=WGS84"
# aeqd centered at 35N, 95W; coordinates in meters
ae_proj <- "+proj=aeqd +lat_0=35 +lon_0=-95 +units=m"

Function to project from geographic to aeqd. Input is a data frame and the name of the columns associated with lon and lat, and the input and output projection info for CRS.

# Project point coordinates from one CRS to another.
#
# Args:
#   in_df: data frame holding the coordinates plus any attribute columns
#   lon_col, lat_col: names of the x (lon) and y (lat) columns in in_df
#   in_proj, out_proj: PROJ.4 strings for the input and output CRS
# Returns: a SpatialPointsDataFrame in the output CRS
Fn_Get_Projected_Locs <- function(in_df, lon_col, lat_col, in_proj, out_proj) {
    # create spatial data frame using sp library; spell out proj4string
    # (the original relied on partial matching of the abbreviation "proj")
    out_locs <- SpatialPointsDataFrame(coords = in_df[, c(lon_col, lat_col)], 
        data = in_df, proj4string = CRS(in_proj))

    # project lat-lons to the target CRS, using rgdal's spTransform
    out_locs <- spTransform(out_locs, CRS(out_proj))

    return(out_locs)
}

Use above to identify the bounds of the 80-km grid and the coordinates.

# project the bounding-box corner points into aeqd
ae_locs <- Fn_Get_Projected_Locs(ll_df, "lon", "lat", ll_proj, ae_proj)

# set the 80-km grid resolution and dimensions in aeqd
aegrid_res <- 80000  # raster resolution in meters 

# x/y extent (min and max per column) of the projected corners
aegrid_bounds <- apply(ae_locs@coords, 2, range)
aegrid_bounds
##           lon      lat
## [1,] -3142936 -1239770
## [2,]  3142936  2034440
# grid edge coordinates at 80-km spacing
aegrid_xcoords <- seq(aegrid_bounds[1, "lon"], aegrid_bounds[2, "lon"], aegrid_res)
aegrid_ycoords <- seq(aegrid_bounds[1, "lat"], aegrid_bounds[2, "lat"], aegrid_res)

aegrid_xcoords
##  [1] -3142936 -3062936 -2982936 -2902936 -2822936 -2742936 -2662936
##  [8] -2582936 -2502936 -2422936 -2342936 -2262936 -2182936 -2102936
## [15] -2022936 -1942936 -1862936 -1782936 -1702936 -1622936 -1542936
## [22] -1462936 -1382936 -1302936 -1222936 -1142936 -1062936  -982936
## [29]  -902936  -822936  -742936  -662936  -582936  -502936  -422936
## [36]  -342936  -262936  -182936  -102936   -22936    57064   137064
## [43]   217064   297064   377064   457064   537064   617064   697064
## [50]   777064   857064   937064  1017064  1097064  1177064  1257064
## [57]  1337064  1417064  1497064  1577064  1657064  1737064  1817064
## [64]  1897064  1977064  2057064  2137064  2217064  2297064  2377064
## [71]  2457064  2537064  2617064  2697064  2777064  2857064  2937064
## [78]  3017064  3097064
# y (northing) grid edges
aegrid_ycoords
##  [1] -1239770 -1159770 -1079770  -999770  -919770  -839770  -759770
##  [8]  -679770  -599770  -519770  -439770  -359770  -279770  -199770
## [15]  -119770   -39770    40230   120230   200230   280230   360230
## [22]   440230   520230   600230   680230   760230   840230   920230
## [29]  1000230  1080230  1160230  1240230  1320230  1400230  1480230
## [36]  1560230  1640230  1720230  1800230  1880230  1960230
# grid dimensions: number of x and y edge coordinates
aeX <- length(aegrid_xcoords)
aeY <- length(aegrid_ycoords)

For each day identify the grids in the 80-km raster which experienced tornadoes. First, convert beginning and ending lat-lon values for each tornado segment to the equidistant projection. For some tornadoes both the beginning and ending lat-lon values are available. Interpolate between the beginning and ending coordinates and identify all the grids corresponding to these interpolated points.

Function below takes coordinates of two points (a "row" of the tornado data) and returns the 80-km grids associated with these two points and also those points lying on a straight line between them. The function returns a vector of strings, of the form "aa _ bb", where "aa" is the index in "x breaks" and "bb" is the index in "y breaks".

# Map one tornado track (a named numeric row of the tornado matrix) onto
# the 80-km grid.
#
# Args:
#   torn_data: named vector with at least SLAT, SLON, ELAT, ELON
# Returns: character vector of "yy_xx" grid indices covering the start
#   point, the end point (when reported non-zero), and points interpolated
#   at 5-km steps between them; NA when the start location is missing/zero.
# Uses globals: ll_proj, ae_proj, aegrid_xcoords, aegrid_ycoords.
Fn_Identify_Grids <- function(torn_data) {
    begLat <- torn_data["SLAT"]
    begLon <- torn_data["SLON"]
    endLat <- torn_data["ELAT"]
    endLon <- torn_data["ELON"]

    torn_data <- data.frame(SLAT = begLat, SLON = begLon, ELAT = endLat, ELON = endLon)

    # a start lat/lon of 0 means the location was not recorded; check anyNA
    # first so NA coordinates short-circuit into the NA branch
    if (!anyNA(torn_data) && begLat != 0 && begLon != 0) {

        out_list <- list()

        # grid index of the beginning lat-lon ('SLON' & 'SLAT')
        sloc <- Fn_Get_Projected_Locs(torn_data, "SLON", "SLAT", ll_proj, ae_proj)
        sloc <- sloc@coords

        out_list <- c(out_list, paste0(findInterval(sloc[2], aegrid_ycoords), 
            "_", findInterval(sloc[1], aegrid_xcoords)))

        # end point is only meaningful when reported (non-zero) and distinct
        if (endLat != 0 && endLon != 0) {
            if (abs(begLat - endLat) > 0 || abs(begLon - endLon) > 0) {
                # grid index of the ending lat-lon ('ELON' & 'ELAT')
                eloc <- Fn_Get_Projected_Locs(torn_data, "ELON", "ELAT", ll_proj, 
                  ae_proj)
                eloc <- eloc@coords

                out_list <- c(out_list, paste0(findInterval(eloc[2], aegrid_ycoords), 
                  "_", findInterval(eloc[1], aegrid_xcoords)))

                # indices of in-between points, interpolated at 5-km intervals
                if (eloc[2] > sloc[2]) {
                  bet_y <- seq(sloc[2], eloc[2], 5000)
                } else {
                  bet_y <- seq(sloc[2], eloc[2], -5000)
                }

                if (eloc[1] > sloc[1]) {
                  bet_x <- seq(sloc[1], eloc[1], 5000)
                } else {
                  bet_x <- seq(sloc[1], eloc[1], -5000)
                }
                bet_y <- findInterval(bet_y, aegrid_ycoords)
                bet_x <- findInterval(bet_x, aegrid_xcoords)
                # NOTE(review): bet_y and bet_x can have different lengths, in
                # which case paste0 recycles the shorter one -- the resulting
                # pairs do not lie exactly on the track; confirm this is intended
                bet_indices <- paste0(bet_y, "_", bet_x)
                out_list <- c(out_list, bet_indices)
            }
        }

        # drop duplicate grid ids
        out_list <- unlist(out_list, use.names = FALSE)
        out_list <- unique(out_list)

        return(out_list)
    } else {
        return(NA)
    }
}

Use the above function to identify the grids with tornado occurrences for each day of the year. Assign the temporally smoothed frequency, for that day, to each grid.

# 3-D array [y-grid, x-grid, day-of-year] of daily spatial frequencies
space_raw <- array(data = 0, dim = c(aeY, aeX, 366))
for (eachDoy in c(1:366)) {
    # data for each doy
    torn_doy <- subset(torn_brooks, doy == eachDoy)
    torn_doy <- as.matrix(torn_doy[, c("SLAT", "SLON", "ELAT", "ELON", "freq_smooth15")])

    # grids corresp to all lat-lon pairs for the doy
    doy_grids <- apply(torn_doy, 1, FUN = Fn_Identify_Grids)
    if (sum(any(is.na(doy_grids))) == 0) {
        doy_grids <- unique(unlist(doy_grids, use.names = FALSE))

        # convert these indices to locations in the lat-lon matrix
        ll_index <- do.call("rbind", strsplit(doy_grids, "_"))
        ll_index <- as.matrix(ll_index)
        ll_index <- apply(ll_index, c(1, 2), FUN = as.numeric)

        # sometimes, the y-index ('lat') is 0 - because aeqd system is either quirky
        # or the transformation is not properly done or i am doing something wrong
        # for now, assign 1 to these 0 values
        ll_index[, 1] <- ifelse(ll_index[, 1] < 1, 1, ll_index[, 1])

        # populate the parent array with that day's smoothed frequency
        # NOTE(review): freq_smooth15 appears constant within a doy after the
        # DATE merge, so unique() should be length 1 here -- worth confirming
        for (eachRow in c(1:nrow(ll_index))) {
            space_raw[ll_index[eachRow, 1], ll_index[eachRow, 2], eachDoy] <- unique(torn_doy[, 
                "freq_smooth15"])
        }
    }
}

Calculate spatial weights, using Equation 2. This equation in the paper is incomplete! - the gaussian 2-D KDE integral was represented as a double integral, but the delta x and delta y terms for the I and J directions have been left out. Also in equation 2 of the paper, the left hand side should be f subscript x,y,n and not probability p.

Check the KDE weights for select grids in the middle of the grid and near or along the edges and ensure they add up to 1.

# Euclidean distance between points (x1, y1) and (x2, y2); fully vectorized,
# so matrix inputs yield a matrix of distances.
Fn_Compute_Distance <- function(y1, x1, y2, x2) {
    dy <- y1 - y2
    dx <- x1 - x2
    sqrt(dy * dy + dx * dx)
}

# matrices used in distance calcs: column (x) and row (y) index of each cell
xindx_mat <- matrix(rep(c(1:aeX), aeY), nrow = aeY, byrow = TRUE)
yindx_mat <- matrix(rep(c(1:aeY), aeX), nrow = aeY, byrow = FALSE)

aegrid_res_km <- aegrid_res/1000  # grid resolution in km
sigma_x <- 120  # spatial smoother in km

# spot-check that the gaussian KDE weights sum to ~1 for interior grids;
# rows/cols 1, 2 and the far edges lie near the domain boundary
for (eachRow in c(1, 2, 5, 15, aeY)) {
    for (eachCol in c(1, 2, 5, 38, aeX)) {
        # calculate distance matrix
        dist_mat <- aegrid_res_km * Fn_Compute_Distance(yindx_mat, xindx_mat, 
            eachRow, eachCol)
        # calculate sum of weights
        space_wts <- exp(-0.5 * (dist_mat/sigma_x)^2) * (aegrid_res_km^2)/(2 * 
            pi * (sigma_x^2))

        cat("Y = ", eachRow, ", X = ", eachCol, ", sum = ", round(sum(space_wts), 
            2), "\n")
    }
}
## Y =  1 , X =  1 , sum =  0.4 
## Y =  1 , X =  2 , sum =  0.54 
## Y =  1 , X =  5 , sum =  0.63 
## Y =  1 , X =  38 , sum =  0.63 
## Y =  1 , X =  79 , sum =  0.4 
## Y =  2 , X =  1 , sum =  0.54 
## Y =  2 , X =  2 , sum =  0.72 
## Y =  2 , X =  5 , sum =  0.84 
## Y =  2 , X =  38 , sum =  0.85 
## Y =  2 , X =  79 , sum =  0.54 
## Y =  5 , X =  1 , sum =  0.63 
## Y =  5 , X =  2 , sum =  0.84 
## Y =  5 , X =  5 , sum =  1 
## Y =  5 , X =  38 , sum =  1 
## Y =  5 , X =  79 , sum =  0.63 
## Y =  15 , X =  1 , sum =  0.63 
## Y =  15 , X =  2 , sum =  0.85 
## Y =  15 , X =  5 , sum =  1 
## Y =  15 , X =  38 , sum =  1 
## Y =  15 , X =  79 , sum =  0.63 
## Y =  41 , X =  1 , sum =  0.4 
## Y =  41 , X =  2 , sum =  0.54 
## Y =  41 , X =  5 , sum =  0.63 
## Y =  41 , X =  38 , sum =  0.63 
## Y =  41 , X =  79 , sum =  0.4

The weights add up to 1 for most of the grids, except the ones on or very close to the edge of the grids. For these grids, scale the weights such that their sum is 1.

# 3-D array [y-grid, x-grid, day-of-year] of spatially smoothed frequencies
space_smooth <- array(data = 0, dim = c(aeY, aeX, 366))

for (eachRow in 1:aeY) {
    for (eachCol in 1:aeX) {
        # distance (km) from grid (eachRow, eachCol) to every grid cell
        dist_mat <- aegrid_res_km * Fn_Compute_Distance(yindx_mat, xindx_mat, 
            eachRow, eachCol)
        # gaussian weights (equation 2); these depend only on the grid cell,
        # not the day, so compute them once instead of 366 times per cell
        space_wts <- exp(-0.5 * (dist_mat/sigma_x)^2) * (aegrid_res_km^2)/(2 * 
            pi * (sigma_x^2))
        # rescale so weights sum to 1 even for edge grids
        space_wts <- space_wts/sum(space_wts)

        # spatial smoothing of each day's frequencies, equation 2
        for (eachDoy in c(1:366)) {
            space_smooth[eachRow, eachCol, eachDoy] <- sum(space_raw[, , eachDoy] * 
                space_wts)
        }
    }
}

Some diagnostic plots on smoothed spatial frequency.

# map of the lower 48 in aeqd: build state outlines, tag them with the
# lat-lon CRS, then project into aeqd
usa_map <- map("state", xlim = range(lon_seq), ylim = range(lat_seq), plot = FALSE)
usa_map <- map2SpatialLines(usa_map)
proj4string(usa_map) <- CRS(ll_proj)
usa_map <- spTransform(usa_map, CRS(ae_proj))

# Render a map of spatial tornado frequency to <gfx_name>.png.
#
# Args:
#   doy: day of year (1-366); with cumulative = TRUE, days 1..doy are summed
#   gfx_name: output png file name, without the .png extension
#   smoothed: draw from space_smooth (TRUE) or space_raw (FALSE)
#   cumulative: sum days 1..doy and convert to tornado days per year
# Side effects: writes a png file. Uses globals space_smooth, space_raw,
#   aegrid_xcoords, aegrid_ycoords, ae_proj, usa_map, beg_year, end_year.
Fn_Draw_Spatial_Freq <- function(doy, gfx_name, smoothed = TRUE, cumulative = FALSE) {
    if (cumulative) {
        doy_rast <- apply(space_smooth[, , c(1:doy)], c(1, 2), FUN = sum)
        if (!smoothed) {
            doy_rast <- apply(space_raw[, , c(1:doy)], c(1, 2), FUN = sum)
        }
    } else {
        doy_rast <- space_smooth[, , doy]
        if (!smoothed) {
            doy_rast <- space_raw[, , doy]
        }
    }

    # flip the matrix from S-N to N-S to counteract 'raster' package behavior
    doy_rast <- doy_rast[c(nrow(doy_rast):1), ]
    # plot tornado days per year for cumulative
    if (cumulative) {
        doy_rast <- doy_rast/366
        # if tornado days are less than 0.25, set to NA
        doy_rast[doy_rast < 0.25] <- NA
    }

    # wrap the matrix in a raster with the aeqd extent and CRS
    doy_rast <- raster(doy_rast, xmn = min(aegrid_xcoords), xmx = max(aegrid_xcoords), 
        ymn = min(aegrid_ycoords), ymx = max(aegrid_ycoords), crs = ae_proj)

    # 10 png pixels per grid cell
    png(paste0(gfx_name, ".png"), width = ncol(doy_rast) * 10, height = nrow(doy_rast) * 
        10)
    if (cumulative) {
        leg_breaks <- c(0.25, seq(0.5, 3, 0.5))
        plot(doy_rast, breaks = leg_breaks, col = rev(rainbow(12)), lab.breaks = leg_breaks, 
            zlim = range(leg_breaks), axes = FALSE, main = paste0("Number of Tornadoes Per Year, ", 
                beg_year, "-", end_year))
    } else {
        plot(doy_rast, axes = FALSE)
    }
    # overlay state boundaries
    plot(usa_map, add = TRUE)
    if (smoothed) {
        if (cumulative) {
            contour(doy_rast, levels = leg_breaks, add = TRUE, axes = FALSE)
        } else {
            contour(doy_rast, add = TRUE, axes = FALSE)
        }
    }
    garbage <- dev.off()
}

# tornado days per year over the full calendar (Figure 4)
Fn_Draw_Spatial_Freq(366, "gfx_brooks2003_fig4", smoothed = TRUE, cumulative = TRUE)

# # jun 1 Fn_Draw_Spatial_Freq(153, 'doy_indiv_smooth', smoothed = TRUE,
# cumulative = FALSE) Fn_Draw_Spatial_Freq(153, 'doy_indiv_raw', smoothed =
# FALSE, cumulative = FALSE)

Figure 4