## Step 10: Gather Source Datasets

Purpose: Download and stage required datasets into the project **source** geodatabase.  

Current list of resources:

- [CRWU_CREAT_Grid_Projections](https://services.arcgis.com/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Grid_Projections/FeatureServer/0) from EPA Geoplatform
- [CRWU_CREAT_Historic_Climate_Stations](https://services.arcgis.com/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Historic_Climate_Stations/FeatureServer/0) from EPA Geoplatform
- [COOP_STATIONS_TO_USE](https://github.com/barrc/get_ncei/blob/main/src/coop_stations_to_use.csv) from barrc GitHub
- [ISD_STATIONS_TO_USE](https://github.com/barrc/get_ncei/blob/main/src/isd_stations_to_use.csv) from barrc GitHub
- [TEMPORAL_DIST_FILE](https://github.com/barrc/extreme_events/blob/main/temporal_dist_file.txt) from barrc GitHub
- [Census States](https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/State_County/MapServer/0) from US Census TIGERweb


In [1]:
print("Executing Step 10: Gather Source Datasets");

# Load common utilities
%run ./swcutil.ipynb

rez = swc_resources();


Executing Step 10: Gather Source Datasets


### 10.010: Download CRWU_CREAT_Grid_Projections from EPA Geoplatform

In [2]:
%%time

f = IntProgress(min=0,max=3);
display(f);

host = "services.arcgis.com";
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Grid_Projections/FeatureServer/0";
fc   = "CRWU_CREAT_Grid_Projections";

if arcpy.Exists(rez['source'] + os.sep + fc):
    arcpy.Delete_management(rez['source'] + os.sep + fc);
f.value +=1;

z = scrape_ags(host,path,rez['source'],fc);
f.value +=1;

z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'CREAT_ID'
    ,index_name = 'CREAT_ID_IDX'
);
f.value +=1;

z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'GRIDCODE'
    ,index_name = 'GRIDCODE_IDX'
);
f.value +=1;


IntProgress(value=0, max=3)

  pulling records where OBJECTID >= 1 AND OBJECTID <= 2000
  pulling records where OBJECTID >= 2001 AND OBJECTID <= 4000
  pulling records where OBJECTID >= 4001 AND OBJECTID <= 6000
  pulling records where OBJECTID >= 6001 AND OBJECTID <= 8000
  pulling records where OBJECTID >= 8001 AND OBJECTID <= 10000
  pulling records where OBJECTID >= 10001 AND OBJECTID <= 12000
  pulling records where OBJECTID >= 12001 AND OBJECTID <= 14000
  pulling records where OBJECTID >= 14001 AND OBJECTID <= 16000
  pulling records where OBJECTID >= 16001 AND OBJECTID <= 18000
  pulling records where OBJECTID >= 18001 AND OBJECTID <= 20000
  pulling records where OBJECTID >= 20001 AND OBJECTID <= 22000
  pulling records where OBJECTID >= 22001 AND OBJECTID <= 24000
  pulling records where OBJECTID >= 24001 AND OBJECTID <= 24743
  Scrape complete.
Wall time: 4min 53s


### 10.020: Download CRWU_CREAT_Historic_Climate_Stations from EPA Geoplatform

In [3]:
%%time

f = IntProgress(min=0,max=3);
display(f);

host = "services.arcgis.com";
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Historic_Climate_Stations/FeatureServer/0";
fc   = "CRWU_CREAT_Historic_Climate_Stations";

if arcpy.Exists(rez['source'] + os.sep + fc):
    arcpy.Delete_management(rez['source'] + os.sep + fc);
f.value +=1;

z = scrape_ags(host,path,rez['source'],fc);
f.value +=1;

z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'CLIMATE_STATION_PK_ID'
    ,index_name = 'CLIMATE_STATION_PK_ID_IDX'
);
f.value +=1;

z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'NOAA_STATION_ID'
    ,index_name = 'NOAA_STATION_ID_IDX'
);
f.value +=1;


IntProgress(value=0, max=3)

  pulling records where OBJECTID >= 1 AND OBJECTID <= 2000
  pulling records where OBJECTID >= 2001 AND OBJECTID <= 4000
  pulling records where OBJECTID >= 4001 AND OBJECTID <= 6000
  pulling records where OBJECTID >= 6001 AND OBJECTID <= 8000
  pulling records where OBJECTID >= 8001 AND OBJECTID <= 10000
  pulling records where OBJECTID >= 10001 AND OBJECTID <= 11165
  Scrape complete.
Wall time: 14.4 s


### 10.030: Download COOP_STATIONS_TO_USE dataset from barrc GitHub repository

In [4]:
%%time

f = IntProgress(min=0,max=8);
display(f);

url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/coop_stations_to_use.csv"
fc  = 'COOP_STATIONS_TO_USE';

tmptab = rez['qa'] + os.sep + 'coop_stations_to_use.csv';
z = downloadtab(url,tmptab);
f.value +=1;

fms = arcpy.FieldMappings();
fms.addFieldMap(fmtext  (tmptab,'station_id',255));
fms.addFieldMap(fmtext  (tmptab,'station_name',255));
fms.addFieldMap(fmtext  (tmptab,'state',255));
fms.addFieldMap(fmtext  (tmptab,'start_date',255));
fms.addFieldMap(fmtext  (tmptab,'end_date',255));
fms.addFieldMap(fmdouble(tmptab,'latitude'));
fms.addFieldMap(fmdouble(tmptab,'longitude'));
fms.addFieldMap(fmtext  (tmptab,'in_basins',255));
fms.addFieldMap(fmtext  (tmptab,'break_with_basins',255));
fms.addFieldMap(fmtext  (tmptab,'network',255));
fms.addFieldMap(fmtext  (tmptab,'start_date_to_use',255));
fms.addFieldMap(fmtext  (tmptab,'end_date_to_use',255));
f.value +=1;

z = tab2fc(tmptab,rez['source'],fc,'longitude','latitude',fms);
f.value +=1;

print("  add quotes to start and end fields");
cb_cleanDate = """
def cleanDate(pin):
    (mm,dd,yyyy) = pin.split('/');
    if mm in ['1','2','3','4','5','6','7','8','9']:
       mm = '0' + mm;
    if dd in ['1','2','3','4','5','6','7','8','9']:
       dd = '0' + dd;
    return "'" + yyyy + "/" + mm + "/" + dd + "'";
    
""";

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'start_date_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'start_date_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'start_date_clean'
    ,expression      = "cleanDate(!start_date!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'end_date_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'end_date_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'end_date_clean'
    ,expression      = "cleanDate(!end_date!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'start_date_to_use_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'start_date_to_use_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'start_date_to_use_clean'
    ,expression      = "cleanDate(!start_date_to_use!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'end_date_to_use_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'end_date_to_use_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'end_date_to_use_clean'
    ,expression      = "cleanDate(!end_date_to_use!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

print("  calculating year count");
cb_yearCount = """
import datetime;
def yearCount(pstart,pend):
    d1 = datetime.datetime.strptime(pstart,"%m/%d/%Y");
    d2 = datetime.datetime.strptime(pend  ,"%m/%d/%Y");
    yr = round((d2 - d1).days / 365);
    return yr + 0.0;
    
""";

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'year_count'
    ,field_type   = 'Double'
    ,field_alias  = 'year_count'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'year_count'
    ,expression      = 'yearCount(!start_date_to_use!,!end_date_to_use!)'
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_yearCount
);
f.value +=1;

print("  adding indexes");
z = arcpy.management.AddIndex(
     in_table      = rez['source']+ os.sep + fc
    ,fields        = 'station_id'
    ,index_name    = 'station_id_IDX'
);
f.value +=1;


IntProgress(value=0, max=8)

  downloading file
  loading to table
  converting to NAD83 points
  add quotes to start and end fields
  calculating year count
  adding indexes
Wall time: 12.1 s


### 10.040: Download ISD_STATIONS_TO_USE dataset from barrc GitHub repository

In [5]:
%%time

f = IntProgress(min=0,max=6);
display(f);

url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/isd_stations_to_use.csv"
fc  = 'ISD_STATIONS_TO_USE';

tmptab = rez['qa'] + os.sep + 'isd_stations_to_use.csv';
z = downloadtab(url,tmptab);
f.value +=1;

fms = arcpy.FieldMappings();
fms.addFieldMap(fmtext  (tmptab,'station_id',255));
fms.addFieldMap(fmtext  (tmptab,'station_name',255));
fms.addFieldMap(fmtext  (tmptab,'state',255));
fms.addFieldMap(fmtext  (tmptab,'start_date',255));
fms.addFieldMap(fmtext  (tmptab,'end_date',255));
fms.addFieldMap(fmdouble(tmptab,'latitude'));
fms.addFieldMap(fmdouble(tmptab,'longitude'));
fms.addFieldMap(fmtext  (tmptab,'in_basins',255));
fms.addFieldMap(fmtext  (tmptab,'break_with_basins',255));
fms.addFieldMap(fmtext  (tmptab,'network',255));
f.value +=1;

z = tab2fc(tmptab,rez['source'],fc,'longitude','latitude',fms);
f.value +=1;

print("  add quotes to start and end fields");
z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'start_date_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'start_date_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'start_date_clean'
    ,expression      = "cleanDate(!start_date!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'end_date_clean'
    ,field_type   = 'Text'
    ,field_length = 255
    ,field_alias  = 'end_date_clean'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'end_date_clean'
    ,expression      = "cleanDate(!end_date!)"
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_cleanDate
);
f.value +=1;

print("  calculating year count");
z = arcpy.management.AddField(
     in_table     = rez['source'] + os.sep + fc
    ,field_name   = 'year_count'
    ,field_type   = 'Double'
    ,field_alias  = 'year_count'
);

z = arcpy.management.CalculateField(
     in_table        = rez['source'] + os.sep + fc
    ,field           = 'year_count'
    ,expression      = 'yearCount(!start_date!,!end_date!)'
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_yearCount
);
f.value +=1;

print("  adding indexes");
z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'station_id'
    ,index_name = 'station_id_IDX'
);
f.value +=1;


IntProgress(value=0, max=6)

  downloading file
  loading to table
  converting to NAD83 points
  add quotes to start and end fields
  calculating year count
  adding indexes
Wall time: 9.08 s


### 10.050: Download TEMPORAL_DIST_FILE dataset from barrc GitHub repository

In [6]:
%%time

f = IntProgress(min=0,max=3);
display(f);

url = "https://raw.githubusercontent.com/barrc/extreme_events/main/temporal_dist_file.txt"
fc  = 'TEMPORAL_DIST_FILE';

tmptab = rez['qa'] + os.sep + 'temporal_dist_file.tab';
z = downloadtab(url,tmptab);
f.value +=1;

fms = arcpy.FieldMappings();
fms.addFieldMap(fmdouble(tmptab,'Time'));
fms.addFieldMap(fmdouble(tmptab,'CA_1'));
fms.addFieldMap(fmdouble(tmptab,'CA_2'));
fms.addFieldMap(fmdouble(tmptab,'CA_3'));
fms.addFieldMap(fmdouble(tmptab,'CA_4'));
fms.addFieldMap(fmdouble(tmptab,'CA_5'));
fms.addFieldMap(fmdouble(tmptab,'CA_6'));
fms.addFieldMap(fmdouble(tmptab,'MSE_1'));
fms.addFieldMap(fmdouble(tmptab,'MSE_2'));
fms.addFieldMap(fmdouble(tmptab,'MSE_3'));
fms.addFieldMap(fmdouble(tmptab,'MSE_4'));
fms.addFieldMap(fmdouble(tmptab,'MSE_5'));
fms.addFieldMap(fmdouble(tmptab,'MSE_6'));
fms.addFieldMap(fmdouble(tmptab,'NOAA_A'));
fms.addFieldMap(fmdouble(tmptab,'NOAA_B'));
fms.addFieldMap(fmdouble(tmptab,'NOAA_C'));
fms.addFieldMap(fmdouble(tmptab,'NOAA_D'));
fms.addFieldMap(fmdouble(tmptab,'NRCC_A'));
fms.addFieldMap(fmdouble(tmptab,'NRCC_B'));
fms.addFieldMap(fmdouble(tmptab,'NRCC_C'));
fms.addFieldMap(fmdouble(tmptab,'NRCC_D'));
fms.addFieldMap(fmdouble(tmptab,'NV_N'));
fms.addFieldMap(fmdouble(tmptab,'NV_S'));
fms.addFieldMap(fmdouble(tmptab,'NV_W'));
fms.addFieldMap(fmdouble(tmptab,'SCS_I'));
fms.addFieldMap(fmdouble(tmptab,'SCS_IA'));
fms.addFieldMap(fmdouble(tmptab,'SCS_II'));
fms.addFieldMap(fmdouble(tmptab,'SCS_III'));
f.value +=1;

z = tab2tab(tmptab,rez['source'],fc,fms);
f.value +=1;

print("  adding indexes");
z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'time'
    ,index_name = 'time_IDX'
);
f.value +=1;


IntProgress(value=0, max=3)

  downloading file
  loading to table
  adding indexes
Wall time: 2.65 s


### 10.060: Download US Census Tigerweb 2020 State Coverage

In [7]:
%%time

# Note tigerweb will timeout if all state-equivalent records are requested in one go.
# Setting the forcelimit value to 5 records at once works around the problem.

f = IntProgress(min=0,max=1);
display(f);

host = "tigerweb.geo.census.gov";
path = "/arcgis/rest/services/TIGERweb/State_County/MapServer/0";
fc   = "census_states";

if arcpy.Exists(rez['source'] + os.sep + fc):
    arcpy.Delete_management(rez['source'] + os.sep + fc);

z = scrape_ags(host,path,rez['source'],fc,5);
f.value +=1;

z = arcpy.management.AddIndex(
     in_table   = rez['source'] + os.sep + fc
    ,fields     = 'GEOID'
    ,index_name = 'GEOID_IDX'
);
f.value +=1;


IntProgress(value=0, max=1)

  pulling records where OBJECTID >= 1 AND OBJECTID <= 5
  pulling records where OBJECTID >= 6 AND OBJECTID <= 10
  pulling records where OBJECTID >= 11 AND OBJECTID <= 15
  pulling records where OBJECTID >= 16 AND OBJECTID <= 20
  pulling records where OBJECTID >= 21 AND OBJECTID <= 25
  pulling records where OBJECTID >= 26 AND OBJECTID <= 30
  pulling records where OBJECTID >= 31 AND OBJECTID <= 35
  pulling records where OBJECTID >= 36 AND OBJECTID <= 40
  pulling records where OBJECTID >= 41 AND OBJECTID <= 45
  pulling records where OBJECTID >= 46 AND OBJECTID <= 50
  pulling records where OBJECTID >= 51 AND OBJECTID <= 55
  pulling records where OBJECTID >= 56 AND OBJECTID <= 56
  Scrape complete.
Wall time: 1min 12s


### 10.080: Review results

In [8]:
%%time

grid = rez['source']  + os.sep + 'CRWU_CREAT_Grid_Projections';
grid_cnt = arcpy.GetCount_management(grid)[0];
hist = rez['source']  + os.sep + 'CRWU_CREAT_Historic_Climate_Stations';
hist_cnt = arcpy.GetCount_management(hist)[0];
coop = rez['source']  + os.sep + 'COOP_STATIONS_TO_USE';
coop_cnt = arcpy.GetCount_management(coop)[0];
isd  = rez['source']  + os.sep + 'ISD_STATIONS_TO_USE';
isd_cnt = arcpy.GetCount_management(isd)[0];
tdf  = rez['source']  + os.sep + 'TEMPORAL_DIST_FILE';
tdf_cnt = arcpy.GetCount_management(tdf)[0];
states  = rez['source'] + os.sep + 'census_states';
states_cnt = arcpy.GetCount_management(states)[0];

print("  Grid Projections : " + str(grid_cnt));
print("  Historic Stations: " + str(hist_cnt));
print("  COOP Stations    : " + str(coop_cnt));
print("  ISD Stations     : " + str(isd_cnt));
print("  Tigerweb States  : " + str(states_cnt));
print(" ");

nw = datetime.datetime.now();
with open(rez['qa'] + os.sep + 'step10qa.txt',"w") as out:
    out.write("Step 10 QA Review\n");
    out.write(datetime.datetime.now().isoformat() + "\n");
    out.write("Grid Projections Loaded: " + str(grid_cnt) + "\n");
    out.write("Historic Stations Loaded: " + str(hist_cnt) + "\n");
    out.write("COOP Stations Loaded: " + str(coop_cnt) + "\n");
    out.write("ISD Stations Loaded: " + str(isd_cnt) + "\n");
    out.write("Temp Dist File Records Loaded: " + str(tdf_cnt) + "\n");
    out.write("Tigerweb States Loaded: " + str(states_cnt) + "\n");


  Grid Projections : 24743
  Historic Stations: 11165
  COOP Stations    : 1851
  ISD Stations     : 3293
  Tigerweb States  : 56
 
Wall time: 1.91 s
